From a83586a7ddba25065ec37323c05deb9019ce4fa9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 23 Feb 2021 21:14:57 +0800 Subject: bpf: Remove blank line in bpf helper description comment Commit 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") added an extra blank line in the bpf helper description. This will make bpf_helpers_doc.py stop building bpf_helper_defs.h immediately after bpf_check_mtu(), which will affect functions added in the future. Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20210223131457.1378978-1-liuhangbin@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. -- cgit v1.2.3 From 69c087ba6225b574afb6e505b72cb75242a3d844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:25 -0800 Subject: bpf: Add bpf_for_each_map_elem() helper The bpf_for_each_map_elem() helper is introduced; it iterates over all map elements with a callback function. The helper signature looks like long bpf_for_each_map_elem(map, callback_fn, callback_ctx, flags) and for each map element, the callback_fn will be called. For example, for a hashmap, the callback signature may look like long callback_fn(map, key, val, callback_ctx) There are two known use cases for this. One is from upstream ([1]) where a for_each_map_elem helper may help implement a timeout mechanism in a more generic way. Another is from our internal discussion for a firewall use case where a map contains all the rules. The packet data can be compared to all these rules to decide whether to allow or deny the packet. For array maps, users can already use a bounded loop to traverse elements. Using this helper avoids the need for a bounded loop. For other types of maps (e.g., hash maps) where a bounded loop is hard or impossible to use, this helper provides a convenient way to operate on all elements. For callback_fn, besides the map and the map element, a callback_ctx, allocated on the caller's stack, is also passed to the callback function. This callback_ctx argument can provide additional input and allows writing to the caller's stack for output. If the callback_fn returns 0, the helper will iterate to the next element if available. If the callback_fn returns 1, the helper will stop iterating and return to the bpf program. Other return values are not used for now. Currently, this helper is only available with the JIT. It is possible to make it work with the interpreter with some effort, but I leave that as future work.
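To make the calling convention concrete, here is a minimal BPF-side sketch (not part of the patch; the map definition, section name, callback name and context layout are illustrative assumptions, and a libbpf new enough to expose the helper is assumed):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u64);
} counters SEC(".maps");

struct cb_ctx {
	__u64 sum;	/* output written back through the caller's stack */
};

/* The callback must be a static function; returning 1 stops the iteration. */
static long sum_elem(struct bpf_map *map, __u32 *key, __u64 *val,
		     struct cb_ctx *ctx)
{
	ctx->sum += *val;
	return 0;	/* 0: continue with the next element */
}

SEC("tc")
int sum_counters(struct __sk_buff *skb)
{
	struct cb_ctx ctx = { .sum = 0 };

	/* callback_ctx must point to the caller's stack; flags must be 0 */
	bpf_for_each_map_elem(&counters, sum_elem, &ctx, 0);
	return ctx.sum ? 1 : 0;
}

char _license[] SEC("license") = "GPL";

Note that, per the verifier changes below, the program must carry BTF func_info (e.g. be built with clang -g -target bpf) so the static callback can be resolved, and the kernel must JIT the program.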
[1]: https://lore.kernel.org/bpf/20210122205415.113822-1-xiyou.wangcong@gmail.com/ Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204925.3884923-1-yhs@fb.com --- include/linux/bpf.h | 13 +++ include/linux/bpf_verifier.h | 3 + include/uapi/linux/bpf.h | 38 ++++++++ kernel/bpf/bpf_iter.c | 16 ++++ kernel/bpf/helpers.c | 2 + kernel/bpf/verifier.c | 208 ++++++++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 2 + tools/include/uapi/linux/bpf.h | 38 ++++++++ 8 files changed, 307 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1e4d2f60527..aeb1b93a4d75 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,6 +39,7 @@ struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; +struct bpf_func_state; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -129,6 +130,13 @@ struct bpf_map_ops { bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); + + int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee); + int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags); + /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; @@ -295,6 +303,8 @@ enum bpf_arg_type { ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ + ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ + ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ __BPF_ARG_TYPE_MAX, }; @@ -411,6 +421,8 @@ enum bpf_reg_type { PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ }; /* The information passed from prog-specific *_is_valid_access @@ -1887,6 +1899,7 @@ extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_for_each_map_elem_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 971b33aca13d..51c2ffa3d901 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -68,6 +68,8 @@ struct bpf_reg_state { unsigned long raw1; unsigned long raw2; } raw; + + u32 subprogno; /* for PTR_TO_FUNC */ }; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. 
@@ -204,6 +206,7 @@ struct bpf_func_state { int acquired_refs; struct bpf_reference_state *refs; int allocated_stack; + bool in_callback_fn; struct bpf_stack_state *stack; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index a0d9eade9c80..931870f9cf56 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) */ return ret == 0 ? 
0 : -EAGAIN; } + +BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, + void *, callback_ctx, u64, flags) +{ + return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); +} + +const struct bpf_func_proto bpf_for_each_map_elem_proto = { + .func = bpf_for_each_map_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 308427fe03a3..074800226327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbdca49ac6cc..53afe9461b03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -248,6 +254,7 @@ struct bpf_call_arg_meta { u32 btf_id; struct btf *ret_btf; u32 ret_btf_id; + u32 subprogno; }; struct btf *btf_vmlinux; @@ -427,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || + type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON; } @@ -469,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type) type == ARG_PTR_TO_MEM_OR_NULL || type == ARG_PTR_TO_CTX_OR_NULL || type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || + type == ARG_PTR_TO_STACK_OR_NULL; } /* Determine whether the function releases some resources allocated by another @@ -552,6 +561,8 @@ static const char * const reg_type_str[] = { [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", [PTR_TO_RDWR_BUF] = "rdwr_buf", [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", }; static char slot_type_char[] = { @@ -623,6 +634,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_KEY || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose(env, ",ks=%d,vs=%d", @@ -1555,6 +1567,19 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. 
The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { + if (bpf_pseudo_func(insn + i)) { + if (!env->bpf_capable) { + verbose(env, + "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + return -EPERM; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + /* remember subprog */ + insn[i + 1].imm = ret; + continue; + } if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { @@ -2286,6 +2311,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_MEM_OR_NULL: + case PTR_TO_FUNC: + case PTR_TO_MAP_KEY: return true; default: return false; @@ -2890,6 +2917,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, reg = &cur_regs(env)[regno]; switch (reg->type) { + case PTR_TO_MAP_KEY: + verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", mem_size, off, size); @@ -3295,6 +3326,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_FLOW_KEYS: pointer_desc = "flow keys "; break; + case PTR_TO_MAP_KEY: + pointer_desc = "key "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -3396,7 +3430,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (!bpf_pseudo_call(insn + i)) + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3833,7 +3867,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* for access checks, reg->off is just part of off */ off += reg->off; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_KEY) { + if (t == BPF_WRITE) { + verbose(env, "write to change key R%d not allowed\n", regno); + return -EACCES; + } + + err = check_mem_region_access(env, regno, off, size, + reg->map_ptr->key_size, false); + if (err) + return err; + if (value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4249,6 +4295,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MAP_KEY: + return check_mem_region_access(env, regno, reg->off, access_size, + reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, reg->off, access_size, meta && meta->raw_mode ? 
BPF_WRITE : @@ -4465,6 +4514,7 @@ static const struct bpf_reg_types map_key_value_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4496,6 +4546,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_RDONLY_BUF, @@ -4508,6 +4559,7 @@ static const struct bpf_reg_types int_ptr_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4520,6 +4572,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4548,6 +4602,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, + [ARG_PTR_TO_FUNC] = &func_ptr_types, + [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4729,6 +4785,8 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_FUNC) { + meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. @@ -5375,6 +5433,35 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); } +static int set_map_elem_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map; + int err; + + if (bpf_map_ptr_poisoned(insn_aux)) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + + map = BPF_MAP_PTR(insn_aux->map_ptr_state); + if (!map->ops->map_set_for_each_callback_args || + !map->ops->map_for_each_callback) { + verbose(env, "callback function not allowed for map\n"); + return -ENOTSUPP; + } + + err = map->ops->map_set_for_each_callback_args(env, caller, callee); + if (err) + return err; + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -5397,8 +5484,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) state->curframe--; caller = state->frame[state->curframe]; - /* return to the caller whatever r0 had in the callee */ - caller->regs[BPF_REG_0] = *r0; + if (callee->in_callback_fn) { + /* enforce R0 return value range [0, 1]. 
*/ + struct tnum range = tnum_range(0, 1); + + if (r0->type != SCALAR_VALUE) { + verbose(env, "R0 not a scalar value\n"); + return -EACCES; + } + if (!tnum_in(range, r0->var_off)) { + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + return -EINVAL; + } + } else { + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + } /* Transfer references to the caller */ err = transfer_reference_state(caller, callee); @@ -5453,7 +5554,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && - func_id != BPF_FUNC_map_peek_elem) + func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_for_each_map_elem) return 0; if (map == NULL) { @@ -5534,15 +5636,18 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? -EINVAL : 0; } -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; + int insn_idx = *insn_idx_p; bool changes_data; - int i, err; + int i, err, func_id; /* find function prototype */ + func_id = insn->imm; if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); @@ -5638,6 +5743,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (func_id == BPF_FUNC_for_each_map_elem) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_map_elem_callback_state); + if (err < 0) + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -5891,6 +6003,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, else *ptr_limit = -off; return 0; + case PTR_TO_MAP_KEY: + /* Currently, this code is not exercised as the only use + * is bpf_for_each_map_elem() helper which requires + * bpf_capble. The code has been tested manually for + * future use. 
+ */ + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->key_size - off; + } + return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { *ptr_limit = ptr_reg->umax_value + ptr_reg->off; @@ -6092,6 +6217,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", @@ -8271,6 +8397,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + if (insn->src_reg == BPF_PSEUDO_FUNC) { + struct bpf_prog_aux *aux = env->prog->aux; + u32 subprogno = insn[1].imm; + + if (!aux->func_info) { + verbose(env, "missing btf func_info\n"); + return -EINVAL; + } + if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) { + verbose(env, "callback function not static\n"); + return -EINVAL; + } + + dst_reg->type = PTR_TO_FUNC; + dst_reg->subprogno = subprogno; + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; @@ -8657,6 +8801,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; int ret; + if (bpf_pseudo_func(insns + t)) + return visit_func_call_insn(t, insn_cnt, insns, env, true); + /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insns[t].code) != BPF_JMP && BPF_CLASS(insns[t].code) != BPF_JMP32) @@ -9277,6 +9424,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, */ return false; } + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. @@ -10123,10 +10271,9 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, env->insn_idx); + err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; - } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || @@ -10555,6 +10702,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) goto next_insn; } + if (insn[0].src_reg == BPF_PSEUDO_FUNC) { + aux = &env->insn_aux_data[i]; + aux->ptr_type = PTR_TO_FUNC; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. 
*/ @@ -10687,9 +10840,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i; - for (i = 0; i < insn_cnt; i++, insn++) - if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) - insn->src_reg = 0; + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + if (insn->src_reg == BPF_PSEUDO_FUNC) + continue; + insn->src_reg = 0; + } } /* single env->prog->insni[off] instruction was replaced with the range @@ -11330,6 +11487,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + env->insn_aux_data[i].call_imm = insn->imm; + /* subprog is encoded in insn[1].imm */ + continue; + } + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but @@ -11459,6 +11622,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { + if (bpf_pseudo_func(insn)) { + subprog = insn[1].imm; + insn[0].imm = (u32)(long)func[subprog]->bpf_func; + insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; + continue; + } if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; @@ -11504,6 +11673,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. */ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + insn[0].imm = env->insn_aux_data[i].call_imm; + insn[1].imm = find_subprog(env, i + insn[0].imm + 1); + continue; + } if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; @@ -11568,6 +11742,14 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* When JIT fails the progs with callback calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "callbacks are not allowed in non-JITed programs\n"); + return -EINVAL; + } + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e9701744d8e4..0d23755c2747 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1371,6 +1371,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. 
+ */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 8fd886911a6a99acf4a8facf619a2e7b5225be78 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:47 +0100 Subject: bpf: Add BTF_KIND_FLOAT to uapi Add a new kind value and expand the kind bitfield. 
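To make the new encoding concrete, here is a hedged sketch of a raw BTF entry for a 4-byte floating-point type (the string-table offset is an assumed placeholder, not taken from the patch):

#include <linux/btf.h>

/* Hypothetical raw BTF entry; name_off is assumed to point at the
 * string "float" in the BTF string section.
 */
static const struct btf_type btf_float = {
	.name_off = 1,
	.info	  = (BTF_KIND_FLOAT << 24),	/* kind = 16, vlen = 0, kflag = 0 */
	.size	  = 4,				/* FLOAT uses the size member */
};

The widened mask matters because BTF_KIND_FLOAT is 16 (0x10): with the old 0x0f mask, BTF_INFO_KIND() would truncate it to 0, while the new 0x1f mask decodes it correctly.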
Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-2-iii@linux.ibm.com --- include/uapi/linux/btf.h | 5 +++-- tools/include/uapi/linux/btf.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately -- cgit v1.2.3 From 7799e4d9d84f6f8231dfd9dca4da5f4b2f0aa932 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:33 -0800 Subject: bpf: Import syscall arg documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These descriptions are present in the man-pages project from the original submissions around 2015-2016. Import them so that they can be kept up to date as developers extend the bpf syscall commands. These descriptions follow the pattern used by scripts/bpf_helpers_doc.py so that we can take advantage of the parser to generate more up-to-date man page writing based upon these headers. Some minor wording adjustments were made to make the descriptions more consistent for the description / return format. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-2-joe@cilium.io Co-authored-by: Alexei Starovoitov Co-authored-by: Michael Kerrisk --- include/uapi/linux/bpf.h | 122 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b89af20cfa19..fb16c590e6d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -93,7 +93,127 @@ union bpf_iter_link_info { } map; }; -/* BPF syscall commands, see bpf(2) man-page for details. */ +/* BPF syscall commands, see bpf(2) man-page for more details. 
*/ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. 
+ * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * For example, after **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. In addition, file descriptors + * referring to eBPF objects can be transferred over UNIX domain sockets. + * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. An eBPF object is + * deallocated only after all file descriptors referring to the object + * have been closed. + */ enum bpf_cmd { BPF_MAP_CREATE, BPF_MAP_LOOKUP_ELEM, -- cgit v1.2.3 From f67c9cbf6c581468f6c7144d497565cfc7918c31 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:34 -0800 Subject: bpf: Add minimal bpf() command documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce high-level descriptions of the intent and return codes of the bpf() syscall commands. Subsequent patches may further flesh out the content to provide a more useful programming reference. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-3-joe@cilium.io --- include/uapi/linux/bpf.h | 368 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 368 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fb16c590e6d9..052bbfe65f77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -204,6 +204,374 @@ union bpf_iter_link_info { * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run an eBPF program a number of times against a provided + * program context and return the modified program context and + * duration of the test run. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. 
If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. 
+ * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Iterate and update multiple elements in a map. 
+ * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_DELETE_BATCH + * Description + * Iterate and delete multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. 
This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * For example, after **fork**\ (2), the child inherits file descriptors -- cgit v1.2.3 From 6690523bccb3e44cfcc4b2c995767e6814046e34 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:35 -0800 Subject: bpf: Document BPF_F_LOCK in syscall commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the meaning of the BPF_F_LOCK flag for the map lookup/update descriptions. Based on commit 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag"). Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-4-joe@cilium.io --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 052bbfe65f77..eb9f059f0569 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -123,6 +123,14 @@ union bpf_iter_link_info { * Look up an element with a given *key* in the map referred to * by the file descriptor *map_fd*. * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -140,6 +148,8 @@ union bpf_iter_link_info { * Create a new element only if it did not exist. * **BPF_EXIST** * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. * * Return * Returns zero on success. On error, -1 is returned and *errno* -- cgit v1.2.3 From 8aacb3c8d1a32b23c82645051bba55f0ae6c103b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:36 -0800 Subject: bpf: Document BPF_PROG_PIN syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b2197755b263 ("bpf: add support for persistent maps/progs") contains the original implementation and git logs, used as reference for this documentation. Also pull in the filename restriction as documented in commit 6d8cb045cde6 ("bpf: comment why dots in filenames under BPF virtual FS are not allowed") Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-5-joe@cilium.io --- include/uapi/linux/bpf.h | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb9f059f0569..6946dde90c56 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -219,6 +219,22 @@ union bpf_iter_link_info { * Pin an eBPF program or map referred by the specified *bpf_fd* * to the provided *pathname* on the filesystem. * + * The *pathname* argument must not contain a dot ("."). 
+ * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -584,13 +600,19 @@ union bpf_iter_link_info { * * NOTES * eBPF objects (maps and programs) can be shared between processes. - * For example, after **fork**\ (2), the child inherits file descriptors - * referring to the same eBPF objects. In addition, file descriptors - * referring to eBPF objects can be transferred over UNIX domain sockets. - * File descriptors referring to eBPF objects can be duplicated in the - * usual way, using **dup**\ (2) and similar calls. An eBPF object is - * deallocated only after all file descriptors referring to the object - * have been closed. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). */ enum bpf_cmd { BPF_MAP_CREATE, -- cgit v1.2.3 From 32e76b187a90de5809d68c2ef3e3964176dacaf0 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:37 -0800 Subject: bpf: Document BPF_PROG_ATTACH syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the prog attach command in more detail, based on git commits: * commit f4324551489e ("bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands") * commit 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") * commit f4364dcfc86d ("media: rc: introduce BPF_PROG_LIRC_MODE2") * commit d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-6-joe@cilium.io --- include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6946dde90c56..a8f2964ec885 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -253,6 +253,43 @@ union bpf_iter_link_info { * Attach an eBPF program to a *target_fd* at the specified * *attach_type* hook. * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). 
+ * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. -- cgit v1.2.3 From 2a3fdca4e3bc7a01316277ba26f4090c4b19bf7c Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:38 -0800 Subject: bpf: Document BPF_PROG_TEST_RUN syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on a brief read of the corresponding source code. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-7-joe@cilium.io --- include/uapi/linux/bpf.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a8f2964ec885..a6cd6650e23d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -306,14 +306,22 @@ union bpf_iter_link_info { * * BPF_PROG_TEST_RUN * Description - * Run an eBPF program a number of times against a provided - * program context and return the modified program context and - * duration of the test run. + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * * BPF_PROG_GET_NEXT_ID * Description * Fetch the next eBPF program currently loaded into the kernel. -- cgit v1.2.3 From 5d999994e05d62d4f53059540652014cf83cddfe Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:39 -0800 Subject: bpf: Document BPF_PROG_QUERY syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 468e2f64d220 ("bpf: introduce BPF_PROG_QUERY command") originally introduced this, but there have been several additions since then. Unlike BPF_PROG_ATTACH, it appears that the sockmap progs are not able to be queried so far. 
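For context, a minimal userspace sketch of the query this documents, issued through the raw syscall (the cgroup path, attach type and buffer size are illustrative assumptions; error handling is omitted):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int main(void)
{
	__u32 prog_ids[16] = { 0 };
	union bpf_attr attr;
	int cg_fd;

	cg_fd = open("/sys/fs/cgroup", O_RDONLY);	/* cgroup v2 mount point (assumed) */

	memset(&attr, 0, sizeof(attr));
	attr.query.target_fd   = cg_fd;
	attr.query.attach_type = BPF_CGROUP_INET_INGRESS;
	attr.query.prog_ids    = (__u64)(unsigned long)prog_ids;
	attr.query.prog_cnt    = 16;	/* capacity of prog_ids; updated by the kernel */

	if (syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr)) == 0)
		printf("%u program(s) attached\n", attr.query.prog_cnt);
	return 0;
}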
Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-8-joe@cilium.io --- include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a6cd6650e23d..0cf92ef011f1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -389,6 +389,43 @@ union bpf_iter_link_info { * Obtain information about eBPF programs associated with the * specified *attach_type* hook. * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. -- cgit v1.2.3 From 0cb804547927c05f6aa7e28c8d4a1e02fec1a6d4 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:40 -0800 Subject: bpf: Document BPF_MAP_*_BATCH syscall commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based roughly on the following commits: * Commit cb4d03ab499d ("bpf: Add generic support for lookup batch op") * Commit 057996380a42 ("bpf: Add batch ops to all htab bpf map") * Commit aa2e93b8e58e ("bpf: Add generic support for update and delete batch ops") Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Brian Vazquez Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-9-joe@cilium.io --- include/uapi/linux/bpf.h | 114 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cf92ef011f1..c8b9d19fce22 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -553,13 +553,55 @@ union bpf_iter_link_info { * Description * Iterate and fetch multiple elements in a map. * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. 
After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * * BPF_MAP_LOOKUP_AND_DELETE_BATCH * Description - * Iterate and delete multiple elements in a map. + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. * * Return * Returns zero on success. On error, -1 is returned and *errno* @@ -567,15 +609,81 @@ union bpf_iter_link_info { * * BPF_MAP_UPDATE_BATCH * Description - * Iterate and update multiple elements in a map. + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. 
**E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * * BPF_MAP_DELETE_BATCH * Description - * Iterate and delete multiple elements in a map. + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may be been + * deleted. * * Return * Returns zero on success. On error, -1 is returned and *errno* -- cgit v1.2.3 From 923a932c982fd71856f80dbeaaa3ca41a75e89e0 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:41 -0800 Subject: scripts/bpf: Abstract eBPF API target parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract out the target parameter so that upcoming commits, more than just the existing "helpers" target can be called to generate specific portions of docs from the eBPF UAPI headers. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-10-joe@cilium.io --- MAINTAINERS | 1 + include/uapi/linux/bpf.h | 2 +- scripts/bpf_doc.py | 650 +++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 615 -------------------------------------- tools/bpf/Makefile.helpers | 2 +- tools/include/uapi/linux/bpf.h | 2 +- tools/lib/bpf/Makefile | 2 +- tools/perf/MANIFEST | 2 +- 8 files changed, 656 insertions(+), 620 deletions(-) create mode 100755 scripts/bpf_doc.py delete mode 100755 scripts/bpf_helpers_doc.py (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index a50a543e3c81..8d56c7044067 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3223,6 +3223,7 @@ F: net/core/filter.c F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ +F: scripts/bpf_doc.py F: tools/bpf/ F: tools/lib/bpf/ F: tools/testing/selftests/bpf/ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8b9d19fce22..63a56ed6a785 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1439,7 +1439,7 @@ union bpf_attr { * parsed and used to produce a manual page. 
The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py new file mode 100755 index 000000000000..5a4f68aab335 --- /dev/null +++ b/scripts/bpf_doc.py @@ -0,0 +1,650 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (C) 2018-2019 Netronome Systems, Inc. +# Copyright (C) 2021 Isovalent, Inc. + +# In case user attempts to run with Python 2. +from __future__ import print_function + +import argparse +import re +import sys, os + +class NoHelperFound(BaseException): + pass + +class ParsingError(BaseException): + def __init__(self, line='', reader=None): + if reader: + BaseException.__init__(self, + 'Error at file offset %d, parsing line: %s' % + (reader.tell(), line)) + else: + BaseException.__init__(self, 'Error parsing line: %s' % line) + +class Helper(object): + """ + An object representing the description of an eBPF helper function. + @proto: function prototype of the helper function + @desc: textual description of the helper function + @ret: description of the return value of the helper function + """ + def __init__(self, proto='', desc='', ret=''): + self.proto = proto + self.desc = desc + self.ret = ret + + def proto_break_down(self): + """ + Break down helper function protocol into smaller chunks: return type, + name, distincts arguments. + """ + arg_re = re.compile('((\w+ )*?(\w+|...))( (\**)(\w+))?$') + res = {} + proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') + + capture = proto_re.match(self.proto) + res['ret_type'] = capture.group(1) + res['ret_star'] = capture.group(2) + res['name'] = capture.group(3) + res['args'] = [] + + args = capture.group(4).split(', ') + for a in args: + capture = arg_re.match(a) + res['args'].append({ + 'type' : capture.group(1), + 'star' : capture.group(5), + 'name' : capture.group(6) + }) + + return res + +class HeaderParser(object): + """ + An object used to parse a file in order to extract the documentation of a + list of eBPF helper functions. All the helpers that can be retrieved are + stored as Helper object, in the self.helpers() array. + @filename: name of file to parse, usually include/uapi/linux/bpf.h in the + kernel tree + """ + def __init__(self, filename): + self.reader = open(filename, 'r') + self.line = '' + self.helpers = [] + + def parse_helper(self): + proto = self.parse_proto() + desc = self.parse_desc() + ret = self.parse_ret() + return Helper(proto=proto, desc=desc, ret=ret) + + def parse_proto(self): + # Argument can be of shape: + # - "void" + # - "type name" + # - "type *name" + # - Same as above, with "const" and/or "struct" in front of type + # - "..." (undefined number of arguments, for bpf_trace_printk()) + # There is at least one term ("void"), and at most five arguments. + p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') + capture = p.match(self.line) + if not capture: + raise NoHelperFound + self.line = self.reader.readline() + return capture.group(1) + + def parse_desc(self): + p = re.compile(' \* ?(?:\t| {5,8})Description$') + capture = p.match(self.line) + if not capture: + # Helper can have empty description and we might be parsing another + # attribute: return but do not consume. 
+ return '' + # Description can be several lines, some of them possibly empty, and it + # stops when another subsection title is met. + desc = '' + while True: + self.line = self.reader.readline() + if self.line == ' *\n': + desc += '\n' + else: + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') + capture = p.match(self.line) + if capture: + desc += capture.group(1) + '\n' + else: + break + return desc + + def parse_ret(self): + p = re.compile(' \* ?(?:\t| {5,8})Return$') + capture = p.match(self.line) + if not capture: + # Helper can have empty retval and we might be parsing another + # attribute: return but do not consume. + return '' + # Return value description can be several lines, some of them possibly + # empty, and it stops when another subsection title is met. + ret = '' + while True: + self.line = self.reader.readline() + if self.line == ' *\n': + ret += '\n' + else: + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') + capture = p.match(self.line) + if capture: + ret += capture.group(1) + '\n' + else: + break + return ret + + def run(self): + # Advance to start of helper function descriptions. + offset = self.reader.read().find('* Start of BPF helper function descriptions:') + if offset == -1: + raise Exception('Could not find start of eBPF helper descriptions list') + self.reader.seek(offset) + self.reader.readline() + self.reader.readline() + self.line = self.reader.readline() + + while True: + try: + helper = self.parse_helper() + self.helpers.append(helper) + except NoHelperFound: + break + + self.reader.close() + +############################################################################### + +class Printer(object): + """ + A generic class for printers. Printers should be created with an array of + Helper objects, and implement a way to print them in the desired fashion. + @parser: A HeaderParser with objects to print to standard output + """ + def __init__(self, parser): + self.parser = parser + self.elements = [] + + def print_header(self): + pass + + def print_footer(self): + pass + + def print_one(self, helper): + pass + + def print_all(self): + self.print_header() + for elem in self.elements: + self.print_one(elem) + self.print_footer() + + +class PrinterRST(Printer): + """ + A generic class for printers that print ReStructured Text. Printers should + be created with a HeaderParser object, and implement a way to print API + elements in the desired fashion. + @parser: A HeaderParser with objects to print to standard output + """ + def __init__(self, parser): + self.parser = parser + + def print_license(self): + license = '''\ +.. Copyright (C) All BPF authors and contributors from 2014 to present. +.. See git log include/uapi/linux/bpf.h in kernel tree for details. +.. +.. %%%LICENSE_START(VERBATIM) +.. Permission is granted to make and distribute verbatim copies of this +.. manual provided the copyright notice and this permission notice are +.. preserved on all copies. +.. +.. Permission is granted to copy and distribute modified versions of this +.. manual under the conditions for verbatim copying, provided that the +.. entire resulting derived work is distributed under the terms of a +.. permission notice identical to this one. +.. +.. Since the Linux kernel and libraries are constantly changing, this +.. manual page may be incorrect or out-of-date. The author(s) assume no +.. responsibility for errors or omissions, or for damages resulting from +.. the use of the information contained herein. The author(s) may not +.. 
have taken the same level of care in the production of this manual, +.. which is licensed free of charge, as they might when working +.. professionally. +.. +.. Formatted or processed versions of this manual, if unaccompanied by +.. the source, must acknowledge the copyright and authors of this work. +.. %%%LICENSE_END +.. +.. Please do not edit this file. It was generated from the documentation +.. located in file include/uapi/linux/bpf.h of the Linux kernel sources +.. (helpers description), and from scripts/bpf_doc.py in the same +.. repository (header and footer). +''' + print(license) + + def print_elem(self, elem): + if (elem.desc): + print('\tDescription') + # Do not strip all newline characters: formatted code at the end of + # a section must be followed by a blank line. + for line in re.sub('\n$', '', elem.desc, count=1).split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + if (elem.ret): + print('\tReturn') + for line in elem.ret.rstrip().split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + print('') + + +class PrinterHelpersRST(PrinterRST): + """ + A printer for dumping collected information about helpers as a ReStructured + Text page compatible with the rst2man program, which can be used to + generate a manual page for the helpers. + @parser: A HeaderParser with Helper objects to print to standard output + """ + def __init__(self, parser): + self.elements = parser.helpers + + def print_header(self): + header = '''\ +=========== +BPF-HELPERS +=========== +------------------------------------------------------------------------------- +list of eBPF helper functions +------------------------------------------------------------------------------- + +:Manual section: 7 + +DESCRIPTION +=========== + +The extended Berkeley Packet Filter (eBPF) subsystem consists in programs +written in a pseudo-assembly language, then attached to one of the several +kernel hooks and run in reaction of specific events. This framework differs +from the older, "classic" BPF (or "cBPF") in several aspects, one of them being +the ability to call special functions (or "helpers") from within a program. +These functions are restricted to a white-list of helpers defined in the +kernel. + +These helpers are used by eBPF programs to interact with the system, or with +the context in which they work. For instance, they can be used to print +debugging messages, to get the time since the system was booted, to interact +with eBPF maps, or to manipulate network packets. Since there are several eBPF +program types, and that they do not run in the same context, each program type +can only call a subset of those helpers. + +Due to eBPF conventions, a helper can not have more than five arguments. + +Internally, eBPF programs call directly into the compiled helper functions +without requiring any foreign-function interface. As a result, calling helpers +introduces no overhead, thus offering excellent performance. + +This document is an attempt to list and document the helpers available to eBPF +developers. They are sorted by chronological order (the oldest helpers in the +kernel at the top). 
+ +HELPERS +======= +''' + PrinterRST.print_license(self) + print(header) + + def print_footer(self): + footer = ''' +EXAMPLES +======== + +Example usage for most of the eBPF helpers listed in this manual page are +available within the Linux kernel sources, at the following locations: + +* *samples/bpf/* +* *tools/testing/selftests/bpf/* + +LICENSE +======= + +eBPF programs can have an associated license, passed along with the bytecode +instructions to the kernel when the programs are loaded. The format for that +string is identical to the one in use for kernel modules (Dual licenses, such +as "Dual BSD/GPL", may be used). Some helper functions are only accessible to +programs that are compatible with the GNU Privacy License (GPL). + +In order to use such helpers, the eBPF program must be loaded with the correct +license string passed (via **attr**) to the **bpf**\ () system call, and this +generally translates into the C source code of the program containing a line +similar to the following: + +:: + + char ____license[] __attribute__((section("license"), used)) = "GPL"; + +IMPLEMENTATION +============== + +This manual page is an effort to document the existing eBPF helper functions. +But as of this writing, the BPF sub-system is under heavy development. New eBPF +program or map types are added, along with new helper functions. Some helpers +are occasionally made available for additional program types. So in spite of +the efforts of the community, this page might not be up-to-date. If you want to +check by yourself what helper functions exist in your kernel, or what types of +programs they can support, here are some files among the kernel tree that you +may be interested in: + +* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list + of all helper functions, as well as many other BPF definitions including most + of the flags, structs or constants used by the helpers. +* *net/core/filter.c* contains the definition of most network-related helper + functions, and the list of program types from which they can be used. +* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related + helpers. +* *kernel/bpf/verifier.c* contains the functions used to check that valid types + of eBPF maps are used with a given helper function. +* *kernel/bpf/* directory contains other files in which additional helpers are + defined (for cgroups, sockmaps, etc.). +* The bpftool utility can be used to probe the availability of helper functions + on the system (as well as supported program and map types, and a number of + other parameters). To do so, run **bpftool feature probe** (see + **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to + list features available to unprivileged users. + +Compatibility between helper functions and program types can generally be found +in the files where helper functions are defined. Look for the **struct +bpf_func_proto** objects and for functions returning them: these functions +contain a list of helpers that a given program type can call. Note that the +**default:** label of the **switch ... case** used to filter helpers can call +other functions, themselves allowing access to additional helpers. The +requirement for GPL license is also in those **struct bpf_func_proto**. + +Compatibility between helper functions and map types can be found in the +**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*. 
+ +Helper functions that invalidate the checks on **data** and **data_end** +pointers for network processing are listed in function +**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*. + +SEE ALSO +======== + +**bpf**\ (2), +**bpftool**\ (8), +**cgroups**\ (7), +**ip**\ (8), +**perf_event_open**\ (2), +**sendmsg**\ (2), +**socket**\ (7), +**tc-bpf**\ (8)''' + print(footer) + + def print_proto(self, helper): + """ + Format function protocol with bold and italics markers. This makes RST + file less readable, but gives nice results in the manual page. + """ + proto = helper.proto_break_down() + + print('**%s %s%s(' % (proto['ret_type'], + proto['ret_star'].replace('*', '\\*'), + proto['name']), + end='') + + comma = '' + for a in proto['args']: + one_arg = '{}{}'.format(comma, a['type']) + if a['name']: + if a['star']: + one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*')) + else: + one_arg += '** ' + one_arg += '*{}*\\ **'.format(a['name']) + comma = ', ' + print(one_arg, end='') + + print(')**') + + def print_one(self, helper): + self.print_proto(helper) + self.print_elem(helper) + + + + +class PrinterHelpers(Printer): + """ + A printer for dumping collected information about helpers as C header to + be included from BPF program. + @parser: A HeaderParser with Helper objects to print to standard output + """ + def __init__(self, parser): + self.elements = parser.helpers + + type_fwds = [ + 'struct bpf_fib_lookup', + 'struct bpf_sk_lookup', + 'struct bpf_perf_event_data', + 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', + 'struct bpf_sock', + 'struct bpf_sock_addr', + 'struct bpf_sock_ops', + 'struct bpf_sock_tuple', + 'struct bpf_spin_lock', + 'struct bpf_sysctl', + 'struct bpf_tcp_sock', + 'struct bpf_tunnel_key', + 'struct bpf_xfrm_state', + 'struct linux_binprm', + 'struct pt_regs', + 'struct sk_reuseport_md', + 'struct sockaddr', + 'struct tcphdr', + 'struct seq_file', + 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', + 'struct udp6_sock', + 'struct task_struct', + + 'struct __sk_buff', + 'struct sk_msg_md', + 'struct xdp_md', + 'struct path', + 'struct btf_ptr', + 'struct inode', + 'struct socket', + 'struct file', + ] + known_types = { + '...', + 'void', + 'const void', + 'char', + 'const char', + 'int', + 'long', + 'unsigned long', + + '__be16', + '__be32', + '__wsum', + + 'struct bpf_fib_lookup', + 'struct bpf_perf_event_data', + 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', + 'struct bpf_sk_lookup', + 'struct bpf_sock', + 'struct bpf_sock_addr', + 'struct bpf_sock_ops', + 'struct bpf_sock_tuple', + 'struct bpf_spin_lock', + 'struct bpf_sysctl', + 'struct bpf_tcp_sock', + 'struct bpf_tunnel_key', + 'struct bpf_xfrm_state', + 'struct linux_binprm', + 'struct pt_regs', + 'struct sk_reuseport_md', + 'struct sockaddr', + 'struct tcphdr', + 'struct seq_file', + 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', + 'struct udp6_sock', + 'struct task_struct', + 'struct path', + 'struct btf_ptr', + 'struct inode', + 'struct socket', + 'struct file', + } + mapped_types = { + 'u8': '__u8', + 'u16': '__u16', + 'u32': '__u32', + 'u64': '__u64', + 's8': '__s8', + 's16': '__s16', + 's32': '__s32', + 's64': '__s64', + 'size_t': 'unsigned long', + 'struct bpf_map': 'void', + 'struct sk_buff': 'struct __sk_buff', + 'const struct sk_buff': 'const struct __sk_buff', + 'struct sk_msg_buff': 'struct 
sk_msg_md', + 'struct xdp_buff': 'struct xdp_md', + } + # Helpers overloaded for different context types. + overloaded_helpers = [ + 'bpf_get_socket_cookie', + 'bpf_sk_assign', + ] + + def print_header(self): + header = '''\ +/* This is auto-generated file. See bpf_doc.py for details. */ + +/* Forward declarations of BPF structs */''' + + print(header) + for fwd in self.type_fwds: + print('%s;' % fwd) + print('') + + def print_footer(self): + footer = '' + print(footer) + + def map_type(self, t): + if t in self.known_types: + return t + if t in self.mapped_types: + return self.mapped_types[t] + print("Unrecognized type '%s', please add it to known types!" % t, + file=sys.stderr) + sys.exit(1) + + seen_helpers = set() + + def print_one(self, helper): + proto = helper.proto_break_down() + + if proto['name'] in self.seen_helpers: + return + self.seen_helpers.add(proto['name']) + + print('/*') + print(" * %s" % proto['name']) + print(" *") + if (helper.desc): + # Do not strip all newline characters: formatted code at the end of + # a section must be followed by a blank line. + for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): + print(' *{}{}'.format(' \t' if line else '', line)) + + if (helper.ret): + print(' *') + print(' * Returns') + for line in helper.ret.rstrip().split('\n'): + print(' *{}{}'.format(' \t' if line else '', line)) + + print(' */') + print('static %s %s(*%s)(' % (self.map_type(proto['ret_type']), + proto['ret_star'], proto['name']), end='') + comma = '' + for i, a in enumerate(proto['args']): + t = a['type'] + n = a['name'] + if proto['name'] in self.overloaded_helpers and i == 0: + t = 'void' + n = 'ctx' + one_arg = '{}{}'.format(comma, self.map_type(t)) + if n: + if a['star']: + one_arg += ' {}'.format(a['star']) + else: + one_arg += ' ' + one_arg += '{}'.format(n) + comma = ', ' + print(one_arg, end='') + + print(') = (void *) %d;' % len(self.seen_helpers)) + print('') + +############################################################################### + +# If script is launched from scripts/ from kernel tree and can access +# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse, +# otherwise the --filename argument will be required from the command line. +script = os.path.abspath(sys.argv[0]) +linuxRoot = os.path.dirname(os.path.dirname(script)) +bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') + +printers = { + 'helpers': PrinterHelpersRST, +} + +argParser = argparse.ArgumentParser(description=""" +Parse eBPF header file and generate documentation for the eBPF API. +The RST-formatted output produced can be turned into a manual page with the +rst2man utility. +""") +argParser.add_argument('--header', action='store_true', + help='generate C header file') +if (os.path.isfile(bpfh)): + argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h', + default=bpfh) +else: + argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h') +argParser.add_argument('target', nargs='?', default='helpers', + choices=printers.keys(), help='eBPF API target') +args = argParser.parse_args() + +# Parse file. +headerParser = HeaderParser(args.filename) +headerParser.run() + +# Print formatted output to standard output. 
+if args.header: + printer = PrinterHelpers(headerParser) +else: + printer = printers[args.target](headerParser) +printer.print_all() diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py deleted file mode 100755 index 867ada23281c..000000000000 --- a/scripts/bpf_helpers_doc.py +++ /dev/null @@ -1,615 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0-only -# -# Copyright (C) 2018-2019 Netronome Systems, Inc. - -# In case user attempts to run with Python 2. -from __future__ import print_function - -import argparse -import re -import sys, os - -class NoHelperFound(BaseException): - pass - -class ParsingError(BaseException): - def __init__(self, line='', reader=None): - if reader: - BaseException.__init__(self, - 'Error at file offset %d, parsing line: %s' % - (reader.tell(), line)) - else: - BaseException.__init__(self, 'Error parsing line: %s' % line) - -class Helper(object): - """ - An object representing the description of an eBPF helper function. - @proto: function prototype of the helper function - @desc: textual description of the helper function - @ret: description of the return value of the helper function - """ - def __init__(self, proto='', desc='', ret=''): - self.proto = proto - self.desc = desc - self.ret = ret - - def proto_break_down(self): - """ - Break down helper function protocol into smaller chunks: return type, - name, distincts arguments. - """ - arg_re = re.compile('((\w+ )*?(\w+|...))( (\**)(\w+))?$') - res = {} - proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') - - capture = proto_re.match(self.proto) - res['ret_type'] = capture.group(1) - res['ret_star'] = capture.group(2) - res['name'] = capture.group(3) - res['args'] = [] - - args = capture.group(4).split(', ') - for a in args: - capture = arg_re.match(a) - res['args'].append({ - 'type' : capture.group(1), - 'star' : capture.group(5), - 'name' : capture.group(6) - }) - - return res - -class HeaderParser(object): - """ - An object used to parse a file in order to extract the documentation of a - list of eBPF helper functions. All the helpers that can be retrieved are - stored as Helper object, in the self.helpers() array. - @filename: name of file to parse, usually include/uapi/linux/bpf.h in the - kernel tree - """ - def __init__(self, filename): - self.reader = open(filename, 'r') - self.line = '' - self.helpers = [] - - def parse_helper(self): - proto = self.parse_proto() - desc = self.parse_desc() - ret = self.parse_ret() - return Helper(proto=proto, desc=desc, ret=ret) - - def parse_proto(self): - # Argument can be of shape: - # - "void" - # - "type name" - # - "type *name" - # - Same as above, with "const" and/or "struct" in front of type - # - "..." (undefined number of arguments, for bpf_trace_printk()) - # There is at least one term ("void"), and at most five arguments. - p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') - capture = p.match(self.line) - if not capture: - raise NoHelperFound - self.line = self.reader.readline() - return capture.group(1) - - def parse_desc(self): - p = re.compile(' \* ?(?:\t| {5,8})Description$') - capture = p.match(self.line) - if not capture: - # Helper can have empty description and we might be parsing another - # attribute: return but do not consume. - return '' - # Description can be several lines, some of them possibly empty, and it - # stops when another subsection title is met. 
- desc = '' - while True: - self.line = self.reader.readline() - if self.line == ' *\n': - desc += '\n' - else: - p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') - capture = p.match(self.line) - if capture: - desc += capture.group(1) + '\n' - else: - break - return desc - - def parse_ret(self): - p = re.compile(' \* ?(?:\t| {5,8})Return$') - capture = p.match(self.line) - if not capture: - # Helper can have empty retval and we might be parsing another - # attribute: return but do not consume. - return '' - # Return value description can be several lines, some of them possibly - # empty, and it stops when another subsection title is met. - ret = '' - while True: - self.line = self.reader.readline() - if self.line == ' *\n': - ret += '\n' - else: - p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') - capture = p.match(self.line) - if capture: - ret += capture.group(1) + '\n' - else: - break - return ret - - def run(self): - # Advance to start of helper function descriptions. - offset = self.reader.read().find('* Start of BPF helper function descriptions:') - if offset == -1: - raise Exception('Could not find start of eBPF helper descriptions list') - self.reader.seek(offset) - self.reader.readline() - self.reader.readline() - self.line = self.reader.readline() - - while True: - try: - helper = self.parse_helper() - self.helpers.append(helper) - except NoHelperFound: - break - - self.reader.close() - -############################################################################### - -class Printer(object): - """ - A generic class for printers. Printers should be created with an array of - Helper objects, and implement a way to print them in the desired fashion. - @helpers: array of Helper objects to print to standard output - """ - def __init__(self, helpers): - self.helpers = helpers - - def print_header(self): - pass - - def print_footer(self): - pass - - def print_one(self, helper): - pass - - def print_all(self): - self.print_header() - for helper in self.helpers: - self.print_one(helper) - self.print_footer() - -class PrinterRST(Printer): - """ - A printer for dumping collected information about helpers as a ReStructured - Text page compatible with the rst2man program, which can be used to - generate a manual page for the helpers. - @helpers: array of Helper objects to print to standard output - """ - def print_header(self): - header = '''\ -.. Copyright (C) All BPF authors and contributors from 2014 to present. -.. See git log include/uapi/linux/bpf.h in kernel tree for details. -.. -.. %%%LICENSE_START(VERBATIM) -.. Permission is granted to make and distribute verbatim copies of this -.. manual provided the copyright notice and this permission notice are -.. preserved on all copies. -.. -.. Permission is granted to copy and distribute modified versions of this -.. manual under the conditions for verbatim copying, provided that the -.. entire resulting derived work is distributed under the terms of a -.. permission notice identical to this one. -.. -.. Since the Linux kernel and libraries are constantly changing, this -.. manual page may be incorrect or out-of-date. The author(s) assume no -.. responsibility for errors or omissions, or for damages resulting from -.. the use of the information contained herein. The author(s) may not -.. have taken the same level of care in the production of this manual, -.. which is licensed free of charge, as they might when working -.. professionally. -.. -.. Formatted or processed versions of this manual, if unaccompanied by -.. 
the source, must acknowledge the copyright and authors of this work. -.. %%%LICENSE_END -.. -.. Please do not edit this file. It was generated from the documentation -.. located in file include/uapi/linux/bpf.h of the Linux kernel sources -.. (helpers description), and from scripts/bpf_helpers_doc.py in the same -.. repository (header and footer). - -=========== -BPF-HELPERS -=========== -------------------------------------------------------------------------------- -list of eBPF helper functions -------------------------------------------------------------------------------- - -:Manual section: 7 - -DESCRIPTION -=========== - -The extended Berkeley Packet Filter (eBPF) subsystem consists in programs -written in a pseudo-assembly language, then attached to one of the several -kernel hooks and run in reaction of specific events. This framework differs -from the older, "classic" BPF (or "cBPF") in several aspects, one of them being -the ability to call special functions (or "helpers") from within a program. -These functions are restricted to a white-list of helpers defined in the -kernel. - -These helpers are used by eBPF programs to interact with the system, or with -the context in which they work. For instance, they can be used to print -debugging messages, to get the time since the system was booted, to interact -with eBPF maps, or to manipulate network packets. Since there are several eBPF -program types, and that they do not run in the same context, each program type -can only call a subset of those helpers. - -Due to eBPF conventions, a helper can not have more than five arguments. - -Internally, eBPF programs call directly into the compiled helper functions -without requiring any foreign-function interface. As a result, calling helpers -introduces no overhead, thus offering excellent performance. - -This document is an attempt to list and document the helpers available to eBPF -developers. They are sorted by chronological order (the oldest helpers in the -kernel at the top). - -HELPERS -======= -''' - print(header) - - def print_footer(self): - footer = ''' -EXAMPLES -======== - -Example usage for most of the eBPF helpers listed in this manual page are -available within the Linux kernel sources, at the following locations: - -* *samples/bpf/* -* *tools/testing/selftests/bpf/* - -LICENSE -======= - -eBPF programs can have an associated license, passed along with the bytecode -instructions to the kernel when the programs are loaded. The format for that -string is identical to the one in use for kernel modules (Dual licenses, such -as "Dual BSD/GPL", may be used). Some helper functions are only accessible to -programs that are compatible with the GNU Privacy License (GPL). - -In order to use such helpers, the eBPF program must be loaded with the correct -license string passed (via **attr**) to the **bpf**\ () system call, and this -generally translates into the C source code of the program containing a line -similar to the following: - -:: - - char ____license[] __attribute__((section("license"), used)) = "GPL"; - -IMPLEMENTATION -============== - -This manual page is an effort to document the existing eBPF helper functions. -But as of this writing, the BPF sub-system is under heavy development. New eBPF -program or map types are added, along with new helper functions. Some helpers -are occasionally made available for additional program types. So in spite of -the efforts of the community, this page might not be up-to-date. 
If you want to -check by yourself what helper functions exist in your kernel, or what types of -programs they can support, here are some files among the kernel tree that you -may be interested in: - -* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list - of all helper functions, as well as many other BPF definitions including most - of the flags, structs or constants used by the helpers. -* *net/core/filter.c* contains the definition of most network-related helper - functions, and the list of program types from which they can be used. -* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related - helpers. -* *kernel/bpf/verifier.c* contains the functions used to check that valid types - of eBPF maps are used with a given helper function. -* *kernel/bpf/* directory contains other files in which additional helpers are - defined (for cgroups, sockmaps, etc.). -* The bpftool utility can be used to probe the availability of helper functions - on the system (as well as supported program and map types, and a number of - other parameters). To do so, run **bpftool feature probe** (see - **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to - list features available to unprivileged users. - -Compatibility between helper functions and program types can generally be found -in the files where helper functions are defined. Look for the **struct -bpf_func_proto** objects and for functions returning them: these functions -contain a list of helpers that a given program type can call. Note that the -**default:** label of the **switch ... case** used to filter helpers can call -other functions, themselves allowing access to additional helpers. The -requirement for GPL license is also in those **struct bpf_func_proto**. - -Compatibility between helper functions and map types can be found in the -**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*. - -Helper functions that invalidate the checks on **data** and **data_end** -pointers for network processing are listed in function -**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*. - -SEE ALSO -======== - -**bpf**\ (2), -**bpftool**\ (8), -**cgroups**\ (7), -**ip**\ (8), -**perf_event_open**\ (2), -**sendmsg**\ (2), -**socket**\ (7), -**tc-bpf**\ (8)''' - print(footer) - - def print_proto(self, helper): - """ - Format function protocol with bold and italics markers. This makes RST - file less readable, but gives nice results in the manual page. - """ - proto = helper.proto_break_down() - - print('**%s %s%s(' % (proto['ret_type'], - proto['ret_star'].replace('*', '\\*'), - proto['name']), - end='') - - comma = '' - for a in proto['args']: - one_arg = '{}{}'.format(comma, a['type']) - if a['name']: - if a['star']: - one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*')) - else: - one_arg += '** ' - one_arg += '*{}*\\ **'.format(a['name']) - comma = ', ' - print(one_arg, end='') - - print(')**') - - def print_one(self, helper): - self.print_proto(helper) - - if (helper.desc): - print('\tDescription') - # Do not strip all newline characters: formatted code at the end of - # a section must be followed by a blank line. 
- for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) - - if (helper.ret): - print('\tReturn') - for line in helper.ret.rstrip().split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) - - print('') - -class PrinterHelpers(Printer): - """ - A printer for dumping collected information about helpers as C header to - be included from BPF program. - @helpers: array of Helper objects to print to standard output - """ - - type_fwds = [ - 'struct bpf_fib_lookup', - 'struct bpf_sk_lookup', - 'struct bpf_perf_event_data', - 'struct bpf_perf_event_value', - 'struct bpf_pidns_info', - 'struct bpf_redir_neigh', - 'struct bpf_sock', - 'struct bpf_sock_addr', - 'struct bpf_sock_ops', - 'struct bpf_sock_tuple', - 'struct bpf_spin_lock', - 'struct bpf_sysctl', - 'struct bpf_tcp_sock', - 'struct bpf_tunnel_key', - 'struct bpf_xfrm_state', - 'struct linux_binprm', - 'struct pt_regs', - 'struct sk_reuseport_md', - 'struct sockaddr', - 'struct tcphdr', - 'struct seq_file', - 'struct tcp6_sock', - 'struct tcp_sock', - 'struct tcp_timewait_sock', - 'struct tcp_request_sock', - 'struct udp6_sock', - 'struct task_struct', - - 'struct __sk_buff', - 'struct sk_msg_md', - 'struct xdp_md', - 'struct path', - 'struct btf_ptr', - 'struct inode', - 'struct socket', - 'struct file', - ] - known_types = { - '...', - 'void', - 'const void', - 'char', - 'const char', - 'int', - 'long', - 'unsigned long', - - '__be16', - '__be32', - '__wsum', - - 'struct bpf_fib_lookup', - 'struct bpf_perf_event_data', - 'struct bpf_perf_event_value', - 'struct bpf_pidns_info', - 'struct bpf_redir_neigh', - 'struct bpf_sk_lookup', - 'struct bpf_sock', - 'struct bpf_sock_addr', - 'struct bpf_sock_ops', - 'struct bpf_sock_tuple', - 'struct bpf_spin_lock', - 'struct bpf_sysctl', - 'struct bpf_tcp_sock', - 'struct bpf_tunnel_key', - 'struct bpf_xfrm_state', - 'struct linux_binprm', - 'struct pt_regs', - 'struct sk_reuseport_md', - 'struct sockaddr', - 'struct tcphdr', - 'struct seq_file', - 'struct tcp6_sock', - 'struct tcp_sock', - 'struct tcp_timewait_sock', - 'struct tcp_request_sock', - 'struct udp6_sock', - 'struct task_struct', - 'struct path', - 'struct btf_ptr', - 'struct inode', - 'struct socket', - 'struct file', - } - mapped_types = { - 'u8': '__u8', - 'u16': '__u16', - 'u32': '__u32', - 'u64': '__u64', - 's8': '__s8', - 's16': '__s16', - 's32': '__s32', - 's64': '__s64', - 'size_t': 'unsigned long', - 'struct bpf_map': 'void', - 'struct sk_buff': 'struct __sk_buff', - 'const struct sk_buff': 'const struct __sk_buff', - 'struct sk_msg_buff': 'struct sk_msg_md', - 'struct xdp_buff': 'struct xdp_md', - } - # Helpers overloaded for different context types. - overloaded_helpers = [ - 'bpf_get_socket_cookie', - 'bpf_sk_assign', - ] - - def print_header(self): - header = '''\ -/* This is auto-generated file. See bpf_helpers_doc.py for details. */ - -/* Forward declarations of BPF structs */''' - - print(header) - for fwd in self.type_fwds: - print('%s;' % fwd) - print('') - - def print_footer(self): - footer = '' - print(footer) - - def map_type(self, t): - if t in self.known_types: - return t - if t in self.mapped_types: - return self.mapped_types[t] - print("Unrecognized type '%s', please add it to known types!" 
% t, - file=sys.stderr) - sys.exit(1) - - seen_helpers = set() - - def print_one(self, helper): - proto = helper.proto_break_down() - - if proto['name'] in self.seen_helpers: - return - self.seen_helpers.add(proto['name']) - - print('/*') - print(" * %s" % proto['name']) - print(" *") - if (helper.desc): - # Do not strip all newline characters: formatted code at the end of - # a section must be followed by a blank line. - for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): - print(' *{}{}'.format(' \t' if line else '', line)) - - if (helper.ret): - print(' *') - print(' * Returns') - for line in helper.ret.rstrip().split('\n'): - print(' *{}{}'.format(' \t' if line else '', line)) - - print(' */') - print('static %s %s(*%s)(' % (self.map_type(proto['ret_type']), - proto['ret_star'], proto['name']), end='') - comma = '' - for i, a in enumerate(proto['args']): - t = a['type'] - n = a['name'] - if proto['name'] in self.overloaded_helpers and i == 0: - t = 'void' - n = 'ctx' - one_arg = '{}{}'.format(comma, self.map_type(t)) - if n: - if a['star']: - one_arg += ' {}'.format(a['star']) - else: - one_arg += ' ' - one_arg += '{}'.format(n) - comma = ', ' - print(one_arg, end='') - - print(') = (void *) %d;' % len(self.seen_helpers)) - print('') - -############################################################################### - -# If script is launched from scripts/ from kernel tree and can access -# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse, -# otherwise the --filename argument will be required from the command line. -script = os.path.abspath(sys.argv[0]) -linuxRoot = os.path.dirname(os.path.dirname(script)) -bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') - -argParser = argparse.ArgumentParser(description=""" -Parse eBPF header file and generate documentation for eBPF helper functions. -The RST-formatted output produced can be turned into a manual page with the -rst2man utility. -""") -argParser.add_argument('--header', action='store_true', - help='generate C header file') -if (os.path.isfile(bpfh)): - argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h', - default=bpfh) -else: - argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h') -args = argParser.parse_args() - -# Parse file. -headerParser = HeaderParser(args.filename) -headerParser.run() - -# Print formatted output to standard output. -if args.header: - printer = PrinterHelpers(headerParser.helpers) -else: - printer = PrinterRST(headerParser.helpers) -printer.print_all() diff --git a/tools/bpf/Makefile.helpers b/tools/bpf/Makefile.helpers index 854d084026dd..a26599022fd6 100644 --- a/tools/bpf/Makefile.helpers +++ b/tools/bpf/Makefile.helpers @@ -35,7 +35,7 @@ man7: $(DOC_MAN7) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) $(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h - $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_helpers_doc.py --filename $< > $@ + $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_doc.py --filename $< > $@ $(OUTPUT)%.7: $(OUTPUT)%.rst ifndef RST2MAN_DEP diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b89af20cfa19..b4c5c529ad17 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -729,7 +729,7 @@ union bpf_attr { * parsed and used to produce a manual page. 
The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 887a494ad5fc..8170f88e8ea6 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -158,7 +158,7 @@ $(BPF_IN_STATIC): force $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h - $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \ + $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \ --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 5d7b947320fb..f05c4d48fd7e 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -20,4 +20,4 @@ tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c tools/lib/zalloc.c -scripts/bpf_helpers_doc.py +scripts/bpf_doc.py -- cgit v1.2.3 From 7c32e8f8bc33a5f4b113a630857e46634e3e143b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:13 +0000 Subject: bpf: Add PROG_TEST_RUN support for sk_lookup programs Allow to pass sk_lookup programs to PROG_TEST_RUN. User space provides the full bpf_sk_lookup struct as context. Since the context includes a socket pointer that can't be exposed to user space we define that PROG_TEST_RUN returns the cookie of the selected socket or zero in place of the socket pointer. We don't support testing programs that select a reuseport socket, since this would mean running another (unrelated) BPF program from the sk_lookup test handler. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-3-lmb@cloudflare.com --- include/linux/bpf.h | 10 ++++ include/uapi/linux/bpf.h | 5 +- net/bpf/test_run.c | 105 +++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 1 + tools/include/uapi/linux/bpf.h | 5 +- 5 files changed, 124 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c730863fa77..c931bc97019d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1491,6 +1491,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1692,6 +1695,13 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, return -ENOTSUPP; } +static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
*/ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index eb3c78cd4d7c..0abdd67f44b1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,8 +10,10 @@ #include #include #include +#include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -781,3 +783,106 @@ out: kfree(data); return ret; } + +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_prog_array *progs = NULL; + struct bpf_sk_lookup_kern ctx = {}; + u32 repeat = kattr->test.repeat; + struct bpf_sk_lookup *user_ctx; + u32 retval, duration; + int ret = -EINVAL; + + if (prog->type != BPF_PROG_TYPE_SK_LOOKUP) + return -EINVAL; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || + kattr->test.data_size_out) + return -EINVAL; + + if (!repeat) + repeat = 1; + + user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); + if (IS_ERR(user_ctx)) + return PTR_ERR(user_ctx); + + if (!user_ctx) + return -EINVAL; + + if (user_ctx->sk) + goto out; + + if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) + goto out; + + if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + ret = -ERANGE; + goto out; + } + + ctx.family = (u16)user_ctx->family; + ctx.protocol = (u16)user_ctx->protocol; + ctx.dport = (u16)user_ctx->local_port; + ctx.sport = (__force __be16)user_ctx->remote_port; + + switch (ctx.family) { + case AF_INET: + ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; + ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; + break; + +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; + ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; + break; +#endif + + default: + ret = -EAFNOSUPPORT; + goto out; + } + + progs = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!progs) { + ret = -ENOMEM; + goto out; + } + + progs->items[0].prog = prog; + + bpf_test_timer_enter(&t); + do { + ctx.selected_sk = NULL; + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); + + if (ret < 0) + goto out; + + user_ctx->cookie = 0; + if (ctx.selected_sk) { + if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { + ret = -EOPNOTSUPP; + goto out; + } + + user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); + } + + ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); + +out: + bpf_prog_array_free(progs); + kfree(user_ctx); + return ret; +} diff --git a/net/core/filter.c b/net/core/filter.c index 13bcf248ee7b..a526db494c62 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10457,6 +10457,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, } const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { diff --git a/tools/include/uapi/linux/bpf.h 
b/tools/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ -- cgit v1.2.3 From d01b59c9ae94560fbcceaafeef39784d72765033 Mon Sep 17 00:00:00 2001 From: Xuesen Huang Date: Thu, 4 Mar 2021 14:40:46 +0800 Subject: bpf: Add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets encapsulation. But that is not appropriate when pushing Ethernet header. Add an option to further specify encap L2 type and set the inner_protocol as ETH_P_TEB. Suggested-by: Willem de Bruijn Signed-off-by: Xuesen Huang Signed-off-by: Zhiyong Cheng Signed-off-by: Li Wang Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210304064046.6232-1-hxseverything@gmail.com --- include/uapi/linux/bpf.h | 5 +++++ net/core/filter.c | 11 ++++++++++- tools/include/uapi/linux/bpf.h | 5 +++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index a526db494c62..588b19ba0da8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3409,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK)) @@ -3445,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + if (skb->encapsulation) return -EALREADY; @@ -3463,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; - skb_set_inner_protocol(skb, skb->protocol); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { -- cgit v1.2.3 From e56763ee50a3f28d2f70355ca43fb78d8539a183 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 10 Mar 2021 13:02:42 +0100 Subject: FDDI: if_fddi.h: Update my e-mail address Following the recent update to MAINTAINERS update my e-mail address. Signed-off-by: Maciej W. Rozycki Signed-off-by: David S. Miller --- include/uapi/linux/if_fddi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_fddi.h b/include/uapi/linux/if_fddi.h index 7239aa9c0766..8df2d9934bcd 100644 --- a/include/uapi/linux/if_fddi.h +++ b/include/uapi/linux/if_fddi.h @@ -9,7 +9,7 @@ * Version: @(#)if_fddi.h 1.0.3 Oct 6 2018 * * Author: Lawrence V. Stefani, - * Maintainer: Maciej W. Rozycki, + * Maintainer: Maciej W. Rozycki, * * if_fddi.h is based on previous if_ether.h and if_tr.h work by * Fred N. van Kempen, -- cgit v1.2.3 From 710ec5622306de8c071637ee41ddf4c9bd17e75a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 11 Mar 2021 19:03:15 +0100 Subject: nexthop: Add netlink defines and enumerators for resilient NH groups - RTM_NEWNEXTHOP et.al. 
that handle resilient groups will have a new nested attribute, NHA_RES_GROUP, whose elements are attributes NHA_RES_GROUP_*. - RTM_NEWNEXTHOPBUCKET et.al. is a suite of new messages that will currently serve only for dumping of individual buckets of resilient next hop groups. For nexthop group buckets, these messages will carry a nested attribute NHA_RES_BUCKET, whose elements are attributes NHA_RES_BUCKET_*. There are several reasons why a new suite of messages is created for nexthop buckets instead of overloading the information on the existing RTM_{NEW,DEL,GET}NEXTHOP messages. First, a nexthop group can contain a large number of nexthop buckets (4k is not unheard of). This imposes limits on the amount of information that can be encoded for each nexthop bucket given a netlink message is limited to 64k bytes. Second, while RTM_NEWNEXTHOPBUCKET is only used for notifications at this point, in the future it can be extended to provide user space with control over nexthop buckets configuration. - The new group type is NEXTHOP_GRP_TYPE_RES. Note that nexthop code is adjusted to bounce groups with that type for now. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- include/uapi/linux/nexthop.h | 47 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/rtnetlink.h | 7 +++++++ net/ipv4/nexthop.c | 2 ++ security/selinux/nlmsgtab.c | 5 ++++- 4 files changed, 59 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h index 2d4a1e784cf0..d8ffa8c9ca78 100644 --- a/include/uapi/linux/nexthop.h +++ b/include/uapi/linux/nexthop.h @@ -21,7 +21,10 @@ struct nexthop_grp { }; enum { - NEXTHOP_GRP_TYPE_MPATH, /* default type if not specified */ + NEXTHOP_GRP_TYPE_MPATH, /* hash-threshold nexthop group + * default type if not specified + */ + NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */ __NEXTHOP_GRP_TYPE_MAX, }; @@ -52,8 +55,50 @@ enum { NHA_FDB, /* flag; nexthop belongs to a bridge fdb */ /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */ + /* nested; resilient nexthop group attributes */ + NHA_RES_GROUP, + /* nested; nexthop bucket attributes */ + NHA_RES_BUCKET, + __NHA_MAX, }; #define NHA_MAX (__NHA_MAX - 1) + +enum { + NHA_RES_GROUP_UNSPEC, + /* Pad attribute for 64-bit alignment. */ + NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC, + + /* u16; number of nexthop buckets in a resilient nexthop group */ + NHA_RES_GROUP_BUCKETS, + /* clock_t as u32; nexthop bucket idle timer (per-group) */ + NHA_RES_GROUP_IDLE_TIMER, + /* clock_t as u32; nexthop unbalanced timer */ + NHA_RES_GROUP_UNBALANCED_TIMER, + /* clock_t as u64; nexthop unbalanced time */ + NHA_RES_GROUP_UNBALANCED_TIME, + + __NHA_RES_GROUP_MAX, +}; + +#define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1) + +enum { + NHA_RES_BUCKET_UNSPEC, + /* Pad attribute for 64-bit alignment. 
*/ + NHA_RES_BUCKET_PAD = NHA_RES_BUCKET_UNSPEC, + + /* u16; nexthop bucket index */ + NHA_RES_BUCKET_INDEX, + /* clock_t as u64; nexthop bucket idle time */ + NHA_RES_BUCKET_IDLE_TIME, + /* u32; nexthop id assigned to the nexthop bucket */ + NHA_RES_BUCKET_NH_ID, + + __NHA_RES_BUCKET_MAX, +}; + +#define NHA_RES_BUCKET_MAX (__NHA_RES_BUCKET_MAX - 1) + #endif diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 91e4ca064d61..d35953bc7d53 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -178,6 +178,13 @@ enum { RTM_GETVLAN, #define RTM_GETVLAN RTM_GETVLAN + RTM_NEWNEXTHOPBUCKET = 116, +#define RTM_NEWNEXTHOPBUCKET RTM_NEWNEXTHOPBUCKET + RTM_DELNEXTHOPBUCKET, +#define RTM_DELNEXTHOPBUCKET RTM_DELNEXTHOPBUCKET + RTM_GETNEXTHOPBUCKET, +#define RTM_GETNEXTHOPBUCKET RTM_GETNEXTHOPBUCKET + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 56c54d0fbacc..7a94591da856 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1492,6 +1492,8 @@ static struct nexthop *nexthop_create_group(struct net *net, if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->mpath = 1; nhg->is_multipath = true; + } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { + goto out_no_nh; } WARN_ON_ONCE(nhg->mpath != 1); diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index b69231918686..d59276f48d4f 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -88,6 +88,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_NEWVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETVLAN, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_GETNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ -171,7 +174,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWVLAN + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOPBUCKET + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- cgit v1.2.3 From 2ffe0395288aa237ff7e0143366bd1cd57bfc5b7 Mon Sep 17 00:00:00 2001 From: Baowen Zheng Date: Fri, 12 Mar 2021 15:08:31 +0100 Subject: net/sched: act_police: add support for packet-per-second policing Allow a policer action to enforce a rate-limit based on packets-per-second, configurable using a packet-per-second rate and burst parameters. e.g. tc filter add dev tap1 parent ffff: u32 match \ u32 0 0 police pkts_rate 3000 pkts_burst 1000 Testing was unable to uncover a performance impact of this change on existing features. Signed-off-by: Baowen Zheng Signed-off-by: Simon Horman Signed-off-by: Louis Peens Signed-off-by: David S. 
Miller --- include/net/sch_generic.h | 14 ++++++++ include/net/tc_act/tc_police.h | 48 ++++++++++++++++++++++++--- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/act_police.c | 59 +++++++++++++++++++++++++++++---- net/sched/sch_generic.c | 75 ++++++++++++++++++++++++++++-------------- 5 files changed, 162 insertions(+), 36 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2d6eb60c58c8..f7a6e14491fb 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1242,6 +1242,20 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } +struct psched_pktrate { + u64 rate_pkts_ps; /* packets per second */ + u32 mult; + u8 shift; +}; + +static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r, + unsigned int pkt_num) +{ + return ((u64)pkt_num * r->mult) >> r->shift; +} + +void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); + /* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. * The fast path only needs to access filter list and to update stats */ diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index ae117f7937d5..72649512dcdd 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -10,10 +10,13 @@ struct tcf_police_params { s64 tcfp_burst; u32 tcfp_mtu; s64 tcfp_mtu_ptoks; + s64 tcfp_pkt_burst; struct psched_ratecfg rate; bool rate_present; struct psched_ratecfg peak; bool peak_present; + struct psched_pktrate ppsrate; + bool pps_present; struct rcu_head rcu; }; @@ -24,6 +27,7 @@ struct tcf_police { spinlock_t tcfp_lock ____cacheline_aligned_in_smp; s64 tcfp_toks; s64 tcfp_ptoks; + s64 tcfp_pkttoks; s64 tcfp_t_c; }; @@ -99,14 +103,50 @@ static inline u32 tcf_police_burst(const struct tc_action *act) static inline u64 tcf_police_rate_pkt_ps(const struct tc_action *act) { - /* Not implemented */ - return 0; + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->ppsrate.rate_pkts_ps; } static inline u32 tcf_police_burst_pkt(const struct tc_action *act) { - /* Not implemented */ - return 0; + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + u32 burst; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + + /* + * "rate" pkts "burst" nanoseconds + * ------------ * ------------------- + * 1 second 2^6 ticks + * + * ------------------------------------ + * NSEC_PER_SEC nanoseconds + * ------------------------ + * 2^6 ticks + * + * "rate" pkts "burst" nanoseconds 2^6 ticks + * = ------------ * ------------------- * ------------------------ + * 1 second 2^6 ticks NSEC_PER_SEC nanoseconds + * + * "rate" * "burst" + * = ---------------- pkts/nanosecond + * NSEC_PER_SEC^2 + * + * + * "rate" * "burst" + * = ---------------- pkts/second + * NSEC_PER_SEC + */ + burst = div_u64(params->tcfp_pkt_burst * params->ppsrate.rate_pkts_ps, + NSEC_PER_SEC); + + return burst; } static inline u32 tcf_police_tcfp_mtu(const struct tc_action *act) diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7ea59cfe1fa7..025c40fef93d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -190,6 +190,8 @@ enum { TCA_POLICE_PAD, TCA_POLICE_RATE64, TCA_POLICE_PEAKRATE64, + TCA_POLICE_PKTRATE64, + TCA_POLICE_PKTBURST64, 
__TCA_POLICE_MAX #define TCA_POLICE_RESULT TCA_POLICE_RESULT }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8d8452b1cdd4..0fab8de176d2 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -42,6 +42,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, [TCA_POLICE_RATE64] = { .type = NLA_U64 }, [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 }, + [TCA_POLICE_PKTRATE64] = { .type = NLA_U64, .min = 1 }, + [TCA_POLICE_PKTBURST64] = { .type = NLA_U64, .min = 1 }, }; static int tcf_police_init(struct net *net, struct nlattr *nla, @@ -61,6 +63,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, bool exists = false; u32 index; u64 rate64, prate64; + u64 pps, ppsburst; if (nla == NULL) return -EINVAL; @@ -142,6 +145,21 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } } + if ((tb[TCA_POLICE_PKTRATE64] && !tb[TCA_POLICE_PKTBURST64]) || + (!tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64])) { + NL_SET_ERR_MSG(extack, + "Both or neither packet-per-second burst and rate must be provided"); + err = -EINVAL; + goto failure; + } + + if (tb[TCA_POLICE_PKTRATE64] && R_tab) { + NL_SET_ERR_MSG(extack, + "packet-per-second and byte-per-second rate limits not allowed in same action"); + err = -EINVAL; + goto failure; + } + new = kzalloc(sizeof(*new), GFP_KERNEL); if (unlikely(!new)) { err = -ENOMEM; @@ -183,6 +201,14 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + if (tb[TCA_POLICE_PKTRATE64]) { + pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]); + ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]); + new->pps_present = true; + new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst); + psched_ppscfg_precompute(&new->ppsrate, pps); + } + spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); @@ -217,8 +243,8 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = to_police(a); + s64 now, toks, ppstoks = 0, ptoks = 0; struct tcf_police_params *p; - s64 now, toks, ptoks = 0; int ret; tcf_lastuse_update(&police->tcf_tm); @@ -236,7 +262,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, } if (qdisc_pkt_len(skb) <= p->tcfp_mtu) { - if (!p->rate_present) { + if (!p->rate_present && !p->pps_present) { ret = p->tcfp_result; goto end; } @@ -251,14 +277,23 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, ptoks -= (s64)psched_l2t_ns(&p->peak, qdisc_pkt_len(skb)); } - toks += police->tcfp_toks; - if (toks > p->tcfp_burst) - toks = p->tcfp_burst; - toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); - if ((toks|ptoks) >= 0) { + if (p->rate_present) { + toks += police->tcfp_toks; + if (toks > p->tcfp_burst) + toks = p->tcfp_burst; + toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); + } else if (p->pps_present) { + ppstoks = min_t(s64, now - police->tcfp_t_c, p->tcfp_pkt_burst); + ppstoks += police->tcfp_pkttoks; + if (ppstoks > p->tcfp_pkt_burst) + ppstoks = p->tcfp_pkt_burst; + ppstoks -= (s64)psched_pkt2t_ns(&p->ppsrate, 1); + } + if ((toks | ptoks | ppstoks) >= 0) { police->tcfp_t_c = now; police->tcfp_toks = toks; police->tcfp_ptoks = ptoks; + police->tcfp_pkttoks = ppstoks; spin_unlock_bh(&police->tcfp_lock); ret = p->tcfp_result; goto inc_drops; @@ -331,6 +366,16 @@ static int tcf_police_dump(struct 
sk_buff *skb, struct tc_action *a, TCA_POLICE_PAD)) goto nla_put_failure; } + if (p->pps_present) { + if (nla_put_u64_64bit(skb, TCA_POLICE_PKTRATE64, + police->params->ppsrate.rate_pkts_ps, + TCA_POLICE_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(skb, TCA_POLICE_PKTBURST64, + PSCHED_NS2TICKS(p->tcfp_pkt_burst), + TCA_POLICE_PAD)) + goto nla_put_failure; + } if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (p->tcfp_result && diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 49eae93d1489..44991ea726fc 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1325,6 +1325,48 @@ void dev_shutdown(struct net_device *dev) WARN_ON(timer_pending(&dev->watchdog_timer)); } +/** + * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division + * @rate: Rate to compute reciprocal division values of + * @mult: Multiplier for reciprocal division + * @shift: Shift for reciprocal division + * + * The multiplier and shift for reciprocal division by rate are stored + * in mult and shift. + * + * The deal here is to replace a divide by a reciprocal one + * in fast path (a reciprocal divide is a multiply and a shift) + * + * Normal formula would be : + * time_in_ns = (NSEC_PER_SEC * len) / rate_bps + * + * We compute mult/shift to use instead : + * time_in_ns = (len * mult) >> shift; + * + * We try to get the highest possible mult value for accuracy, + * but have to make sure no overflows will ever happen. + * + * reciprocal_value() is not used here it doesn't handle 64-bit values. + */ +static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift) +{ + u64 factor = NSEC_PER_SEC; + + *mult = 1; + *shift = 0; + + if (rate <= 0) + return; + + for (;;) { + *mult = div64_u64(factor, rate); + if (*mult & (1U << 31) || factor & (1ULL << 63)) + break; + factor <<= 1; + (*shift)++; + } +} + void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64) @@ -1333,34 +1375,17 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, r->overhead = conf->overhead; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); - r->mult = 1; - /* - * The deal here is to replace a divide by a reciprocal one - * in fast path (a reciprocal divide is a multiply and a shift) - * - * Normal formula would be : - * time_in_ns = (NSEC_PER_SEC * len) / rate_bps - * - * We compute mult/shift to use instead : - * time_in_ns = (len * mult) >> shift; - * - * We try to get the highest possible mult value for accuracy, - * but have to make sure no overflows will ever happen. 
- */ - if (r->rate_bytes_ps > 0) { - u64 factor = NSEC_PER_SEC; - - for (;;) { - r->mult = div64_u64(factor, r->rate_bytes_ps); - if (r->mult & (1U << 31) || factor & (1ULL << 63)) - break; - factor <<= 1; - r->shift++; - } - } + psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ratecfg_precompute); +void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) +{ + r->rate_pkts_ps = pktrate64; + psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift); +} +EXPORT_SYMBOL(psched_ppscfg_precompute); + static void mini_qdisc_rcu_func(struct rcu_head *head) { } -- cgit v1.2.3 From 07e1a5809b595df6e125504dff6245cb2c8ed3de Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 14 Mar 2021 14:19:31 +0200 Subject: psample: Add additional metadata attributes Extend psample to report the following attributes when available: * Output traffic class as a 16-bit value * Output traffic class occupancy in bytes as a 64-bit value * End-to-end latency of the packet in nanoseconds resolution * Software timestamp in nanoseconds resolution (always available) * Packet's protocol. Needed for packet dissection in user space (always available) Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/psample.h | 7 +++++++ include/uapi/linux/psample.h | 7 +++++++ net/psample/psample.c | 39 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/psample.h b/include/net/psample.h index ac6dbfb3870d..e328c5127757 100644 --- a/include/net/psample.h +++ b/include/net/psample.h @@ -18,6 +18,13 @@ struct psample_metadata { u32 trunc_size; int in_ifindex; int out_ifindex; + u16 out_tc; + u64 out_tc_occ; /* bytes */ + u64 latency; /* nanoseconds */ + u8 out_tc_valid:1, + out_tc_occ_valid:1, + latency_valid:1, + unused:5; }; struct psample_group *psample_group_get(struct net *net, u32 group_num); diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h index aea26ab1431c..c6329f71b939 100644 --- a/include/uapi/linux/psample.h +++ b/include/uapi/linux/psample.h @@ -16,6 +16,13 @@ enum { /* commands attributes */ PSAMPLE_ATTR_GROUP_REFCOUNT, + PSAMPLE_ATTR_PAD, + PSAMPLE_ATTR_OUT_TC, /* u16 */ + PSAMPLE_ATTR_OUT_TC_OCC, /* u64, bytes */ + PSAMPLE_ATTR_LATENCY, /* u64, nanoseconds */ + PSAMPLE_ATTR_TIMESTAMP, /* u64, nanoseconds */ + PSAMPLE_ATTR_PROTO, /* u16 */ + __PSAMPLE_ATTR_MAX }; diff --git a/net/psample/psample.c b/net/psample/psample.c index 065bc887d239..118d5d2a81a0 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -358,6 +359,7 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, u32 sample_rate, const struct psample_metadata *md) { + ktime_t tstamp = ktime_get_real(); int out_ifindex = md->out_ifindex; int in_ifindex = md->in_ifindex; u32 trunc_size = md->trunc_size; @@ -372,10 +374,15 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + (out_ifindex ? nla_total_size(sizeof(u16)) : 0) + + (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) + + (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + + (md->latency_valid ? 
nla_total_size_64bit(sizeof(u64)) : 0) + nla_total_size(sizeof(u32)) + /* sample_rate */ nla_total_size(sizeof(u32)) + /* orig_size */ nla_total_size(sizeof(u32)) + /* group_num */ - nla_total_size(sizeof(u32)); /* seq */ + nla_total_size(sizeof(u32)) + /* seq */ + nla_total_size_64bit(sizeof(u64)) + /* timestamp */ + nla_total_size(sizeof(u16)); /* protocol */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); @@ -425,6 +432,36 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, if (unlikely(ret < 0)) goto error; + if (md->out_tc_valid) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc); + if (unlikely(ret < 0)) + goto error; + } + + if (md->out_tc_occ_valid) { + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC, + md->out_tc_occ, PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + } + + if (md->latency_valid) { + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY, + md->latency, PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + } + + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP, + ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO, + be16_to_cpu(skb->protocol)); + if (unlikely(ret < 0)) + goto error; + if (data_len) { int nla_len = nla_total_size(data_len); struct nlattr *nla; -- cgit v1.2.3 From 3c85a8b81cc89c712c4f44cb1a4d5547e472fb1f Mon Sep 17 00:00:00 2001 From: Cooper Lees Date: Tue, 23 Mar 2021 20:47:38 -0700 Subject: Add Open Routing Protocol ID to `rtnetlink.h` - The Open Routing (Open/R) network protocol netlink handler uses ID 99 - Will also add to `/etc/iproute2/rt_protos` once this is accepted - For more information: https://github.com/facebook/openr Signed-off-by: From: Cooper Lees Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index d35953bc7d53..5888492a5257 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -290,6 +290,7 @@ enum { #define RTPROT_MROUTED 17 /* Multicast daemon */ #define RTPROT_KEEPALIVED 18 /* Keepalived daemon */ #define RTPROT_BABEL 42 /* Babel daemon */ +#define RTPROT_OPENR 99 /* Open Routing (Open/R) Routes */ #define RTPROT_BGP 186 /* BGP Routes */ #define RTPROT_ISIS 187 /* ISIS Routes */ #define RTPROT_OSPF 188 /* OSPF Routes */ -- cgit v1.2.3 From ed3038158e7b58dcf966cb7ec4ae98d152a5b794 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:11:55 -0700 Subject: ethtool: fec: fix typo in kdoc s/porte/the port/ Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index cde753bb2093..1433d6278018 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1378,7 +1378,7 @@ struct ethtool_per_queue_op { /** * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM - * @active_fec: FEC mode which is active on porte + * @active_fec: FEC mode which is active on the port * @fec: Bitmask of supported/configured FEC modes * @rsvd: Reserved for future extensions. i.e FEC bypass feature. 
* -- cgit v1.2.3 From 408386817a9d32c88c9ac528749e9999d0e3f6a1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:11:56 -0700 Subject: ethtool: fec: remove long structure description Digging through the mailing list archive shows that @autoneg was part of the first version of the RFC; this leftover comment was pointed out twice in review but wasn't removed. The sentence is an exact copy-paste from pauseparam. Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 1433d6278018..36bf435d232c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1381,10 +1381,6 @@ struct ethtool_per_queue_op { * @active_fec: FEC mode which is active on the port * @fec: Bitmask of supported/configured FEC modes * @rsvd: Reserved for future extensions. i.e FEC bypass feature. - * - * Drivers should reject a non-zero setting of @autoneg when - * autoneogotiation is disabled (or not supported) for the link. - * */ struct ethtool_fecparam { __u32 cmd; -- cgit v1.2.3 From 240e114411e74d2ee8121643e0c67717eb7c6982 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:11:57 -0700 Subject: ethtool: fec: sanitize ethtool_fecparam->reserved struct ethtool_fecparam::reserved is never looked at by the core. Make sure it's actually 0. Unfortunately we can't return an error because old ethtool doesn't zero-initialize the structure for SET. On GET we can be more verbose: there are no in-tree (ab)users. Fix up the kdoc on the structure. Remove the mention of FEC bypass. Seems like a niche thing to configure in the first place. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 2 +- net/ethtool/ioctl.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 36bf435d232c..9e2682a67460 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1380,7 +1380,7 @@ struct ethtool_per_queue_op { * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM * @active_fec: FEC mode which is active on the port * @fec: Bitmask of supported/configured FEC modes - * @rsvd: Reserved for future extensions. i.e FEC bypass feature. + * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET.
*/ struct ethtool_fecparam { __u32 cmd; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 0788cc3b3114..be3549023d89 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2568,6 +2568,9 @@ static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) if (rc) return rc; + if (WARN_ON_ONCE(fecparam.reserved)) + fecparam.reserved = 0; + if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) return -EFAULT; return 0; @@ -2583,6 +2586,8 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) return -EFAULT; + fecparam.reserved = 0; + return dev->ethtool_ops->set_fecparam(dev, &fecparam); } -- cgit v1.2.3 From d3b37fc805d9ef697451730ebdfc7e35e6c2ace8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:11:58 -0700 Subject: ethtool: fec: sanitize ethtool_fecparam->active_fec struct ethtool_fecparam::active_fec is a GET-only field, all in-tree drivers correctly ignore it on SET. Clear the field on SET to avoid any confusion. Again, we can't reject non-zero now since ethtool user space does not zero-init the param correctly. Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 2 +- net/ethtool/ioctl.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9e2682a67460..517b68c5fcec 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1378,7 +1378,7 @@ struct ethtool_per_queue_op { /** * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM - * @active_fec: FEC mode which is active on the port + * @active_fec: FEC mode which is active on the port, GET only. * @fec: Bitmask of supported/configured FEC modes * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index be3549023d89..237ffe5440ef 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2586,6 +2586,7 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) return -EFAULT; + fecparam.active_fec = 0; fecparam.reserved = 0; return dev->ethtool_ops->set_fecparam(dev, &fecparam); -- cgit v1.2.3 From 6dbf94b264e62641d521975d0eddbeef36bacf3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:12:00 -0700 Subject: ethtool: clarify the ethtool FEC interface The definition of the FEC driver interface is quite unclear. Improve the documentation. 
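For orientation, user space drives this interface through the SIOCETHTOOL ioctl; a minimal sketch of the GET/SET flow being documented (not taken from this patch; the interface name, the RS selection and the absent error handling are illustrative only):

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <linux/ethtool.h>
	#include <linux/sockios.h>

	int main(void)
	{
		struct ethtool_fecparam fec = { .cmd = ETHTOOL_GFECPARAM };
		struct ifreq ifr = { 0 };
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
		ifr.ifr_data = (void *)&fec;

		/* GET: read the configured bitmask and the single active mode */
		if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
			printf("fec 0x%x active_fec 0x%x\n", fec.fec, fec.active_fec);

		/* SET: request one mode; active_fec and reserved are ignored here */
		fec.cmd = ETHTOOL_SFECPARAM;
		fec.fec = ETHTOOL_FEC_RS;
		ioctl(fd, SIOCETHTOOL, &ifr);

		close(fd);
		return 0;
	}

The kdoc updated below spells out which of these fields are meaningful on GET versus SET.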
This is based on current driver and user space code, as well as the discussions about the interface: RFC v1 (24 Oct 2016): https://lore.kernel.org/netdev/1477363849-36517-1-git-send-email-vidya@cumulusnetworks.com/ - this version has the autoneg field - no active_fec field - none vs off confusion is already present RFC v2 (10 Feb 2017): https://lore.kernel.org/netdev/1486727004-11316-1-git-send-email-vidya@cumulusnetworks.com/ - autoneg removed - active_fec added v1 (10 Feb 2017): https://lore.kernel.org/netdev/1486751311-42019-1-git-send-email-vidya@cumulusnetworks.com/ - no changes in the code v1 (24 Jun 2017): https://lore.kernel.org/netdev/1498331985-8525-1-git-send-email-roopa@cumulusnetworks.com/ - include in tree user v2 (27 Jul 2017): https://lore.kernel.org/netdev/1501199248-24695-1-git-send-email-roopa@cumulusnetworks.com/ Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 517b68c5fcec..f6ef7d42c7a1 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1376,11 +1376,29 @@ struct ethtool_per_queue_op { }; /** - * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters + * struct ethtool_fecparam - Ethernet Forward Error Correction parameters * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM - * @active_fec: FEC mode which is active on the port, GET only. - * @fec: Bitmask of supported/configured FEC modes + * @active_fec: FEC mode which is active on the port, single bit set, GET only. + * @fec: Bitmask of configured FEC modes. * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. + * + * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. + * FEC settings are configured by link autonegotiation whenever it's enabled. + * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. + * + * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings. + * It is recommended that drivers only accept a single bit set in @fec. + * When multiple bits are set in @fec drivers may pick mode in an implementation + * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other + * FEC modes, because it's unclear whether in this case other modes constrain + * AUTO or are independent choices. + * Drivers must reject SET requests if they support none of the requested modes. + * + * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead + * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM. + * + * See enum ethtool_fec_config_bits for definition of valid bits for both + * @fec and @active_fec. */ struct ethtool_fecparam { __u32 cmd; @@ -1392,11 +1410,16 @@ struct ethtool_fecparam { /** * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration - * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported - * @ETHTOOL_FEC_AUTO: Default/Best FEC mode provided by driver + * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported. Should not + * be used together with other bits. GET only. + * @ETHTOOL_FEC_AUTO: Select default/best FEC mode automatically, usually based + * link mode and SFP parameters read from module's EEPROM. + * This bit does _not_ mean autonegotiation. 
* @ETHTOOL_FEC_OFF: No FEC Mode - * @ETHTOOL_FEC_RS: Reed-Solomon Forward Error Detection mode - * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon Forward Error Detection mode + * @ETHTOOL_FEC_RS: Reed-Solomon FEC Mode + * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon FEC Mode + * @ETHTOOL_FEC_LLRS: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet + * Consortium) */ enum ethtool_fec_config_bits { ETHTOOL_FEC_NONE_BIT, -- cgit v1.2.3 From ad1cd7856d870e5861ef80fbf3e4b0d68bb82a69 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 26 Mar 2021 13:22:21 -0700 Subject: ethtool: fec: add note about reuse of reserved struct ethtool_fecparam::reserved can't be used in SET, because ethtool user space doesn't zero-initialize the structure. Make this clear. Suggested-by: Andrew Lunn Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f6ef7d42c7a1..9a47c3efd8ca 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1382,6 +1382,10 @@ struct ethtool_per_queue_op { * @fec: Bitmask of configured FEC modes. * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. * + * Note that @reserved was never validated on input and ethtool user space + * left it uninitialized when calling SET. Hence going forward it can only be + * used to return a value to userspace with GET. + * * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. * FEC settings are configured by link autonegotiation whenever it's enabled. * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. -- cgit v1.2.3 From d04feecaf1543e538e856166e494daebe808d1fe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 26 Mar 2021 13:22:23 -0700 Subject: ethtool: document the enum values not defines kdoc does not have good support for documenting defines, and we can't abuse the enum documentation because it generates warnings. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9a47c3efd8ca..868b513d4f54 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1414,16 +1414,16 @@ struct ethtool_fecparam { /** * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration - * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported. Should not - * be used together with other bits. GET only. - * @ETHTOOL_FEC_AUTO: Select default/best FEC mode automatically, usually based - * link mode and SFP parameters read from module's EEPROM. - * This bit does _not_ mean autonegotiation. - * @ETHTOOL_FEC_OFF: No FEC Mode - * @ETHTOOL_FEC_RS: Reed-Solomon FEC Mode - * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon FEC Mode - * @ETHTOOL_FEC_LLRS: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet - * Consortium) + * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not + * be used together with other bits. GET only. + * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually + * based link mode and SFP parameters read from module's + * EEPROM. This bit does _not_ mean autonegotiation. 
+ * @ETHTOOL_FEC_OFF_BIT: No FEC Mode + * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode + * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode + * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet + * Consortium) */ enum ethtool_fec_config_bits { ETHTOOL_FEC_NONE_BIT, -- cgit v1.2.3 From e6ac2450d6dee3121cd8bbf2907b78a68a8a353d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:51:42 -0700 Subject: bpf: Support bpf program calling kernel function This patch adds support to BPF verifier to allow bpf program calling kernel function directly. The use case included in this set is to allow bpf-tcp-cc to directly call some tcp-cc helper functions (e.g. "tcp_cong_avoid_ai()"). Those functions have already been used by some kernel tcp-cc implementations. This set will also allow the bpf-tcp-cc program to directly call the kernel tcp-cc implementation, For example, a bpf_dctcp may only want to implement its own dctcp_cwnd_event() and reuse other dctcp_*() directly from the kernel tcp_dctcp.c instead of reimplementing (or copy-and-pasting) them. The tcp-cc kernel functions mentioned above will be white listed for the struct_ops bpf-tcp-cc programs to use in a later patch. The white listed functions are not bounded to a fixed ABI contract. Those functions have already been used by the existing kernel tcp-cc. If any of them has changed, both in-tree and out-of-tree kernel tcp-cc implementations have to be changed. The same goes for the struct_ops bpf-tcp-cc programs which have to be adjusted accordingly. This patch is to make the required changes in the bpf verifier. First change is in btf.c, it adds a case in "btf_check_func_arg_match()". When the passed in "btf->kernel_btf == true", it means matching the verifier regs' states with a kernel function. This will handle the PTR_TO_BTF_ID reg. It also maps PTR_TO_SOCK_COMMON, PTR_TO_SOCKET, and PTR_TO_TCP_SOCK to its kernel's btf_id. In the later libbpf patch, the insn calling a kernel function will look like: insn->code == (BPF_JMP | BPF_CALL) insn->src_reg == BPF_PSEUDO_KFUNC_CALL /* <- new in this patch */ insn->imm == func_btf_id /* btf_id of the running kernel */ [ For the future calling function-in-kernel-module support, an array of module btf_fds can be passed at the load time and insn->off can be used to index into this array. ] At the early stage of verifier, the verifier will collect all kernel function calls into "struct bpf_kfunc_desc". Those descriptors are stored in "prog->aux->kfunc_tab" and will be available to the JIT. Since this "add" operation is similar to the current "add_subprog()" and looking for the same insn->code, they are done together in the new "add_subprog_and_kfunc()". In the "do_check()" stage, the new "check_kfunc_call()" is added to verify the kernel function call instruction: 1. Ensure the kernel function can be used by a particular BPF_PROG_TYPE. A new bpf_verifier_ops "check_kfunc_call" is added to do that. The bpf-tcp-cc struct_ops program will implement this function in a later patch. 2. Call "btf_check_kfunc_args_match()" to ensure the regs can be used as the args of a kernel function. 3. Mark the regs' type, subreg_def, and zext_dst. At the later do_misc_fixups() stage, the new fixup_kfunc_call() will replace the insn->imm with the function address (relative to __bpf_call_base). If needed, the jit can find the btf_func_model by calling the new bpf_jit_find_kfunc_model(prog, insn). 
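From the program author's side, a call such as the tcp_cong_avoid_ai() case mentioned above is expected to look roughly like the sketch below once the libbpf and tcp-cc allow-list patches later in this series land (illustrative only; the __ksym extern style, the section name and the sock-to-tcp_sock cast are assumptions based on those follow-ups, not part of this patch):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	/* gpl_compatible is required to call kernel functions */
	char LICENSE[] SEC("license") = "GPL";

	extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym;

	SEC("struct_ops/sample_cong_avoid")
	void BPF_PROG(sample_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
	{
		struct tcp_sock *tp = (struct tcp_sock *)sk;

		/* emitted as BPF_JMP | BPF_CALL with src_reg == BPF_PSEUDO_KFUNC_CALL
		 * and imm == BTF id of tcp_cong_avoid_ai in the running kernel
		 */
		tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
	}

The struct_ops map registration is omitted from the sketch for brevity.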
With the imm set to the function address, "bpftool prog dump xlated" will be able to display the kernel function calls the same way as it displays other bpf helper calls. gpl_compatible program is required to call kernel function. This feature currently requires JIT. The verifier selftests are adjusted because of the changes in the verbose log in add_subprog_and_kfunc(). Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015142.1544736-1-kafai@fb.com --- arch/x86/net/bpf_jit_comp.c | 5 + include/linux/bpf.h | 24 ++ include/linux/btf.h | 1 + include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 4 + kernel/bpf/btf.c | 65 +++- kernel/bpf/core.c | 18 +- kernel/bpf/disasm.c | 13 +- kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 368 +++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 4 + tools/testing/selftests/bpf/verifier/calls.c | 12 +- tools/testing/selftests/bpf/verifier/dead_code.c | 10 +- 13 files changed, 480 insertions(+), 46 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b35fc8023884..9eead60f0301 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2346,3 +2346,8 @@ out: tmp : orig_prog); return prog; } + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaae618a90b5..b5b7967e3ff3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -427,6 +427,7 @@ enum bpf_reg_type { PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ PTR_TO_MAP_KEY, /* reg points to a map element key */ + __BPF_REG_TYPE_MAX, }; /* The information passed from prog-specific *_is_valid_access @@ -480,6 +481,7 @@ struct bpf_verifier_ops { const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); + bool (*check_kfunc_call)(u32 kfunc_btf_id); }; struct bpf_prog_offload_ops { @@ -796,6 +798,8 @@ struct btf_mod_pair { struct module *module; }; +struct bpf_kfunc_desc_tab; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -832,6 +836,7 @@ struct bpf_prog_aux { struct bpf_prog **func; void *jit_data; /* JIT specific data. 
arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; + struct bpf_kfunc_desc_tab *kfunc_tab; u32 size_poke_tab; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; @@ -1547,6 +1552,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); +int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, @@ -1557,6 +1565,10 @@ struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); +bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); +const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1737,6 +1749,18 @@ bpf_base_func_proto(enum bpf_func_id func_id) static inline void bpf_task_storage_free(struct task_struct *task) { } + +static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) +{ + return false; +} + +static inline const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn) +{ + return NULL; +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/include/linux/btf.h b/include/linux/btf.h index 8a05687a4ee2..3bac66e0183a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -110,6 +110,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size); +const char *btf_type_str(const struct btf_type *t); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 0d9c710eb050..eecfd82db648 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -918,6 +918,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); +bool bpf_jit_supports_kfunc_call(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 008edc1dc8c1..598716742593 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1117,6 +1117,10 @@ enum bpf_link_type { * offset to another bpf function */ #define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 /* flags for BPF_MAP_UPDATE_ELEM command */ enum { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3c489adacf3b..ec8afc4bc560 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -283,7 +283,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", }; -static const char *btf_type_str(const struct btf_type *t) +const char *btf_type_str(const struct btf_type *t) { return btf_kind_str[BTF_INFO_KIND(t->info)]; } @@ -5362,6 
+5362,14 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif +}; + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, @@ -5371,12 +5379,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - u32 i, nargs; + u32 i, nargs, ref_id; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { /* These checks were already done by the verifier while loading - * struct bpf_func_info + * struct bpf_func_info or in add_kfunc_call(). */ bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", func_id); @@ -5418,9 +5426,49 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { + if (btf_is_kernel(btf)) { + const struct btf_type *reg_ref_t; + const struct btf *reg_btf; + const char *reg_ref_tname; + u32 reg_ref_id; + + if (!btf_type_is_struct(ref_t)) { + bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", + func_name, i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (reg->type == PTR_TO_BTF_ID) { + reg_btf = reg->btf; + reg_ref_id = reg->btf_id; + } else if (reg2btf_ids[reg->type]) { + reg_btf = btf_vmlinux; + reg_ref_id = *reg2btf_ids[reg->type]; + } else { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", + func_name, i, + btf_type_str(ref_t), ref_tname, regno); + return -EINVAL; + } + + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, + ®_ref_id); + reg_ref_tname = btf_name_by_offset(reg_btf, + reg_ref_t->name_off); + if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, + reg->off, btf, ref_id)) { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", + func_name, i, + btf_type_str(ref_t), ref_tname, + regno, btf_type_str(reg_ref_t), + reg_ref_tname); + return -EINVAL; + } + } else if (btf_get_prog_ctx_type(log, btf, t, + env->prog->type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -5493,6 +5541,13 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return err; } +int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs) +{ + return btf_check_func_arg_match(env, btf, func_id, regs, false); +} + /* Convert BTF of a function into bpf_reg_state if possible * Returns: * EFAULT - there is a verifier bug. Abort verification. 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a35eb3d7b126..f5423251c118 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -159,6 +159,9 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog) kvfree(prog->aux->jited_linfo); prog->aux->jited_linfo = NULL; } + + kfree(prog->aux->kfunc_tab); + prog->aux->kfunc_tab = NULL; } /* The jit engine is responsible to provide an array @@ -1840,9 +1843,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ + bool jit_needed = false; + if (fp->bpf_func) goto finalize; + if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || + bpf_prog_has_kfunc_call(fp)) + jit_needed = true; + bpf_prog_select_func(fp); /* eBPF JITs can rewrite the program in case constant @@ -1858,12 +1867,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) fp = bpf_int_jit_compile(fp); bpf_prog_jit_attempt_done(fp); -#ifdef CONFIG_BPF_JIT_ALWAYS_ON - if (!fp->jited) { + if (!fp->jited && jit_needed) { *err = -ENOTSUPP; return fp; } -#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) @@ -2343,6 +2350,11 @@ bool __weak bpf_jit_needs_zext(void) return false; } +bool __weak bpf_jit_supports_kfunc_call(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 3acc7e0b6916..dad821c8ecd0 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs, { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - if (insn->src_reg != BPF_PSEUDO_CALL && + if (!insn->src_reg && insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && func_id_str[insn->imm]) return func_id_str[insn->imm]; - if (cbs && cbs->cb_call) - return cbs->cb_call(cbs->private_data, insn); + if (cbs && cbs->cb_call) { + const char *res; + + res = cbs->cb_call(cbs->private_data, insn); + if (res) + return res; + } if (insn->src_reg == BPF_PSEUDO_CALL) snprintf(buff, len, "%+d", insn->imm); + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + snprintf(buff, len, "kernel-function"); return buff; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eaf85bf51c5a..9603de81811a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1696,6 +1696,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->btf); kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); + kfree(prog->aux->kfunc_tab); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b31e62daafbd..852541a435ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_KFUNC_CALL; +} + static bool bpf_pseudo_func(const struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && @@ -1554,47 +1560,205 @@ static int add_subprog(struct bpf_verifier_env *env, int off) verbose(env, "too many subprograms\n"); return -E2BIG; } + /* determine subprog starts. 
The end is one before the next starts */ env->subprog_info[env->subprog_cnt++].start = off; sort(env->subprog_info, env->subprog_cnt, sizeof(env->subprog_info[0]), cmp_subprogs, NULL); return env->subprog_cnt - 1; } -static int check_subprogs(struct bpf_verifier_env *env) +struct bpf_kfunc_desc { + struct btf_func_model func_model; + u32 func_id; + s32 imm; +}; + +#define MAX_KFUNC_DESCS 256 +struct bpf_kfunc_desc_tab { + struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS]; + u32 nr_descs; +}; + +static int kfunc_desc_cmp_by_id(const void *a, const void *b) +{ + const struct bpf_kfunc_desc *d0 = a; + const struct bpf_kfunc_desc *d1 = b; + + /* func_id is not greater than BTF_MAX_TYPE */ + return d0->func_id - d1->func_id; +} + +static const struct bpf_kfunc_desc * +find_kfunc_desc(const struct bpf_prog *prog, u32 func_id) +{ + struct bpf_kfunc_desc desc = { + .func_id = func_id, + }; + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + return bsearch(&desc, tab->descs, tab->nr_descs, + sizeof(tab->descs[0]), kfunc_desc_cmp_by_id); +} + +static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id) +{ + const struct btf_type *func, *func_proto; + struct bpf_kfunc_desc_tab *tab; + struct bpf_prog_aux *prog_aux; + struct bpf_kfunc_desc *desc; + const char *func_name; + unsigned long addr; + int err; + + prog_aux = env->prog->aux; + tab = prog_aux->kfunc_tab; + if (!tab) { + if (!btf_vmlinux) { + verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n"); + return -ENOTSUPP; + } + + if (!env->prog->jit_requested) { + verbose(env, "JIT is required for calling kernel function\n"); + return -ENOTSUPP; + } + + if (!bpf_jit_supports_kfunc_call()) { + verbose(env, "JIT does not support calling kernel function\n"); + return -ENOTSUPP; + } + + if (!env->prog->gpl_compatible) { + verbose(env, "cannot call kernel function from non-GPL compatible program\n"); + return -EINVAL; + } + + tab = kzalloc(sizeof(*tab), GFP_KERNEL); + if (!tab) + return -ENOMEM; + prog_aux->kfunc_tab = tab; + } + + if (find_kfunc_desc(env->prog, func_id)) + return 0; + + if (tab->nr_descs == MAX_KFUNC_DESCS) { + verbose(env, "too many different kernel function calls\n"); + return -E2BIG; + } + + func = btf_type_by_id(btf_vmlinux, func_id); + if (!func || !btf_type_is_func(func)) { + verbose(env, "kernel btf_id %u is not a function\n", + func_id); + return -EINVAL; + } + func_proto = btf_type_by_id(btf_vmlinux, func->type); + if (!func_proto || !btf_type_is_func_proto(func_proto)) { + verbose(env, "kernel function btf_id %u does not have a valid func_proto\n", + func_id); + return -EINVAL; + } + + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); + addr = kallsyms_lookup_name(func_name); + if (!addr) { + verbose(env, "cannot find address for kernel function %s\n", + func_name); + return -EINVAL; + } + + desc = &tab->descs[tab->nr_descs++]; + desc->func_id = func_id; + desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base; + err = btf_distill_func_proto(&env->log, btf_vmlinux, + func_proto, func_name, + &desc->func_model); + if (!err) + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_id, NULL); + return err; +} + +static int kfunc_desc_cmp_by_imm(const void *a, const void *b) +{ + const struct bpf_kfunc_desc *d0 = a; + const struct bpf_kfunc_desc *d1 = b; + + if (d0->imm > d1->imm) + return 1; + else if (d0->imm < d1->imm) + return -1; + return 0; +} + +static void sort_kfunc_descs_by_imm(struct bpf_prog *prog) +{ + struct bpf_kfunc_desc_tab *tab; + + 
tab = prog->aux->kfunc_tab; + if (!tab) + return; + + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_imm, NULL); +} + +bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) +{ + return !!prog->aux->kfunc_tab; +} + +const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn) +{ + const struct bpf_kfunc_desc desc = { + .imm = insn->imm, + }; + const struct bpf_kfunc_desc *res; + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + res = bsearch(&desc, tab->descs, tab->nr_descs, + sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm); + + return res ? &res->func_model : NULL; +} + +static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { - int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; + int i, ret, insn_cnt = env->prog->len; /* Add entry function. */ ret = add_subprog(env, 0); - if (ret < 0) + if (ret) return ret; - /* determine subprog starts. The end is one before the next starts */ - for (i = 0; i < insn_cnt; i++) { - if (bpf_pseudo_func(insn + i)) { - if (!env->bpf_capable) { - verbose(env, - "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); - return -EPERM; - } - ret = add_subprog(env, i + insn[i].imm + 1); - if (ret < 0) - return ret; - /* remember subprog */ - insn[i + 1].imm = ret; - continue; - } - if (!bpf_pseudo_call(insn + i)) + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) && + !bpf_pseudo_kfunc_call(insn)) continue; + if (!env->bpf_capable) { - verbose(env, - "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } - ret = add_subprog(env, i + insn[i].imm + 1); + + if (bpf_pseudo_func(insn)) { + ret = add_subprog(env, i + insn->imm + 1); + if (ret >= 0) + /* remember subprog */ + insn[1].imm = ret; + } else if (bpf_pseudo_call(insn)) { + ret = add_subprog(env, i + insn->imm + 1); + } else { + ret = add_kfunc_call(env, insn->imm); + } + if (ret < 0) return ret; } @@ -1608,6 +1772,16 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + /* now check that all jumps are within the same subprog */ subprog_start = subprog[cur_subprog].start; subprog_end = subprog[cur_subprog + 1].start; @@ -1916,6 +2090,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, return i; } +static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) +{ + const struct btf_type *func; + + if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) + return NULL; + + func = btf_type_by_id(btf_vmlinux, insn->imm); + return btf_name_by_offset(btf_vmlinux, func->name_off); +} + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. 
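For illustration, a minimal BPF-C sketch of the call shape this verifier support handles: the program references a kernel function as an extern, the loader encodes the call as BPF_JMP|BPF_CALL with src_reg == BPF_PSEUDO_KFUNC_CALL and the kernel BTF id in insn->imm, and add_kfunc_call()/fixup_kfunc_call() above resolve that id to an offset from __bpf_call_base. The kfunc name below is hypothetical, and the extern plus __ksym declaration style is the usual libbpf convention assumed here rather than taken from this patch; only GPL-compatible, JITed programs whose verifier ops provide check_kfunc_call() can use such calls.

#include <linux/types.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* Hypothetical kernel function; a real caller would use a kfunc that the
 * program type's check_kfunc_call() callback allows.
 */
extern __u64 bpf_example_kfunc(__u64 arg) __ksym;

char LICENSE[] SEC("license") = "GPL";	/* add_kfunc_call() rejects non-GPL programs */

SEC("struct_ops/example_op")
int BPF_PROG(example_op)
{
	/* Emitted as a BPF_PSEUDO_KFUNC_CALL instruction; the verifier later
	 * rewrites imm from the BTF id to an offset relative to __bpf_call_base.
	 */
	return bpf_example_kfunc(42);
}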
@@ -1924,6 +2109,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, u32 *reg_mask, u64 *stack_mask) { const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, .cb_print = verbose, .private_data = env, }; @@ -5960,6 +6146,98 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return 0; } +/* mark_btf_func_reg_size() is used when the reg size is determined by + * the BTF func_proto's return value size and argument. + */ +static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, + size_t reg_size) +{ + struct bpf_reg_state *reg = &cur_regs(env)[regno]; + + if (regno == BPF_REG_0) { + /* Function return value */ + reg->live |= REG_LIVE_WRITTEN; + reg->subreg_def = reg_size == sizeof(u64) ? + DEF_NOT_SUBREG : env->insn_idx + 1; + } else { + /* Function argument */ + if (reg_size == sizeof(u64)) { + mark_insn_zext(env, reg); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); + } else { + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); + } + } +} + +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct btf_type *t, *func, *func_proto, *ptr_type; + struct bpf_reg_state *regs = cur_regs(env); + const char *func_name, *ptr_type_name; + u32 i, nargs, func_id, ptr_type_id; + const struct btf_param *args; + int err; + + func_id = insn->imm; + func = btf_type_by_id(btf_vmlinux, func_id); + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); + func_proto = btf_type_by_id(btf_vmlinux, func->type); + + if (!env->ops->check_kfunc_call || + !env->ops->check_kfunc_call(func_id)) { + verbose(env, "calling kernel function %s is not allowed\n", + func_name); + return -EACCES; + } + + /* Check the arguments */ + err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs); + if (err) + return err; + + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(env, regs, caller_saved[i]); + + /* Check return type */ + t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL); + if (btf_type_is_scalar(t)) { + mark_reg_unknown(env, regs, BPF_REG_0); + mark_btf_func_reg_size(env, BPF_REG_0, t->size); + } else if (btf_type_is_ptr(t)) { + ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type, + &ptr_type_id); + if (!btf_type_is_struct(ptr_type)) { + ptr_type_name = btf_name_by_offset(btf_vmlinux, + ptr_type->name_off); + verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", + func_name, btf_type_str(ptr_type), + ptr_type_name); + return -EINVAL; + } + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].type = PTR_TO_BTF_ID; + regs[BPF_REG_0].btf_id = ptr_type_id; + mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ + + nargs = btf_type_vlen(func_proto); + args = (const struct btf_param *)(func_proto + 1); + for (i = 0; i < nargs; i++) { + u32 regno = i + 1; + + t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL); + if (btf_type_is_ptr(t)) + mark_btf_func_reg_size(env, regno, sizeof(void *)); + else + /* scalar. 
ensured by btf_check_kfunc_arg_match() */ + mark_btf_func_reg_size(env, regno, t->size); + } + + return 0; +} + static bool signed_add_overflows(s64 a, s64 b) { /* Do the add in u64, where overflow is well-defined */ @@ -10162,6 +10440,7 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, .cb_print = verbose, .private_data = env, }; @@ -10309,7 +10588,8 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL) || + insn->src_reg != BPF_PSEUDO_CALL && + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { verbose(env, "BPF_CALL uses reserved fields\n"); @@ -10324,6 +10604,8 @@ static int do_check(struct bpf_verifier_env *env) } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + err = check_kfunc_call(env, insn); else err = check_helper_call(env, insn, &env->insn_idx); if (err) @@ -11634,6 +11916,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->linfo = prog->aux->linfo; func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; @@ -11773,6 +12056,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) #ifndef CONFIG_BPF_JIT_ALWAYS_ON struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; + bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); int i, depth; #endif int err = 0; @@ -11786,6 +12070,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (has_kfunc_call) { + verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); + return -EINVAL; + } if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { /* When JIT fails the progs with bpf2bpf calls and tail_calls * have to be rejected, since interpreter doesn't support them yet. @@ -11814,6 +12102,26 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } +static int fixup_kfunc_call(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + const struct bpf_kfunc_desc *desc; + + /* insn->imm has the btf func_id. Replace it with + * an address (relative to __bpf_base_call). + */ + desc = find_kfunc_desc(env->prog, insn->imm); + if (!desc) { + verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n", + insn->imm); + return -EFAULT; + } + + insn->imm = desc->imm; + + return 0; +} + /* Do various post-verification rewrites in a single program pass. * These rewrites simplify JIT and interpreter implementations. 
*/ @@ -11949,6 +12257,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) continue; if (insn->src_reg == BPF_PSEUDO_CALL) continue; + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + ret = fixup_kfunc_call(env, insn); + if (ret) + return ret; + continue; + } if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; @@ -12178,6 +12492,8 @@ patch_call_imm: } } + sort_kfunc_descs_by_imm(env->prog); + return 0; } @@ -12883,6 +13199,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (!env->explored_states) goto skip_full_check; + ret = add_subprog_and_kfunc(env); + if (ret < 0) + goto skip_full_check; + ret = check_subprogs(env); if (ret < 0) goto skip_full_check; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2d3036e292a9..ab9f2233607c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1117,6 +1117,10 @@ enum bpf_link_type { * offset to another bpf function */ #define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 /* flags for BPF_MAP_UPDATE_ELEM command */ enum { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index eb888c8479c3..336a749673d1 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -136,7 +136,7 @@ { "calls: wrong src reg", .insns = { - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0), BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ -1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 5cf361d8eb1c..17fe33a75034 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling 
other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, -- cgit v1.2.3 From 2b246b2569cd2ac6ff700d0dce56b8bae29b1842 Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Mon, 29 Mar 2021 18:45:15 -0700 Subject: icmp: add support for RFC 8335 PROBE Add definitions for PROBE ICMP types and codes. Add AFI definitions for IP and IPV6 as specified by IANA Add a struct to represent the additional header when probing by IP address (ctype == 3) for use in parsing incoming PROBE messages Add a struct to represent the entire Interface Identification Object (IIO) section of an incoming PROBE packet Signed-off-by: Andreas Roeseler Signed-off-by: David S. 
Miller --- include/uapi/linux/icmp.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h index fb169a50895e..222325d1d80e 100644 --- a/include/uapi/linux/icmp.h +++ b/include/uapi/linux/icmp.h @@ -20,6 +20,9 @@ #include #include +#include +#include +#include #define ICMP_ECHOREPLY 0 /* Echo Reply */ #define ICMP_DEST_UNREACH 3 /* Destination Unreachable */ @@ -66,6 +69,23 @@ #define ICMP_EXC_TTL 0 /* TTL count exceeded */ #define ICMP_EXC_FRAGTIME 1 /* Fragment Reass time exceeded */ +/* Codes for EXT_ECHO (PROBE) */ +#define ICMP_EXT_ECHO 42 +#define ICMP_EXT_ECHOREPLY 43 +#define ICMP_EXT_MAL_QUERY 1 /* Malformed Query */ +#define ICMP_EXT_NO_IF 2 /* No such Interface */ +#define ICMP_EXT_NO_TABLE_ENT 3 /* No such Table Entry */ +#define ICMP_EXT_MULT_IFS 4 /* Multiple Interfaces Satisfy Query */ + +/* Constants for EXT_ECHO (PROBE) */ +#define EXT_ECHOREPLY_ACTIVE (1 << 2)/* active bit in reply message */ +#define EXT_ECHOREPLY_IPV4 (1 << 1)/* ipv4 bit in reply message */ +#define EXT_ECHOREPLY_IPV6 1 /* ipv6 bit in reply message */ +#define EXT_ECHO_CTYPE_NAME 1 +#define EXT_ECHO_CTYPE_INDEX 2 +#define EXT_ECHO_CTYPE_ADDR 3 +#define ICMP_AFI_IP 1 /* Address Family Identifier for ipv4 */ +#define ICMP_AFI_IP6 2 /* Address Family Identifier for ipv6 */ struct icmphdr { __u8 type; @@ -118,4 +138,26 @@ struct icmp_extobj_hdr { __u8 class_type; }; +/* RFC 8335: 2.1 Header for c-type 3 payload */ +struct icmp_ext_echo_ctype3_hdr { + __be16 afi; + __u8 addrlen; + __u8 reserved; +}; + +/* RFC 8335: 2.1 Interface Identification Object */ +struct icmp_ext_echo_iio { + struct icmp_extobj_hdr extobj_hdr; + union { + char name[IFNAMSIZ]; + __be32 ifindex; + struct { + struct icmp_ext_echo_ctype3_hdr ctype3_hdr; + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + } ip_addr; + } addr; + } ident; +}; #endif /* _UAPI_LINUX_ICMP_H */ -- cgit v1.2.3 From 750f4fc2a12f6632b5aa04526bf57fa06bfe8467 Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Mon, 29 Mar 2021 18:45:21 -0700 Subject: ICMPV6: add support for RFC 8335 PROBE Add definitions for the ICMPV6 type of Extended Echo Request and Extended Echo Reply, as defined by sections 2 and 3 of RFC 8335. Signed-off-by: Andreas Roeseler Signed-off-by: David S. Miller --- include/uapi/linux/icmpv6.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index 0564fd7ccde4..ecaece3af38d 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -140,6 +140,9 @@ struct icmp6hdr { #define ICMPV6_UNK_OPTION 2 #define ICMPV6_HDR_INCOMP 3 +/* Codes for EXT_ECHO (PROBE) */ +#define ICMPV6_EXT_ECHO_REQUEST 160 +#define ICMPV6_EXT_ECHO_REPLY 161 /* * constants for (set|get)sockopt */ -- cgit v1.2.3 From 1e5d1f69d9fb8ea0679f9e85915e8e7fdacfbe7a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Mar 2021 20:59:52 -0700 Subject: ethtool: support FEC settings over netlink Add FEC API to netlink. This is not a 1-to-1 conversion. FEC settings already depend on link modes to tell user which modes are supported. Take this further an use link modes for manual configuration. Old struct ethtool_fecparam is still used to talk to the drivers, so we need to translate back and forth. We can revisit the internal API if number of FEC encodings starts to grow. 
Enforce only one active FEC bit (by using a bit position rather than another mask). Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 62 ++++++- include/uapi/linux/ethtool_netlink.h | 17 ++ net/ethtool/Makefile | 2 +- net/ethtool/fec.c | 238 +++++++++++++++++++++++++++ net/ethtool/netlink.c | 19 +++ net/ethtool/netlink.h | 4 + 6 files changed, 339 insertions(+), 3 deletions(-) create mode 100644 net/ethtool/fec.c (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 05073482db05..4bdb4298f178 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -208,6 +208,8 @@ Userspace to kernel: ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT`` action start raw TDR cable test ``ETHTOOL_MSG_TUNNEL_INFO_GET`` get tunnel offload info + ``ETHTOOL_MSG_FEC_GET`` get FEC settings + ``ETHTOOL_MSG_FEC_SET`` set FEC settings ===================================== ================================ Kernel to userspace: @@ -242,6 +244,8 @@ Kernel to userspace: ``ETHTOOL_MSG_CABLE_TEST_NTF`` Cable test results ``ETHTOOL_MSG_CABLE_TEST_TDR_NTF`` Cable test TDR results ``ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY`` tunnel offload info + ``ETHTOOL_MSG_FEC_GET_REPLY`` FEC settings + ``ETHTOOL_MSG_FEC_NTF`` FEC settings ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1280,6 +1284,60 @@ Kernel response contents: For UDP tunnel table empty ``ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES`` indicates that the table contains static entries, hard-coded by the NIC. +FEC_GET +======= + +Gets FEC configuration and state like ``ETHTOOL_GFECPARAM`` ioctl request. + +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_FEC_HEADER`` nested request header + ===================================== ====== ========================== + +Kernel response contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_FEC_HEADER`` nested request header + ``ETHTOOL_A_FEC_MODES`` bitset configured modes + ``ETHTOOL_A_FEC_AUTO`` bool FEC mode auto selection + ``ETHTOOL_A_FEC_ACTIVE`` u32 index of active FEC mode + ===================================== ====== ========================== + +``ETHTOOL_A_FEC_ACTIVE`` is the bit index of the FEC link mode currently +active on the interface. This attribute may not be present if device does +not support FEC. + +``ETHTOOL_A_FEC_MODES`` and ``ETHTOOL_A_FEC_AUTO`` are only meaningful when +autonegotiation is disabled. If ``ETHTOOL_A_FEC_AUTO`` is non-zero driver will +select the FEC mode automatically based on the parameters of the SFP module. +This is equivalent to the ``ETHTOOL_FEC_AUTO`` bit of the ioctl interface. +``ETHTOOL_A_FEC_MODES`` carry the current FEC configuration using link mode +bits (rather than old ``ETHTOOL_FEC_*`` bits). + +FEC_SET +======= + +Sets FEC parameters like ``ETHTOOL_SFECPARAM`` ioctl request. 
+ +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_FEC_HEADER`` nested request header + ``ETHTOOL_A_FEC_MODES`` bitset configured modes + ``ETHTOOL_A_FEC_AUTO`` bool FEC mode auto selection + ===================================== ====== ========================== + +``FEC_SET`` is only meaningful when autonegotiation is disabled. Otherwise +FEC mode is selected as part of autonegotiation. + +``ETHTOOL_A_FEC_MODES`` selects which FEC mode should be used. It's recommended +to set only one bit, if multiple bits are set driver may choose between them +in an implementation specific way. + +``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP +module parameters. This does not mean autonegotiation. + Request translation =================== @@ -1373,8 +1431,8 @@ are netlink only. ``ETHTOOL_MSG_LINKMODES_SET`` ``ETHTOOL_PHY_GTUNABLE`` n/a ``ETHTOOL_PHY_STUNABLE`` n/a - ``ETHTOOL_GFECPARAM`` n/a - ``ETHTOOL_SFECPARAM`` n/a + ``ETHTOOL_GFECPARAM`` ``ETHTOOL_MSG_FEC_GET`` + ``ETHTOOL_SFECPARAM`` ``ETHTOOL_MSG_FEC_SET`` n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' n/a ''ETHTOOL_MSG_CABLE_TEST_TDR_ACT'' n/a ``ETHTOOL_MSG_TUNNEL_INFO_GET`` diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index a286635ac9b8..7f1bdb5b31ba 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -42,6 +42,8 @@ enum { ETHTOOL_MSG_CABLE_TEST_ACT, ETHTOOL_MSG_CABLE_TEST_TDR_ACT, ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_MSG_FEC_GET, + ETHTOOL_MSG_FEC_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -80,6 +82,8 @@ enum { ETHTOOL_MSG_CABLE_TEST_NTF, ETHTOOL_MSG_CABLE_TEST_TDR_NTF, ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, + ETHTOOL_MSG_FEC_GET_REPLY, + ETHTOOL_MSG_FEC_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -629,6 +633,19 @@ enum { ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; +/* FEC */ + +enum { + ETHTOOL_A_FEC_UNSPEC, + ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_FEC_MODES, /* bitset */ + ETHTOOL_A_FEC_AUTO, /* u8 */ + ETHTOOL_A_FEC_ACTIVE, /* u32 */ + + __ETHTOOL_A_FEC_CNT, + ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 7a849ff22dad..c2dc9033a8f7 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o + tunnels.o fec.o diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c new file mode 100644 index 000000000000..31454b9188bd --- /dev/null +++ b/net/ethtool/fec.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct fec_req_info { + struct ethnl_req_info base; +}; + +struct fec_reply_data { + struct ethnl_reply_data base; + __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes); + u32 active_fec; + u8 fec_auto; +}; + +#define FEC_REPDATA(__reply_base) \ + container_of(__reply_base, struct fec_reply_data, base) + +#define ETHTOOL_FEC_MASK ((ETHTOOL_FEC_LLRS << 1) - 1) + +const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = { + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + 
+static void +ethtool_fec_to_link_modes(u32 fec, unsigned long *link_modes, u8 *fec_auto) +{ + if (fec_auto) + *fec_auto = !!(fec & ETHTOOL_FEC_AUTO); + + if (fec & ETHTOOL_FEC_OFF) + __set_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes); + if (fec & ETHTOOL_FEC_RS) + __set_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes); + if (fec & ETHTOOL_FEC_BASER) + __set_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes); + if (fec & ETHTOOL_FEC_LLRS) + __set_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes); +} + +static int +ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec, + unsigned long *link_modes, u8 fec_auto) +{ + memset(fec, 0, sizeof(*fec)); + + if (fec_auto) + fec->fec |= ETHTOOL_FEC_AUTO; + + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_OFF; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_RS; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_BASER; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_LLRS; + + if (!bitmap_empty(link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS)) + return -EINVAL; + + return 0; +} + +static int fec_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {}; + struct fec_reply_data *data = FEC_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + struct ethtool_fecparam fec = {}; + int ret; + + if (!dev->ethtool_ops->get_fecparam) + return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = dev->ethtool_ops->get_fecparam(dev, &fec); + ethnl_ops_complete(dev); + if (ret) + return ret; + + WARN_ON_ONCE(fec.reserved); + + ethtool_fec_to_link_modes(fec.fec, data->fec_link_modes, + &data->fec_auto); + + ethtool_fec_to_link_modes(fec.active_fec, active_fec_modes, NULL); + data->active_fec = find_first_bit(active_fec_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS); + /* Don't report attr if no FEC mode set. Note that + * ethtool_fecparam_to_link_modes() ignores NONE and AUTO. 
+ */ + if (data->active_fec == __ETHTOOL_LINK_MODE_MASK_NBITS) + data->active_fec = 0; + + return 0; +} + +static int fec_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct fec_reply_data *data = FEC_REPDATA(reply_base); + int len = 0; + int ret; + + ret = ethnl_bitset_size(data->fec_link_modes, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + + len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ + nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ + + return len; +} + +static int fec_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct fec_reply_data *data = FEC_REPDATA(reply_base); + int ret; + + ret = ethnl_put_bitset(skb, ETHTOOL_A_FEC_MODES, + data->fec_link_modes, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + + if (nla_put_u8(skb, ETHTOOL_A_FEC_AUTO, data->fec_auto) || + (data->active_fec && + nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec))) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_fec_request_ops = { + .request_cmd = ETHTOOL_MSG_FEC_GET, + .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY, + .hdr_attr = ETHTOOL_A_FEC_HEADER, + .req_info_size = sizeof(struct fec_req_info), + .reply_data_size = sizeof(struct fec_reply_data), + + .prepare_data = fec_prepare_data, + .reply_size = fec_reply_size, + .fill_reply = fec_fill_reply, +}; + +/* FEC_SET */ + +const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = { + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_FEC_MODES] = { .type = NLA_NESTED }, + [ETHTOOL_A_FEC_AUTO] = NLA_POLICY_MAX(NLA_U8, 1), +}; + +int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes) = {}; + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + struct ethtool_fecparam fec = {}; + const struct ethtool_ops *ops; + struct net_device *dev; + bool mod = false; + u8 fec_auto; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEC_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; + if (!ops->get_fecparam || !ops->set_fecparam) + goto out_dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ret = ops->get_fecparam(dev, &fec); + if (ret < 0) + goto out_ops; + + ethtool_fec_to_link_modes(fec.fec, fec_link_modes, &fec_auto); + + ret = ethnl_update_bitset(fec_link_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS, + tb[ETHTOOL_A_FEC_MODES], + link_mode_names, info->extack, &mod); + if (ret < 0) + goto out_ops; + ethnl_update_u8(&fec_auto, tb[ETHTOOL_A_FEC_AUTO], &mod); + + ret = 0; + if (!mod) + goto out_ops; + + ret = ethtool_link_modes_to_fecparam(&fec, fec_link_modes, fec_auto); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], + "invalid FEC modes requested"); + goto out_ops; + } + if (!fec.fec) { + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], + "no FEC modes set"); + goto out_ops; + } + + ret = dev->ethtool_ops->set_fecparam(dev, &fec); + if (ret < 0) + goto out_ops; + ethtool_notify(dev, ETHTOOL_MSG_FEC_NTF, NULL); + +out_ops: + 
ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 50d3c8896f91..705a4b201564 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -244,6 +244,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_COALESCE_GET] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_GET] = ðnl_eee_request_ops, + [ETHTOOL_MSG_FEC_GET] = ðnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, }; @@ -551,6 +552,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_COALESCE_NTF] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_NTF] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, + [ETHTOOL_MSG_FEC_NTF] = ðnl_fec_request_ops, }; /* default notification handler */ @@ -643,6 +645,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -912,6 +915,22 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_tunnel_info_get_policy, .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_FEC_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_fec_get_policy, + .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_FEC_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_fec, + .policy = ethnl_fec_set_policy, + .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 6eabd58d81bf..785f7ee45930 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -344,6 +344,7 @@ extern const struct ethnl_request_ops ethnl_coalesce_request_ops; extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; +extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -375,6 +376,8 @@ extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; +extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; +extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -392,5 +395,6 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct 
netlink_callback *cb); +int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:30 -0700 Subject: sock_map: Introduce BPF_SK_SKB_VERDICT Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is confusing and more importantly we still want to distinguish them from user-space. So we can just reuse the stream verdict code but introduce a new type of eBPF program, skb_verdict. Users are not allowed to attach stream_verdict and skb_verdict programs to the same map. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 1 + net/core/skmsg.c | 4 +++- net/core/sock_map.c | 28 ++++++++++++++++++++++++++++ tools/bpf/bpftool/common.c | 1 + tools/bpf/bpftool/prog.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 8 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e7aba150539d..c83dbc2d81d9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,7 @@ struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; + struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { @@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); + psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 598716742593..49371eba98ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9603de81811a..6428634da57e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2948,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 656eceab73bc..a045812d7c78 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.stream_verdict) + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); @@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, } skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 42d797291d34..c2a0411e08a8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -156,6 +156,8 @@ 
static void sock_map_del_link(struct sock *sk, strp_stop = true; if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; + if (psock->saved_data_ready && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } @@ -232,6 +234,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; + struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; @@ -268,6 +271,15 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { @@ -278,6 +290,9 @@ no_progs: if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; @@ -309,6 +324,9 @@ no_progs: } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; @@ -317,6 +335,9 @@ out_unlock_drop: out_drop: sk_psock_put(sk, psock); out_progs: + if (skb_verdict) + bpf_prog_put(skb_verdict); +out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: @@ -1442,8 +1463,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #endif case BPF_SK_SKB_STREAM_VERDICT: + if (progs->skb_verdict) + return -EBUSY; pprog = &progs->stream_verdict; break; + case BPF_SK_SKB_VERDICT: + if (progs->stream_verdict) + return -EBUSY; + pprog = &progs->skb_verdict; + break; default: return -EOPNOTSUPP; } diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 65303664417e..1828bba19020 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", [BPF_LIRC_MODE2] = "lirc_mode2", [BPF_FLOW_DISSECTOR] = "flow_dissector", diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f2b915b20546..3f067d2d7584 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -76,6 +76,7 @@ enum dump_mode { static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_SKB_VERDICT] = "skb_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", [BPF_FLOW_DISSECTOR] = "flow_dissector", [__MAX_BPF_ATTACH_TYPE] = NULL, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 
ab9f2233607c..69902603012c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From dc87efdb1a5cd46134a9d490480160e303bc6eef Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 1 Apr 2021 16:19:44 -0700 Subject: mptcp: add mptcp reset option support The MPTCP reset option allows to carry a mptcp-specific error code that provides more information on the nature of a connection reset. Reset option data received gets stored in the subflow context so it can be sent to userspace via the 'subflow closed' netlink event. When a subflow is closed, the desired error code that should be sent to the peer is also placed in the subflow context structure. If a reset is sent before subflow establishment could complete, e.g. on HMAC failure during an MP_JOIN operation, the mptcp skb extension is used to store the reset information. Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 18 +++++++++++-- include/uapi/linux/mptcp.h | 11 ++++++++ net/ipv4/tcp_ipv4.c | 21 ++++++++++++--- net/ipv6/tcp_ipv6.c | 14 +++++++++- net/mptcp/options.c | 67 ++++++++++++++++++++++++++++++++++++++++++---- net/mptcp/pm_netlink.c | 12 +++++++++ net/mptcp/protocol.c | 12 ++++++--- net/mptcp/protocol.h | 14 +++++++++- net/mptcp/subflow.c | 30 ++++++++++++++++++--- 9 files changed, 180 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index cea69c801595..16fe34d139c3 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -30,8 +30,8 @@ struct mptcp_ext { ack64:1, mpc_map:1, frozen:1, - __unused:1; - /* one byte hole */ + reset_transient:1; + u8 reset_reason:4; }; #define MPTCP_RM_IDS_MAX 8 @@ -58,6 +58,8 @@ struct mptcp_out_options { struct mptcp_rm_list rm_list; u8 join_id; u8 backup; + u8 reset_reason:4; + u8 reset_transient:1; u32 nonce; u64 thmac; u32 token; @@ -156,6 +158,16 @@ void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); + +__be32 mptcp_get_reset_option(const struct sk_buff *skb); + +static inline __be32 mptcp_reset_option(const struct sk_buff *skb) +{ + if (skb_ext_exist(skb, SKB_EXT_MPTCP)) + return mptcp_get_reset_option(skb); + + return htonl(0u); +} #else static inline void mptcp_init(void) @@ -236,6 +248,8 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, { return 0; /* TCP fallback */ } + +static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index e1172c1ffdfd..8eb3c0844bff 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -174,10 +174,21 @@ enum mptcp_event_attr { MPTCP_ATTR_FLAGS, /* u16 */ MPTCP_ATTR_TIMEOUT, /* u32 */ MPTCP_ATTR_IF_IDX, /* s32 */ + MPTCP_ATTR_RESET_REASON,/* u32 */ + MPTCP_ATTR_RESET_FLAGS, /* u32 */ __MPTCP_ATTR_AFTER_LAST }; #define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1) +/* MPTCP Reset reason codes, rfc8684 */ +#define MPTCP_RST_EUNSPEC 0 +#define MPTCP_RST_EMPTCP 1 +#define MPTCP_RST_ERESOURCE 2 +#define MPTCP_RST_EPROHIBIT 3 +#define MPTCP_RST_EWQ2BIG 4 +#define MPTCP_RST_EBADPERF 5 +#define MPTCP_RST_EMIDDLEBOX 6 + #endif /* _UAPI_MPTCP_H */ 
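As a userspace-side sketch (helper names hypothetical, netlink attribute parsing via libmnl or libnl elided): once a listener receives the MPTCP_EVENT_SUB_CLOSED event, the two new attributes can be decoded against the reason codes added above. The strings follow the MP_TCPRST semantics of RFC 8684, and a non-zero MPTCP_ATTR_RESET_FLAGS value corresponds to the transient bit the kernel stores in subflow->reset_transient.

#include <stdio.h>
#include <linux/mptcp.h>

/* Map an MPTCP_ATTR_RESET_REASON value to its RFC 8684 MP_TCPRST meaning. */
static const char *mptcp_rst_reason_str(unsigned int reason)
{
	switch (reason) {
	case MPTCP_RST_EUNSPEC:    return "unspecified error";
	case MPTCP_RST_EMPTCP:     return "MPTCP-specific error";
	case MPTCP_RST_ERESOURCE:  return "lack of resources";
	case MPTCP_RST_EPROHIBIT:  return "administratively prohibited";
	case MPTCP_RST_EWQ2BIG:    return "too much outstanding data";
	case MPTCP_RST_EBADPERF:   return "unacceptable performance";
	case MPTCP_RST_EMIDDLEBOX: return "middlebox interference";
	default:                   return "unknown";
	}
}

/* Called with the u32 values already extracted from MPTCP_ATTR_RESET_REASON
 * and MPTCP_ATTR_RESET_FLAGS in an MPTCP_EVENT_SUB_CLOSED message.
 */
static void print_subflow_close_reason(unsigned int reason, unsigned int flags)
{
	printf("subflow closed: %s%s\n", mptcp_rst_reason_str(reason),
	       flags ? " (transient)" : "");
}

Only one reason code fits per reset because the option carries a 4-bit reason field, which is why the subflow context stores reset_reason:4 below.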
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index dfc6d1c0e710..312184cead57 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -655,14 +655,18 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ +#ifdef CONFIG_TCP_MD5SIG +#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED +#else +#define OPTION_BYTES sizeof(__be32) +#endif + static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; -#ifdef CONFIG_TCP_MD5SIG - __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; -#endif + __be32 opt[OPTION_BYTES / sizeof(__be32)]; } rep; struct ip_reply_arg arg; #ifdef CONFIG_TCP_MD5SIG @@ -770,6 +774,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->daddr, &rep.th); } #endif + /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ + if (rep.opt[0] == 0) { + __be32 mrst = mptcp_reset_option(skb); + + if (mrst) { + rep.opt[0] = mrst; + arg.iov[0].iov_len += sizeof(mrst); + rep.th.doff = arg.iov[0].iov_len / 4; + } + } + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bff22d6ef516..5f47c0b6e3de 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -879,8 +879,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); + __be32 mrst = 0, *topt; struct dst_entry *dst; - __be32 *topt; __u32 mark = 0; if (tsecr) @@ -890,6 +890,15 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif +#ifdef CONFIG_MPTCP + if (rst && !key) { + mrst = mptcp_reset_option(skb); + + if (mrst) + tot_len += sizeof(__be32); + } +#endif + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (!buff) @@ -920,6 +929,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 *topt++ = htonl(tsecr); } + if (mrst) + *topt++ = mrst; + #ifdef CONFIG_TCP_MD5SIG if (key) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 68361d28dc67..4b7119eb2c31 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -305,6 +305,18 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->fastclose = 1; break; + case MPTCPOPT_RST: + if (opsize != TCPOLEN_MPTCP_RST) + break; + + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) + break; + mp_opt->reset = 1; + flags = *ptr++; + mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; + mp_opt->reset_reason = *ptr; + break; + default: break; } @@ -327,6 +339,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->rm_addr = 0; mp_opt->dss = 0; mp_opt->mp_prio = 0; + mp_opt->reset = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -726,6 +739,22 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, return true; } +static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (remaining < TCPOLEN_MPTCP_RST) + return; + + *size = TCPOLEN_MPTCP_RST; + 
opts->suboptions |= OPTION_MPTCP_RST; + opts->reset_transient = subflow->reset_transient; + opts->reset_reason = subflow->reset_reason; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -741,11 +770,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(__mptcp_check_fallback(msk))) return false; - /* prevent adding of any MPTCP related options on reset packet - * until we support MP_TCPRST/MP_FASTCLOSE - */ - if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) - return false; + if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { + mptcp_established_options_rst(sk, skb, size, remaining, opts); + return true; + } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) @@ -1062,6 +1090,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mp_opt.mp_prio = 0; } + if (mp_opt.reset) { + subflow->reset_seen = 1; + subflow->reset_reason = mp_opt.reset_reason; + subflow->reset_transient = mp_opt.reset_transient; + } + if (!mp_opt.dss) return; @@ -1289,6 +1323,12 @@ mp_capable_done: ptr += 5; } + if (OPTION_MPTCP_RST & opts->suboptions) + *ptr++ = mptcp_option(MPTCPOPT_RST, + TCPOLEN_MPTCP_RST, + opts->reset_transient, + opts->reset_reason); + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; @@ -1340,3 +1380,20 @@ mp_capable_done: if (tp) mptcp_set_rwin(tp); } + +__be32 mptcp_get_reset_option(const struct sk_buff *skb) +{ + const struct mptcp_ext *ext = mptcp_get_ext(skb); + u8 flags, reason; + + if (ext) { + flags = ext->reset_transient; + reason = ext->reset_reason; + + return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, + flags, reason); + } + + return htonl(0u); +} +EXPORT_SYMBOL_GPL(mptcp_get_reset_option); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index cadafafa1049..51be6c34b339 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1687,9 +1687,21 @@ static int mptcp_event_sub_closed(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { + const struct mptcp_subflow_context *sf; + if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) return -EMSGSIZE; + sf = mptcp_subflow_ctx(ssk); + if (!sf->reset_seen) + return 0; + + if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason)) + return -EMSGSIZE; + + if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient)) + return -EMSGSIZE; + return 0; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 531ee24aa827..e894345d10c1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3090,14 +3090,18 @@ bool mptcp_finish_join(struct sock *ssk) pr_debug("msk=%p, subflow=%p", msk, subflow); /* mptcp socket already closing? */ - if (!mptcp_is_fully_established(parent)) + if (!mptcp_is_fully_established(parent)) { + subflow->reset_reason = MPTCP_RST_EMPTCP; return false; + } if (!msk->pm.server_side) goto out; - if (!mptcp_pm_allow_new_subflow(msk)) + if (!mptcp_pm_allow_new_subflow(msk)) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* active connections are already on conn_list, and we can't acquire * msk lock here. 
@@ -3111,8 +3115,10 @@ bool mptcp_finish_join(struct sock *ssk) sock_hold(ssk); } spin_unlock_bh(&msk->join_list_lock); - if (!ret) + if (!ret) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* attach to msk socket only after we are sure he will deal with us * at close time diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e8c5ff2b8ace..40e9b05856cd 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -26,6 +26,7 @@ #define OPTION_MPTCP_RM_ADDR BIT(8) #define OPTION_MPTCP_FASTCLOSE BIT(9) #define OPTION_MPTCP_PRIO BIT(10) +#define OPTION_MPTCP_RST BIT(11) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -36,6 +37,7 @@ #define MPTCPOPT_MP_PRIO 5 #define MPTCPOPT_MP_FAIL 6 #define MPTCPOPT_MP_FASTCLOSE 7 +#define MPTCPOPT_RST 8 /* MPTCP suboption lengths */ #define TCPOLEN_MPTCP_MPC_SYN 4 @@ -65,6 +67,7 @@ #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 +#define TCPOLEN_MPTCP_RST 4 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -94,6 +97,9 @@ /* MPTCP MP_PRIO flags */ #define MPTCP_PRIO_BKUP BIT(0) +/* MPTCP TCPRST flags */ +#define MPTCP_RST_TRANSIENT BIT(0) + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_NOSPACE 1 @@ -123,6 +129,7 @@ struct mptcp_options_received { u16 mp_capable : 1, mp_join : 1, fastclose : 1, + reset : 1, dss : 1, add_addr : 1, rm_addr : 1, @@ -152,6 +159,8 @@ struct mptcp_options_received { }; u64 ahmac; u16 port; + u8 reset_reason:4; + u8 reset_transient:1; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) @@ -422,6 +431,9 @@ struct mptcp_subflow_context { u8 hmac[MPTCPOPT_HMAC_LEN]; u8 local_id; u8 remote_id; + u8 reset_seen:1; + u8 reset_transient:1; + u8 reset_reason:4; long delegated_status; struct list_head delegated_node; /* link into delegated_action, protected by local BH */ @@ -742,7 +754,7 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); -static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) +static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 7a5f50d00d4b..223d6be5fc3b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -115,6 +115,16 @@ static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct soc return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } +static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) +{ + struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + + if (mpext) { + memset(mpext, 0, sizeof(*mpext)); + mpext->reset_reason = reason; + } +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset @@ -190,8 +200,10 @@ again: subflow_req->msk = subflow_token_join_request(req); /* Can't fall back to TCP in this case. 
*/ - if (!subflow_req->msk) + if (!subflow_req->msk) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EPERM; + } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { pr_debug("syn inet_sport=%d %d", @@ -400,8 +412,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; - if (!mp_opt.mp_join) + if (!mp_opt.mp_join) { + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; + } subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; @@ -410,6 +424,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } @@ -438,6 +453,7 @@ fallback: return; do_reset: + subflow->reset_transient = 0; mptcp_subflow_reset(sk); } @@ -654,8 +670,10 @@ create_child: * to reset the context to non MPTCP status. */ if (!ctx || fallback) { - if (fallback_is_fatal) + if (fallback_is_fatal) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; + } subflow_drop_ctx(child); goto out; @@ -690,8 +708,10 @@ create_child: struct mptcp_sock *owner; owner = subflow_req->msk; - if (!owner) + if (!owner) { + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; + } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; @@ -1056,6 +1076,8 @@ fatal: smp_wmb(); ssk->sk_error_report(ssk); tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); subflow->data_avail = 0; return false; -- cgit v1.2.3 From 0750cfd8b7fd491c9d7c8bd19d9ac380cd3c84ee Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Thu, 11 Mar 2021 15:03:33 -0800 Subject: nl80211: better document CMD_ROAM behavior The docs were very sparse with how exactly CMD_ROAM should be used. Specifically related to BSS information normally obtained through a user space scan. Signed-off-by: James Prestwood Link: https://lore.kernel.org/r/20210311230333.103934-1-prestwoj@gmail.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ac78da99fccd..5e30c7f6c484 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -655,6 +655,9 @@ * When a security association was established on an 802.1X network using * fast transition, this event should be followed by an * %NL80211_CMD_PORT_AUTHORIZED event. + * Following a %NL80211_CMD_ROAM event userspace can issue + * %NL80211_CMD_GET_SCAN in order to obtain the scan information for the + * new BSS the card/driver roamed to. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other * reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and -- cgit v1.2.3 From afd2daa26c7abd734d78bd274fc6c59a15e61063 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 6 Apr 2021 21:55:53 +0200 Subject: Bluetooth: Add support for virtio transport driver This adds support for Bluetooth HCI transport over virtio. 
Signed-off-by: Marcel Holtmann Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/Kconfig | 10 + drivers/bluetooth/Makefile | 2 + drivers/bluetooth/virtio_bt.c | 401 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_bt.h | 31 ++++ include/uapi/linux/virtio_ids.h | 1 + 5 files changed, 445 insertions(+) create mode 100644 drivers/bluetooth/virtio_bt.c create mode 100644 include/uapi/linux/virtio_bt.h (limited to 'include/uapi/linux') diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index 4e73a531b377..851842372c9b 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -425,4 +425,14 @@ config BT_HCIRSI Say Y here to compile support for HCI over Redpine into the kernel or say M to compile as a module. +config BT_VIRTIO + tristate "Virtio Bluetooth driver" + depends on VIRTIO + help + Virtio Bluetooth support driver. + This driver supports Virtio Bluetooth devices. + + Say Y here to compile support for HCI over Virtio into the + kernel or say M to compile as a module. + endmenu diff --git a/drivers/bluetooth/Makefile b/drivers/bluetooth/Makefile index 1a58a3ae142c..16286ea2655d 100644 --- a/drivers/bluetooth/Makefile +++ b/drivers/bluetooth/Makefile @@ -26,6 +26,8 @@ obj-$(CONFIG_BT_BCM) += btbcm.o obj-$(CONFIG_BT_RTL) += btrtl.o obj-$(CONFIG_BT_QCA) += btqca.o +obj-$(CONFIG_BT_VIRTIO) += virtio_bt.o + obj-$(CONFIG_BT_HCIUART_NOKIA) += hci_nokia.o obj-$(CONFIG_BT_HCIRSI) += btrsi.o diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c new file mode 100644 index 000000000000..c804db7e90f8 --- /dev/null +++ b/drivers/bluetooth/virtio_bt.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#define VERSION "0.1" + +enum { + VIRTBT_VQ_TX, + VIRTBT_VQ_RX, + VIRTBT_NUM_VQS, +}; + +struct virtio_bluetooth { + struct virtio_device *vdev; + struct virtqueue *vqs[VIRTBT_NUM_VQS]; + struct work_struct rx; + struct hci_dev *hdev; +}; + +static int virtbt_add_inbuf(struct virtio_bluetooth *vbt) +{ + struct virtqueue *vq = vbt->vqs[VIRTBT_VQ_RX]; + struct scatterlist sg[1]; + struct sk_buff *skb; + int err; + + skb = alloc_skb(1000, GFP_KERNEL); + sg_init_one(sg, skb->data, 1000); + + err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_KERNEL); + if (err < 0) { + kfree_skb(skb); + return err; + } + + return 0; +} + +static int virtbt_open(struct hci_dev *hdev) +{ + struct virtio_bluetooth *vbt = hci_get_drvdata(hdev); + + if (virtbt_add_inbuf(vbt) < 0) + return -EIO; + + virtqueue_kick(vbt->vqs[VIRTBT_VQ_RX]); + return 0; +} + +static int virtbt_close(struct hci_dev *hdev) +{ + struct virtio_bluetooth *vbt = hci_get_drvdata(hdev); + int i; + + cancel_work_sync(&vbt->rx); + + for (i = 0; i < ARRAY_SIZE(vbt->vqs); i++) { + struct virtqueue *vq = vbt->vqs[i]; + struct sk_buff *skb; + + while ((skb = virtqueue_detach_unused_buf(vq))) + kfree_skb(skb); + } + + return 0; +} + +static int virtbt_flush(struct hci_dev *hdev) +{ + return 0; +} + +static int virtbt_send_frame(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct virtio_bluetooth *vbt = hci_get_drvdata(hdev); + struct scatterlist sg[1]; + int err; + + memcpy(skb_push(skb, 1), &hci_skb_pkt_type(skb), 1); + + sg_init_one(sg, skb->data, skb->len); + err = virtqueue_add_outbuf(vbt->vqs[VIRTBT_VQ_TX], sg, 1, skb, + GFP_KERNEL); + if (err) { + kfree_skb(skb); + return err; + } + + virtqueue_kick(vbt->vqs[VIRTBT_VQ_TX]); + return 0; +} + +static int virtbt_setup_zephyr(struct hci_dev 
*hdev) +{ + struct sk_buff *skb; + + /* Read Build Information */ + skb = __hci_cmd_sync(hdev, 0xfc08, 0, NULL, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + bt_dev_info(hdev, "%s", (char *)(skb->data + 1)); + + hci_set_fw_info(hdev, "%s", skb->data + 1); + + kfree_skb(skb); + return 0; +} + +static int virtbt_set_bdaddr_zephyr(struct hci_dev *hdev, + const bdaddr_t *bdaddr) +{ + struct sk_buff *skb; + + /* Write BD_ADDR */ + skb = __hci_cmd_sync(hdev, 0xfc06, 6, bdaddr, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + return 0; +} + +static int virtbt_setup_intel(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + /* Intel Read Version */ + skb = __hci_cmd_sync(hdev, 0xfc05, 0, NULL, HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + return 0; +} + +static int virtbt_set_bdaddr_intel(struct hci_dev *hdev, const bdaddr_t *bdaddr) +{ + struct sk_buff *skb; + + /* Intel Write BD Address */ + skb = __hci_cmd_sync(hdev, 0xfc31, 6, bdaddr, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + return 0; +} + +static int virtbt_setup_realtek(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + /* Read ROM Version */ + skb = __hci_cmd_sync(hdev, 0xfc6d, 0, NULL, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + bt_dev_info(hdev, "ROM version %u", *((__u8 *) (skb->data + 1))); + + kfree_skb(skb); + return 0; +} + +static int virtbt_shutdown_generic(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + /* Reset */ + skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + return 0; +} + +static void virtbt_rx_handle(struct virtio_bluetooth *vbt, struct sk_buff *skb) +{ + __u8 pkt_type; + + pkt_type = *((__u8 *) skb->data); + skb_pull(skb, 1); + + switch (pkt_type) { + case HCI_EVENT_PKT: + case HCI_ACLDATA_PKT: + case HCI_SCODATA_PKT: + case HCI_ISODATA_PKT: + hci_skb_pkt_type(skb) = pkt_type; + hci_recv_frame(vbt->hdev, skb); + break; + } +} + +static void virtbt_rx_work(struct work_struct *work) +{ + struct virtio_bluetooth *vbt = container_of(work, + struct virtio_bluetooth, rx); + struct sk_buff *skb; + unsigned int len; + + skb = virtqueue_get_buf(vbt->vqs[VIRTBT_VQ_RX], &len); + if (!skb) + return; + + skb->len = len; + virtbt_rx_handle(vbt, skb); + + if (virtbt_add_inbuf(vbt) < 0) + return; + + virtqueue_kick(vbt->vqs[VIRTBT_VQ_RX]); +} + +static void virtbt_tx_done(struct virtqueue *vq) +{ + struct sk_buff *skb; + unsigned int len; + + while ((skb = virtqueue_get_buf(vq, &len))) + kfree_skb(skb); +} + +static void virtbt_rx_done(struct virtqueue *vq) +{ + struct virtio_bluetooth *vbt = vq->vdev->priv; + + schedule_work(&vbt->rx); +} + +static int virtbt_probe(struct virtio_device *vdev) +{ + vq_callback_t *callbacks[VIRTBT_NUM_VQS] = { + [VIRTBT_VQ_TX] = virtbt_tx_done, + [VIRTBT_VQ_RX] = virtbt_rx_done, + }; + const char *names[VIRTBT_NUM_VQS] = { + [VIRTBT_VQ_TX] = "tx", + [VIRTBT_VQ_RX] = "rx", + }; + struct virtio_bluetooth *vbt; + struct hci_dev *hdev; + int err; + __u8 type; + + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) + return -ENODEV; + + type = virtio_cread8(vdev, offsetof(struct virtio_bt_config, type)); + + switch (type) { + case VIRTIO_BT_CONFIG_TYPE_PRIMARY: + case VIRTIO_BT_CONFIG_TYPE_AMP: + break; + default: + return -EINVAL; + } + + vbt = kzalloc(sizeof(*vbt), GFP_KERNEL); + if (!vbt) + return -ENOMEM; + + vdev->priv = vbt; + vbt->vdev = vdev; + + INIT_WORK(&vbt->rx, 
virtbt_rx_work); + + err = virtio_find_vqs(vdev, VIRTBT_NUM_VQS, vbt->vqs, callbacks, + names, NULL); + if (err) + return err; + + hdev = hci_alloc_dev(); + if (!hdev) { + err = -ENOMEM; + goto failed; + } + + vbt->hdev = hdev; + + hdev->bus = HCI_VIRTIO; + hdev->dev_type = type; + hci_set_drvdata(hdev, vbt); + + hdev->open = virtbt_open; + hdev->close = virtbt_close; + hdev->flush = virtbt_flush; + hdev->send = virtbt_send_frame; + + if (virtio_has_feature(vdev, VIRTIO_BT_F_VND_HCI)) { + __u16 vendor; + + virtio_cread(vdev, struct virtio_bt_config, vendor, &vendor); + + switch (vendor) { + case VIRTIO_BT_CONFIG_VENDOR_ZEPHYR: + hdev->manufacturer = 1521; + hdev->setup = virtbt_setup_zephyr; + hdev->shutdown = virtbt_shutdown_generic; + hdev->set_bdaddr = virtbt_set_bdaddr_zephyr; + break; + + case VIRTIO_BT_CONFIG_VENDOR_INTEL: + hdev->manufacturer = 2; + hdev->setup = virtbt_setup_intel; + hdev->shutdown = virtbt_shutdown_generic; + hdev->set_bdaddr = virtbt_set_bdaddr_intel; + set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks); + set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + break; + + case VIRTIO_BT_CONFIG_VENDOR_REALTEK: + hdev->manufacturer = 93; + hdev->setup = virtbt_setup_realtek; + hdev->shutdown = virtbt_shutdown_generic; + set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + break; + } + } + + if (virtio_has_feature(vdev, VIRTIO_BT_F_MSFT_EXT)) { + __u16 msft_opcode; + + virtio_cread(vdev, struct virtio_bt_config, + msft_opcode, &msft_opcode); + + hci_set_msft_opcode(hdev, msft_opcode); + } + + if (virtio_has_feature(vdev, VIRTIO_BT_F_AOSP_EXT)) + hci_set_aosp_capable(hdev); + + if (hci_register_dev(hdev) < 0) { + hci_free_dev(hdev); + err = -EBUSY; + goto failed; + } + + return 0; + +failed: + vdev->config->del_vqs(vdev); + return err; +} + +static void virtbt_remove(struct virtio_device *vdev) +{ + struct virtio_bluetooth *vbt = vdev->priv; + struct hci_dev *hdev = vbt->hdev; + + hci_unregister_dev(hdev); + vdev->config->reset(vdev); + + hci_free_dev(hdev); + vbt->hdev = NULL; + + vdev->config->del_vqs(vdev); + kfree(vbt); +} + +static struct virtio_device_id virtbt_table[] = { + { VIRTIO_ID_BT, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +MODULE_DEVICE_TABLE(virtio, virtbt_table); + +static const unsigned int virtbt_features[] = { + VIRTIO_BT_F_VND_HCI, + VIRTIO_BT_F_MSFT_EXT, + VIRTIO_BT_F_AOSP_EXT, +}; + +static struct virtio_driver virtbt_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .feature_table = virtbt_features, + .feature_table_size = ARRAY_SIZE(virtbt_features), + .id_table = virtbt_table, + .probe = virtbt_probe, + .remove = virtbt_remove, +}; + +module_virtio_driver(virtbt_driver); + +MODULE_AUTHOR("Marcel Holtmann "); +MODULE_DESCRIPTION("Generic Bluetooth VIRTIO driver ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/virtio_bt.h b/include/uapi/linux/virtio_bt.h new file mode 100644 index 000000000000..a7bd48daa9a9 --- /dev/null +++ b/include/uapi/linux/virtio_bt.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ + +#ifndef _UAPI_LINUX_VIRTIO_BT_H +#define _UAPI_LINUX_VIRTIO_BT_H + +#include + +/* Feature bits */ +#define VIRTIO_BT_F_VND_HCI 0 /* Indicates vendor command support */ +#define VIRTIO_BT_F_MSFT_EXT 1 /* Indicates MSFT vendor support */ +#define VIRTIO_BT_F_AOSP_EXT 2 /* Indicates AOSP vendor support */ + +enum 
virtio_bt_config_type { + VIRTIO_BT_CONFIG_TYPE_PRIMARY = 0, + VIRTIO_BT_CONFIG_TYPE_AMP = 1, +}; + +enum virtio_bt_config_vendor { + VIRTIO_BT_CONFIG_VENDOR_NONE = 0, + VIRTIO_BT_CONFIG_VENDOR_ZEPHYR = 1, + VIRTIO_BT_CONFIG_VENDOR_INTEL = 2, + VIRTIO_BT_CONFIG_VENDOR_REALTEK = 3, +}; + +struct virtio_bt_config { + __u8 type; + __u16 vendor; + __u16 msft_opcode; +} __attribute__((packed)); + +#endif /* _UAPI_LINUX_VIRTIO_BT_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index bc1c0621f5ed..b4f468e9441d 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -53,6 +53,7 @@ #define VIRTIO_ID_MEM 24 /* virtio mem */ #define VIRTIO_ID_FS 26 /* virtio filesystem */ #define VIRTIO_ID_PMEM 27 /* virtio pmem */ +#define VIRTIO_ID_BT 28 /* virtio bluetooth */ #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ #endif /* _LINUX_VIRTIO_IDS_H */ -- cgit v1.2.3 From d84d13d6f6e03a7aaac9e7d064a11cf6f5087da6 Mon Sep 17 00:00:00 2001 From: Vamsi Krishna Date: Tue, 2 Mar 2021 20:20:36 +0530 Subject: nl80211: Add interface to indicate TDLS peer's HE capability Enhance enum nl80211_tdls_peer_capability to configure TDLS peer's support for HE mode. Userspace decodes the TDLS setup response frame and confugures the HE mode support to driver if the peer has advertized HE mode support in TDLS setup response frame. The driver uses this information to decide whether to include HE operation IE in TDLS setup confirmation frame. Signed-off-by: Vamsi Krishna Link: https://lore.kernel.org/r/1614696636-30144-1-git-send-email-vamsin@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5e30c7f6c484..18dfe744bcb5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6298,11 +6298,13 @@ struct nl80211_vendor_cmd_info { * @NL80211_TDLS_PEER_HT: TDLS peer is HT capable. * @NL80211_TDLS_PEER_VHT: TDLS peer is VHT capable. * @NL80211_TDLS_PEER_WMM: TDLS peer is WMM capable. + * @NL80211_TDLS_PEER_HE: TDLS peer is HE capable. */ enum nl80211_tdls_peer_capability { NL80211_TDLS_PEER_HT = 1<<0, NL80211_TDLS_PEER_VHT = 1<<1, NL80211_TDLS_PEER_WMM = 1<<2, + NL80211_TDLS_PEER_HE = 1<<3, }; /** -- cgit v1.2.3 From 53f111cbfac6f2000c604994ddf01d3299d7de6b Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Wed, 7 Apr 2021 22:22:51 +0200 Subject: net: phy: add constants for 2.5G and 5G speed in PCS speed register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add constants for 2.5G and 5G speed in PCS speed register into mdio.h. Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/mdio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 3f302e2523b2..bdf77dffa5a4 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -120,6 +120,8 @@ #define MDIO_PMA_SPEED_100 0x0020 /* 100M capable */ #define MDIO_PMA_SPEED_10 0x0040 /* 10M capable */ #define MDIO_PCS_SPEED_10P2B 0x0002 /* 10PASS-TS/2BASE-TL capable */ +#define MDIO_PCS_SPEED_2_5G 0x0040 /* 2.5G capable */ +#define MDIO_PCS_SPEED_5G 0x0080 /* 5G capable */ /* Device present registers. 
*/ #define MDIO_DEVS_PRESENT(devad) (1 << (devad)) -- cgit v1.2.3 From c781ff12a2f37a9795e13bf328e5053d3e69f9e0 Mon Sep 17 00:00:00 2001 From: Vladyslav Tarasiuk Date: Fri, 9 Apr 2021 11:06:34 +0300 Subject: ethtool: Allow network drivers to dump arbitrary EEPROM data Define get_module_eeprom_by_page() ethtool callback and implement netlink infrastructure. get_module_eeprom_by_page() allows network drivers to dump a part of module's EEPROM specified by page and bank numbers along with offset and length. It is effectively a netlink replacement for get_module_info() and get_module_eeprom() pair, which is needed due to emergence of complex non-linear EEPROM layouts. Signed-off-by: Vladyslav Tarasiuk Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 36 +++++- include/linux/ethtool.h | 33 +++++- include/uapi/linux/ethtool_netlink.h | 19 +++ net/ethtool/Makefile | 2 +- net/ethtool/eeprom.c | 171 +++++++++++++++++++++++++++ net/ethtool/netlink.c | 11 ++ net/ethtool/netlink.h | 2 + 7 files changed, 270 insertions(+), 4 deletions(-) create mode 100644 net/ethtool/eeprom.c (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index ce4a69f8308f..bbecffc7b11a 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1338,6 +1338,38 @@ in an implementation specific way. ``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP module parameters. This does not mean autonegotiation. +MODULE_EEPROM +============= + +Fetch module EEPROM data dump. +This interface is designed to allow dumps of at most 1/2 page at once. This +means only dumps of 128 (or less) bytes are allowed, without crossing half page +boundary located at offset 128. For pages other than 0 only high 128 bytes are +accessible. + +Request contents: + + ======================================= ====== ========================== + ``ETHTOOL_A_MODULE_EEPROM_HEADER`` nested request header + ``ETHTOOL_A_MODULE_EEPROM_OFFSET`` u32 offset within a page + ``ETHTOOL_A_MODULE_EEPROM_LENGTH`` u32 amount of bytes to read + ``ETHTOOL_A_MODULE_EEPROM_PAGE`` u8 page number + ``ETHTOOL_A_MODULE_EEPROM_BANK`` u8 bank number + ``ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS`` u8 page I2C address + ======================================= ====== ========================== + +Kernel response contents: + + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_MODULE_EEPROM_HEADER`` | nested | reply header | + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_MODULE_EEPROM_DATA`` | nested | array of bytes from | + | | | module EEPROM | + +---------------------------------------------+--------+---------------------+ + +``ETHTOOL_A_MODULE_EEPROM_DATA`` has an attribute length equal to the amount of +bytes driver actually read. + Request translation =================== @@ -1415,8 +1447,8 @@ are netlink only. 
``ETHTOOL_GET_DUMP_FLAG`` n/a ``ETHTOOL_GET_DUMP_DATA`` n/a ``ETHTOOL_GET_TS_INFO`` ``ETHTOOL_MSG_TSINFO_GET`` - ``ETHTOOL_GMODULEINFO`` n/a - ``ETHTOOL_GMODULEEEPROM`` n/a + ``ETHTOOL_GMODULEINFO`` ``ETHTOOL_MSG_MODULE_EEPROM_GET`` + ``ETHTOOL_GMODULEEEPROM`` ``ETHTOOL_MSG_MODULE_EEPROM_GET`` ``ETHTOOL_GEEE`` ``ETHTOOL_MSG_EEE_GET`` ``ETHTOOL_SEEE`` ``ETHTOOL_MSG_EEE_SET`` ``ETHTOOL_GRSSH`` n/a diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4290e2fa3117..9f6f323af59a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -81,6 +81,7 @@ enum { #define ETH_RSS_HASH_NO_CHANGE 0 struct net_device; +struct netlink_ext_ack; /* Some generic methods drivers may use in their ethtool_ops */ u32 ethtool_op_get_link(struct net_device *dev); @@ -262,6 +263,31 @@ struct ethtool_pause_stats { u64 rx_pause_frames; }; +#define ETH_MODULE_EEPROM_PAGE_LEN 128 +#define ETH_MODULE_MAX_I2C_ADDRESS 0x7f + +/** + * struct ethtool_module_eeprom - EEPROM dump from specified page + * @offset: Offset within the specified EEPROM page to begin read, in bytes. + * @length: Number of bytes to read. + * @page: Page number to read from. + * @bank: Page bank number to read from, if applicable by EEPROM spec. + * @i2c_address: I2C address of a page. Value less than 0x7f expected. Most + * EEPROMs use 0x50 or 0x51. + * @data: Pointer to buffer with EEPROM data of @length size. + * + * This can be used to manage pages during EEPROM dump in ethtool and pass + * required information to the driver. + */ +struct ethtool_module_eeprom { + __u32 offset; + __u32 length; + __u8 page; + __u8 bank; + __u8 i2c_address; + __u8 *data; +}; + /** * struct ethtool_ops - optional netdev operations * @cap_link_lanes_supported: indicates if the driver supports lanes @@ -414,6 +440,9 @@ struct ethtool_pause_stats { * cannot use the standard PHY library helpers. * @get_phy_tunable: Read the value of a PHY tunable. * @set_phy_tunable: Set the value of a PHY tunable. + * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from + * specified page. Returns a negative error code or the amount of bytes + * read. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. 
Callers must @@ -519,6 +548,9 @@ struct ethtool_ops { const struct ethtool_tunable *, void *); int (*set_phy_tunable)(struct net_device *, const struct ethtool_tunable *, const void *); + int (*get_module_eeprom_by_page)(struct net_device *dev, + const struct ethtool_module_eeprom *page, + struct netlink_ext_ack *extack); }; int ethtool_check_ops(const struct ethtool_ops *ops); @@ -542,7 +574,6 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd, u32 *dev_speed, u8 *dev_duplex); -struct netlink_ext_ack; struct phy_device; struct phy_tdr_config; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7f1bdb5b31ba..9612dcd48a6a 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -44,6 +44,7 @@ enum { ETHTOOL_MSG_TUNNEL_INFO_GET, ETHTOOL_MSG_FEC_GET, ETHTOOL_MSG_FEC_SET, + ETHTOOL_MSG_MODULE_EEPROM_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -84,6 +85,7 @@ enum { ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, ETHTOOL_MSG_FEC_GET_REPLY, ETHTOOL_MSG_FEC_NTF, + ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -646,6 +648,23 @@ enum { ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) }; +/* MODULE EEPROM */ + +enum { + ETHTOOL_A_MODULE_EEPROM_UNSPEC, + ETHTOOL_A_MODULE_EEPROM_HEADER, /* nest - _A_HEADER_* */ + + ETHTOOL_A_MODULE_EEPROM_OFFSET, /* u32 */ + ETHTOOL_A_MODULE_EEPROM_LENGTH, /* u32 */ + ETHTOOL_A_MODULE_EEPROM_PAGE, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_BANK, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_DATA, /* nested */ + + __ETHTOOL_A_MODULE_EEPROM_CNT, + ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index c2dc9033a8f7..83842685fd8c 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o + tunnels.o fec.o eeprom.o diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c new file mode 100644 index 000000000000..8536dd905da5 --- /dev/null +++ b/net/ethtool/eeprom.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include "netlink.h" +#include "common.h" + +struct eeprom_req_info { + struct ethnl_req_info base; + u32 offset; + u32 length; + u8 page; + u8 bank; + u8 i2c_address; +}; + +struct eeprom_reply_data { + struct ethnl_reply_data base; + u32 length; + u8 *data; +}; + +#define MODULE_EEPROM_REQINFO(__req_base) \ + container_of(__req_base, struct eeprom_req_info, base) + +#define MODULE_EEPROM_REPDATA(__reply_base) \ + container_of(__reply_base, struct eeprom_reply_data, base) + +static int eeprom_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); + struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); + struct ethtool_module_eeprom page_data = {0}; + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_module_eeprom_by_page) + return -EOPNOTSUPP; + + page_data.offset = request->offset; + page_data.length = 
request->length; + page_data.i2c_address = request->i2c_address; + page_data.page = request->page; + page_data.bank = request->bank; + page_data.data = kmalloc(page_data.length, GFP_KERNEL); + if (!page_data.data) + return -ENOMEM; + + ret = ethnl_ops_begin(dev); + if (ret) + goto err_free; + + ret = dev->ethtool_ops->get_module_eeprom_by_page(dev, &page_data, + info->extack); + if (ret < 0) + goto err_ops; + + reply->length = ret; + reply->data = page_data.data; + + ethnl_ops_complete(dev); + return 0; + +err_ops: + ethnl_ops_complete(dev); +err_free: + kfree(page_data.data); + return ret; +} + +static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info); + + if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] || + !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] || + !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] || + !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]) + return -EINVAL; + + request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]); + request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]); + request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]); + + if (!request->length) + return -EINVAL; + + /* The following set of conditions limit the API to only dump 1/2 + * EEPROM page without crossing low page boundary located at offset 128. + * This means user may only request dumps of length limited to 128 from + * either low 128 bytes or high 128 bytes. + * For pages higher than 0 only high 128 bytes are accessible. + */ + request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]); + if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE], + "reading from lower half page is allowed for page 0 only"); + return -EINVAL; + } + + if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN && + request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], + "reading cross half page boundary is illegal"); + return -EINVAL; + } else if (request->offset >= ETH_MODULE_EEPROM_PAGE_LEN * 2) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_OFFSET], + "offset is out of bounds"); + return -EINVAL; + } else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], + "reading cross page boundary is illegal"); + return -EINVAL; + } + + if (tb[ETHTOOL_A_MODULE_EEPROM_BANK]) + request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]); + + return 0; +} + +static int eeprom_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); + + return nla_total_size(sizeof(u8) * request->length); /* _EEPROM_DATA */ +} + +static int eeprom_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); + + return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, reply->length, reply->data); +} + +static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base) +{ + struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); + + kfree(reply->data); +} + +const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { + .request_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, + .reply_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, + 
.hdr_attr = ETHTOOL_A_MODULE_EEPROM_HEADER, + .req_info_size = sizeof(struct eeprom_req_info), + .reply_data_size = sizeof(struct eeprom_reply_data), + + .parse_request = eeprom_parse_request, + .prepare_data = eeprom_prepare_data, + .reply_size = eeprom_reply_size, + .fill_reply = eeprom_fill_reply, + .cleanup_data = eeprom_cleanup_data, +}; + +const struct nla_policy ethnl_module_eeprom_get_policy[] = { + [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_MODULE_EEPROM_OFFSET] = { .type = NLA_U32 }, + [ETHTOOL_A_MODULE_EEPROM_LENGTH] = { .type = NLA_U32 }, + [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 }, + [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 }, + [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] = + NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS), +}; + diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 705a4b201564..5f5d7c4b3d4a 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -246,6 +246,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_EEE_GET] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_GET] = ðnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, + [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -931,6 +932,16 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_fec_set_policy, .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_module_eeprom_get_policy, + .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 785f7ee45930..4305ac971bb0 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -345,6 +345,7 @@ extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; +extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -378,6 +379,7 @@ extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_T extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; +extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From f3c45326ee71d1d3ec11e9ddb5afc04bca9ae492 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Sat, 10 Apr 2021 10:45:48 -0700 Subject: bpf: Document PROG_TEST_RUN limitations Per net/bpf/test_run.c, particular prog types have additional restrictions around the parameters that can be provided, so document these in the header. 
I didn't bother documenting the limitation on duration for raw tracepoints since that's an output parameter anyway. Tested with ./tools/testing/selftests/bpf/test_doc_build.sh. Suggested-by: Yonghong Song Signed-off-by: Joe Stringer Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210410174549.816482-1-joe@cilium.io --- include/uapi/linux/bpf.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 49371eba98ba..e1ee1be7e49b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -312,6 +312,27 @@ union bpf_iter_link_info { * *ctx_out*, *data_out* (for example, packet data), result of the * execution *retval*, and *duration* of the test run. * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_XDP** + * *ctx_in* and *ctx_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. -- cgit v1.2.3 From 5c507329000e282dce91e6c98ee6ffa61a8a5e49 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Mon, 12 Apr 2021 16:24:32 -0300 Subject: libbpf: Clarify flags in ringbuf helpers In 'bpf_ringbuf_reserve()' we require the flag to '0' at the moment. For 'bpf_ringbuf_{discard,submit,output}' a flag of '0' might send a notification to the process if needed. Signed-off-by: Pedro Tammela Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210412192434.944343-1-pctammela@mojatatu.com --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e1ee1be7e49b..85c924bc21b1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4082,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4099,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. 
* If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4109,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e1ee1be7e49b..85c924bc21b1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4082,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4099,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4109,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * -- cgit v1.2.3 From 441e8c66b23e027c00ccebd70df9fd933918eefe Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 13 Apr 2021 11:16:06 +0200 Subject: bpf: Return target info when a tracing bpf_link is queried MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is currently no way to discover the target of a tracing program attachment after the fact. Add this information to bpf_link_info and return it when querying the bpf_link fd. 
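For illustration only (an editorial sketch, not part of the patch): with the two new fields in struct bpf_link_info, user space can discover the attachment target with the existing bpf_obj_get_info_by_fd() libbpf call. print_tracing_target() is a made-up helper name; the structure fields are exactly those added by this patch.

/* Minimal userspace sketch: query a tracing link fd and print its target. */
#include <stdio.h>
#include <string.h>
#include <bpf/bpf.h>		/* bpf_obj_get_info_by_fd() */
#include <linux/bpf.h>		/* struct bpf_link_info */

static int print_tracing_target(int link_fd)
{
	struct bpf_link_info info;
	__u32 len = sizeof(info);
	int err;

	memset(&info, 0, sizeof(info));
	err = bpf_obj_get_info_by_fd(link_fd, &info, &len);
	if (err)
		return err;

	if (info.type == BPF_LINK_TYPE_TRACING)
		printf("attach_type %u target_obj_id %u target_btf_id %u\n",
		       info.tracing.attach_type,
		       info.tracing.target_obj_id,
		       info.tracing.target_btf_id);
	return 0;
}
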
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210413091607.58945-1-toke@redhat.com --- include/linux/bpf_verifier.h | 9 +++++++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 3 +++ tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 51c2ffa3d901..6023a1367853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -487,6 +487,15 @@ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } +/* unpack the IDs from the key as constructed above */ +static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) +{ + if (obj_id) + *obj_id = key >> 32; + if (btf_id) + *btf_id = key & 0x7FFFFFFF; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85c924bc21b1..df164a44bb41 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5416,6 +5416,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6428634da57e..fd495190115e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2551,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, container_of(link, struct bpf_tracing_link, link); info->tracing.attach_type = tr_link->attach_type; + bpf_trampoline_unpack_key(tr_link->trampoline->key, + &info->tracing.target_obj_id, + &info->tracing.target_btf_id); return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85c924bc21b1..df164a44bb41 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5416,6 +5416,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; -- cgit v1.2.3 From be85dbfeb37c8c4d4344da2ee594d78034b82489 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Apr 2021 15:53:15 -0700 Subject: ethtool: add FEC statistics Similarly to pause statistics add stats for FEC. The IEEE standard mandates two sets of counters: - 30.5.1.1.17 aFECCorrectedBlocks - 30.5.1.1.18 aFECUncorrectableBlocks where block is a block of bits FEC operates on. Each of these counters is defined per lane (PCS instance). Multiple vendors provide number of corrected _bits_ rather than/as well as blocks. This set adds the 2 standard-based block counters and a extra one for corrected bits. Counters are exposed to user space via netlink in new attributes. Each attribute carries an array of u64s, first element is the total count, and the following ones are a per-lane break down. Much like with pause stats the operation will not fail when driver does not implement the get_fec_stats callback (nor can the driver fail the operation by returning an error). If stats can't be reported the relevant attributes will be empty. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- Documentation/networking/ethtool-netlink.rst | 21 ++++++++ Documentation/networking/statistics.rst | 2 + include/linux/ethtool.h | 40 +++++++++++++++ include/uapi/linux/ethtool_netlink.h | 14 ++++++ net/ethtool/fec.c | 73 +++++++++++++++++++++++++++- 5 files changed, 149 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index bbecffc7b11a..f8219e2f489e 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1302,6 +1302,7 @@ Kernel response contents: ``ETHTOOL_A_FEC_MODES`` bitset configured modes ``ETHTOOL_A_FEC_AUTO`` bool FEC mode auto selection ``ETHTOOL_A_FEC_ACTIVE`` u32 index of active FEC mode + ``ETHTOOL_A_FEC_STATS`` nested FEC statistics ===================================== ====== ========================== ``ETHTOOL_A_FEC_ACTIVE`` is the bit index of the FEC link mode currently @@ -1315,6 +1316,26 @@ This is equivalent to the ``ETHTOOL_FEC_AUTO`` bit of the ioctl interface. ``ETHTOOL_A_FEC_MODES`` carry the current FEC configuration using link mode bits (rather than old ``ETHTOOL_FEC_*`` bits). +``ETHTOOL_A_FEC_STATS`` are reported if ``ETHTOOL_FLAG_STATS`` was set in +``ETHTOOL_A_HEADER_FLAGS``. +Each attribute carries an array of 64bit statistics. First entry in the array +contains the total number of events on the port, while the following entries +are counters corresponding to lanes/PCS instances. The number of entries in +the array will be: + ++--------------+---------------------------------------------+ +| `0` | device does not support FEC statistics | ++--------------+---------------------------------------------+ +| `1` | device does not support per-lane break down | ++--------------+---------------------------------------------+ +| `1 + #lanes` | device has full support for FEC stats | ++--------------+---------------------------------------------+ + +Drivers fill in the statistics in the following structure: + +.. kernel-doc:: include/linux/ethtool.h + :identifiers: ethtool_fec_stats + FEC_SET ======= diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst index 234abedc29b2..b748fe44ee02 100644 --- a/Documentation/networking/statistics.rst +++ b/Documentation/networking/statistics.rst @@ -130,6 +130,7 @@ the `ETHTOOL_FLAG_STATS` flag in `ETHTOOL_A_HEADER_FLAGS`. Currently statistics are supported in the following commands: - `ETHTOOL_MSG_PAUSE_GET` + - `ETHTOOL_MSG_FEC_GET` debugfs ------- @@ -176,3 +177,4 @@ translated to netlink attributes when dumped. Drivers must not overwrite the statistics they don't report with 0. - ethtool_pause_stats() +- ethtool_fec_stats() diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 069100b252bd..112a85b57f1f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -269,6 +269,39 @@ struct ethtool_pause_stats { u64 rx_pause_frames; }; +#define ETHTOOL_MAX_LANES 8 + +/** + * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC + * @corrected_blocks: number of received blocks corrected by FEC + * Reported to user space as %ETHTOOL_A_FEC_STAT_CORRECTED. + * + * Equivalent to `30.5.1.1.17 aFECCorrectedBlocks` from the standard. + * + * @uncorrectable_blocks: number of received blocks FEC was not able to correct + * Reported to user space as %ETHTOOL_A_FEC_STAT_UNCORR. + * + * Equivalent to `30.5.1.1.18 aFECUncorrectableBlocks` from the standard. 
+ * + * @corrected_bits: number of bits corrected by FEC + * Similar to @corrected_blocks but counts individual bit changes, + * not entire FEC data blocks. This is a non-standard statistic. + * Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS. + * + * @lane: per-lane/PCS-instance counts as defined by the standard + * @total: error counts for the entire port, for drivers incapable of reporting + * per-lane stats + * + * Drivers should fill in either only total or per-lane statistics, core + * will take care of adding lane values up to produce the total. + */ +struct ethtool_fec_stats { + struct ethtool_fec_stat { + u64 total; + u64 lanes[ETHTOOL_MAX_LANES]; + } corrected_blocks, uncorrectable_blocks, corrected_bits; +}; + #define ETH_MODULE_EEPROM_PAGE_LEN 128 #define ETH_MODULE_MAX_I2C_ADDRESS 0x7f @@ -439,6 +472,11 @@ struct ethtool_module_eeprom { * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), * any change to them will be overwritten by kernel. Returns a negative * error code or zero. + * @get_fec_stats: Report FEC statistics. + * Core will sum up per-lane stats to get the total. + * Drivers must not zero statistics which they don't report. The stats + * structure is initialized to ETHTOOL_STAT_NOT_SET indicating driver does + * not report statistics. * @get_fecparam: Get the network device Forward Error Correction parameters. * @set_fecparam: Set the network device Forward Error Correction parameters. * @get_ethtool_phy_stats: Return extended statistics about the PHY device. @@ -544,6 +582,8 @@ struct ethtool_ops { struct ethtool_link_ksettings *); int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); + void (*get_fec_stats)(struct net_device *dev, + struct ethtool_fec_stats *fec_stats); int (*get_fecparam)(struct net_device *, struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 9612dcd48a6a..3a2b31ccbc5b 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -643,11 +643,25 @@ enum { ETHTOOL_A_FEC_MODES, /* bitset */ ETHTOOL_A_FEC_AUTO, /* u8 */ ETHTOOL_A_FEC_ACTIVE, /* u32 */ + ETHTOOL_A_FEC_STATS, /* nest - _A_FEC_STAT */ __ETHTOOL_A_FEC_CNT, ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) }; +enum { + ETHTOOL_A_FEC_STAT_UNSPEC, + ETHTOOL_A_FEC_STAT_PAD, + + ETHTOOL_A_FEC_STAT_CORRECTED, /* array, u64 */ + ETHTOOL_A_FEC_STAT_UNCORR, /* array, u64 */ + ETHTOOL_A_FEC_STAT_CORR_BITS, /* array, u64 */ + + /* add new constants above here */ + __ETHTOOL_A_FEC_STAT_CNT, + ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) +}; + /* MODULE EEPROM */ enum { diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index 3e7d091ee7aa..8738dafd5417 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -13,6 +13,10 @@ struct fec_reply_data { __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes); u32 active_fec; u8 fec_auto; + struct fec_stat_grp { + u64 stats[1 + ETHTOOL_MAX_LANES]; + u8 cnt; + } corr, uncorr, corr_bits; }; #define FEC_REPDATA(__reply_base) \ @@ -21,7 +25,7 @@ struct fec_reply_data { #define ETHTOOL_FEC_MASK ((ETHTOOL_FEC_LLRS << 1) - 1) const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = { - [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static void @@ -64,6 +68,28 @@ ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec, return 0; } +static void 
+fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats) +{ + int i; + + if (stats->lanes[0] == ETHTOOL_STAT_NOT_SET) { + grp->stats[0] = stats->total; + grp->cnt = stats->total != ETHTOOL_STAT_NOT_SET; + return; + } + + grp->cnt = 1; + grp->stats[0] = 0; + for (i = 0; i < ETHTOOL_MAX_LANES; i++) { + if (stats->lanes[i] == ETHTOOL_STAT_NOT_SET) + break; + + grp->stats[0] += stats->lanes[i]; + grp->stats[grp->cnt++] = stats->lanes[i]; + } +} + static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -82,6 +108,17 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base, ret = dev->ethtool_ops->get_fecparam(dev, &fec); if (ret) goto out_complete; + if (req_base->flags & ETHTOOL_FLAG_STATS && + dev->ethtool_ops->get_fec_stats) { + struct ethtool_fec_stats stats; + + ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats); + + fec_stats_recalc(&data->corr, &stats.corrected_blocks); + fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); + fec_stats_recalc(&data->corr_bits, &stats.corrected_bits); + } WARN_ON_ONCE(fec.reserved); @@ -120,9 +157,40 @@ static int fec_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ + if (req_base->flags & ETHTOOL_FLAG_STATS) + len += 3 * nla_total_size_64bit(sizeof(u64) * + (1 + ETHTOOL_MAX_LANES)); + return len; } +static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STATS); + if (!nest) + return -EMSGSIZE; + + if (nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORRECTED, + sizeof(u64) * data->corr.cnt, + data->corr.stats, ETHTOOL_A_FEC_STAT_PAD) || + nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_UNCORR, + sizeof(u64) * data->uncorr.cnt, + data->uncorr.stats, ETHTOOL_A_FEC_STAT_PAD) || + nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORR_BITS, + sizeof(u64) * data->corr_bits.cnt, + data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fec_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) @@ -143,6 +211,9 @@ static int fec_fill_reply(struct sk_buff *skb, nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec))) return -EMSGSIZE; + if (req_base->flags & ETHTOOL_FLAG_STATS && fec_put_stats(skb, data)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3 From f09ea6fb12723d6726293d68de00b6307368bd76 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Apr 2021 12:27:39 -0700 Subject: ethtool: add a new command for reading standard stats Add an interface for reading standard stats, including stats which don't have a corresponding control interface. Start with IEEE 802.3 PHY stats. There seems to be only one stat to expose there. Define API to not require user space changes when new stats or groups are added. Groups are based on bitset, stats have a string set associated. v1: wrap stats in a nest Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 10 ++ include/uapi/linux/ethtool.h | 4 + include/uapi/linux/ethtool_netlink.h | 47 ++++++++ net/ethtool/Makefile | 2 +- net/ethtool/netlink.c | 10 ++ net/ethtool/netlink.h | 5 + net/ethtool/stats.c | 200 +++++++++++++++++++++++++++++++++++ net/ethtool/strset.c | 10 ++ 8 files changed, 287 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/stats.c (limited to 'include/uapi/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 112a85b57f1f..2d5455eedbf4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -250,6 +250,13 @@ static inline void ethtool_stats_init(u64 *stats, unsigned int n) stats[n] = ETHTOOL_STAT_NOT_SET; } +/* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed + * via a more targeted API. + */ +struct ethtool_eth_phy_stats { + u64 SymbolErrorDuringCarrier; +}; + /** * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames * @tx_pause_frames: transmitted pause frame count. Reported to user space @@ -487,6 +494,7 @@ struct ethtool_module_eeprom { * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from * specified page. Returns a negative error code or the amount of bytes * read. + * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -597,6 +605,8 @@ struct ethtool_ops { int (*get_module_eeprom_by_page)(struct net_device *dev, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack); + void (*get_eth_phy_stats)(struct net_device *dev, + struct ethtool_eth_phy_stats *phy_stats); }; int ethtool_check_ops(const struct ethtool_ops *ops); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f91e079e3108..190ae6e03918 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -669,6 +669,8 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_TS_TX_TYPES: timestamping Tx types * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types + * @ETH_SS_STATS_STD: standardized stats + * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -689,6 +691,8 @@ enum ethtool_stringset { ETH_SS_TS_TX_TYPES, ETH_SS_TS_RX_FILTERS, ETH_SS_UDP_TUNNEL_TYPES, + ETH_SS_STATS_STD, + ETH_SS_STATS_ETH_PHY, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 3a2b31ccbc5b..a54cfe625f34 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -45,6 +45,7 @@ enum { ETHTOOL_MSG_FEC_GET, ETHTOOL_MSG_FEC_SET, ETHTOOL_MSG_MODULE_EEPROM_GET, + ETHTOOL_MSG_STATS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -86,6 +87,7 @@ enum { ETHTOOL_MSG_FEC_GET_REPLY, ETHTOOL_MSG_FEC_NTF, ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, + ETHTOOL_MSG_STATS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -679,6 +681,51 @@ enum { ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) }; +/* STATS */ + +enum { + ETHTOOL_A_STATS_UNSPEC, + ETHTOOL_A_STATS_PAD, + ETHTOOL_A_STATS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_STATS_GROUPS, /* bitset */ + + ETHTOOL_A_STATS_GRP, /* nest - _A_STATS_GRP_* */ + + /* add new constants above here */ + __ETHTOOL_A_STATS_CNT, + ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) 
+}; + +enum { + ETHTOOL_STATS_ETH_PHY, + + /* add new constants above here */ + __ETHTOOL_STATS_CNT +}; + +enum { + ETHTOOL_A_STATS_GRP_UNSPEC, + ETHTOOL_A_STATS_GRP_PAD, + + ETHTOOL_A_STATS_GRP_ID, /* u32 */ + ETHTOOL_A_STATS_GRP_SS_ID, /* u32 */ + + ETHTOOL_A_STATS_GRP_STAT, /* nest */ + + /* add new constants above here */ + __ETHTOOL_A_STATS_GRP_CNT, + ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_CNT - 1) +}; + +enum { + /* 30.3.2.1.5 aSymbolErrorDuringCarrier */ + ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, + + /* add new constants above here */ + __ETHTOOL_A_STATS_ETH_PHY_CNT, + ETHTOOL_A_STATS_ETH_PHY_MAX = (__ETHTOOL_A_STATS_ETH_PHY_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 83842685fd8c..723c9a8a8cdf 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o + tunnels.o fec.o eeprom.o stats.o diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 5f5d7c4b3d4a..290012d0d11d 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -247,6 +247,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_FEC_GET] = ðnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, + [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -942,6 +943,15 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_module_eeprom_get_policy, .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_STATS_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_stats_get_policy, + .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 4305ac971bb0..9d88983b6597 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -346,6 +346,7 @@ extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; +extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -380,6 +381,7 @@ extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INF extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1]; +extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -399,4 +401,7 @@ int ethnl_tunnel_info_start(struct netlink_callback *cb); int 
ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); +extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; +extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; + #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c new file mode 100644 index 000000000000..fd8f47178c06 --- /dev/null +++ b/net/ethtool/stats.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct stats_req_info { + struct ethnl_req_info base; + DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT); +}; + +#define STATS_REQINFO(__req_base) \ + container_of(__req_base, struct stats_req_info, base) + +struct stats_reply_data { + struct ethnl_reply_data base; + struct ethtool_eth_phy_stats phy_stats; +}; + +#define STATS_REPDATA(__reply_base) \ + container_of(__reply_base, struct stats_reply_data, base) + +const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_STATS_ETH_PHY] = "eth-phy", +}; + +const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR] = "SymbolErrorDuringCarrier", +}; + +const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { + [ETHTOOL_A_STATS_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_STATS_GROUPS] = { .type = NLA_NESTED }, +}; + +static int stats_parse_request(struct ethnl_req_info *req_base, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct stats_req_info *req_info = STATS_REQINFO(req_base); + bool mod = false; + int err; + + err = ethnl_update_bitset(req_info->stat_mask, __ETHTOOL_STATS_CNT, + tb[ETHTOOL_A_STATS_GROUPS], stats_std_names, + extack, &mod); + if (err) + return err; + + if (!mod) { + NL_SET_ERR_MSG(extack, "no stats requested"); + return -EINVAL; + } + + return 0; +} + +static int stats_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + const struct stats_req_info *req_info = STATS_REQINFO(req_base); + struct stats_reply_data *data = STATS_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + memset(&data->phy_stats, 0xff, sizeof(data->phy_stats)); + + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && + dev->ethtool_ops->get_eth_phy_stats) + dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); + + ethnl_ops_complete(dev); + return 0; +} + +static int stats_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct stats_req_info *req_info = STATS_REQINFO(req_base); + unsigned int n_grps = 0, n_stats = 0; + int len = 0; + + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64); + n_grps++; + } + + len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ + nla_total_size(4) + /* _A_STATS_GRP_ID */ + nla_total_size(4)); /* _A_STATS_GRP_SS_ID */ + len += n_stats * (nla_total_size(0) + /* _A_STATS_GRP_STAT */ + nla_total_size_64bit(sizeof(u64))); + + return len; +} + +static int stat_put(struct sk_buff *skb, u16 attrtype, u64 val) +{ + struct nlattr *nest; + int ret; + + if (val == ETHTOOL_STAT_NOT_SET) + return 0; + + /* We want to start stats attr types from 0, so we don't have a type + * for pad inside 
ETHTOOL_A_STATS_GRP_STAT. Pad things on the outside + * of ETHTOOL_A_STATS_GRP_STAT. Since we're one nest away from the + * actual attr we're 4B off - nla_need_padding_for_64bit() & co. + * can't be used. + */ +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + if (!IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) + if (!nla_reserve(skb, ETHTOOL_A_STATS_GRP_PAD, 0)) + return -EMSGSIZE; +#endif + + nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP_STAT); + if (!nest) + return -EMSGSIZE; + + ret = nla_put_u64_64bit(skb, attrtype, val, -1 /* not used */); + if (ret) { + nla_nest_cancel(skb, nest); + return ret; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int stats_put_phy_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, + data->phy_stats.SymbolErrorDuringCarrier)) + return -EMSGSIZE; + return 0; +} + +static int stats_put_stats(struct sk_buff *skb, + const struct stats_reply_data *data, + u32 id, u32 ss_id, + int (*cb)(struct sk_buff *skb, + const struct stats_reply_data *data)) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_ID, id) || + nla_put_u32(skb, ETHTOOL_A_STATS_GRP_SS_ID, ss_id)) + goto err_cancel; + + if (cb(skb, data)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int stats_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct stats_req_info *req_info = STATS_REQINFO(req_base); + const struct stats_reply_data *data = STATS_REPDATA(reply_base); + int ret = 0; + + if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY, + ETH_SS_STATS_ETH_PHY, + stats_put_phy_stats); + + return ret; +} + +const struct ethnl_request_ops ethnl_stats_request_ops = { + .request_cmd = ETHTOOL_MSG_STATS_GET, + .reply_cmd = ETHTOOL_MSG_STATS_GET_REPLY, + .hdr_attr = ETHTOOL_A_STATS_HEADER, + .req_info_size = sizeof(struct stats_req_info), + .reply_data_size = sizeof(struct stats_reply_data), + + .parse_request = stats_parse_request, + .prepare_data = stats_prepare_data, + .reply_size = stats_reply_size, + .fill_reply = stats_fill_reply, +}; diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index c3a5489964cd..5f3c73587ff4 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -80,6 +80,16 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, .strings = udp_tunnel_type_names, }, + [ETH_SS_STATS_STD] = { + .per_dev = false, + .count = __ETHTOOL_STATS_CNT, + .strings = stats_std_names, + }, + [ETH_SS_STATS_ETH_PHY] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_ETH_PHY_CNT, + .strings = stats_eth_phy_names, + }, }; struct strset_req_info { -- cgit v1.2.3 From ca2244547ec7505d1cf61d43f5e76e3ffd99cf77 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Apr 2021 12:27:40 -0700 Subject: ethtool: add interface to read standard MAC stats Most of the MAC statistics are included in struct rtnl_link_stats64, but some fields are aggregated. Besides it's good to expose these clearly hardware stats separately. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 31 +++++++++++++ include/uapi/linux/ethtool.h | 2 + include/uapi/linux/ethtool_netlink.h | 53 +++++++++++++++++++++ net/ethtool/netlink.h | 1 + net/ethtool/stats.c | 90 ++++++++++++++++++++++++++++++++++++ net/ethtool/strset.c | 5 ++ 6 files changed, 182 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 2d5455eedbf4..3c689a13e679 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -250,6 +250,34 @@ static inline void ethtool_stats_init(u64 *stats, unsigned int n) stats[n] = ETHTOOL_STAT_NOT_SET; } +/* Basic IEEE 802.3 MAC statistics (30.3.1.1.*), not otherwise exposed + * via a more targeted API. + */ +struct ethtool_eth_mac_stats { + u64 FramesTransmittedOK; + u64 SingleCollisionFrames; + u64 MultipleCollisionFrames; + u64 FramesReceivedOK; + u64 FrameCheckSequenceErrors; + u64 AlignmentErrors; + u64 OctetsTransmittedOK; + u64 FramesWithDeferredXmissions; + u64 LateCollisions; + u64 FramesAbortedDueToXSColls; + u64 FramesLostDueToIntMACXmitError; + u64 CarrierSenseErrors; + u64 OctetsReceivedOK; + u64 FramesLostDueToIntMACRcvError; + u64 MulticastFramesXmittedOK; + u64 BroadcastFramesXmittedOK; + u64 FramesWithExcessiveDeferral; + u64 MulticastFramesReceivedOK; + u64 BroadcastFramesReceivedOK; + u64 InRangeLengthErrors; + u64 OutOfRangeLengthField; + u64 FrameTooLongErrors; +}; + /* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed * via a more targeted API. */ @@ -495,6 +523,7 @@ struct ethtool_module_eeprom { * specified page. Returns a negative error code or the amount of bytes * read. * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. + * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. 
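To make the new driver-facing hook concrete, a rough sketch of a NIC driver wiring it up is shown below. This is illustrative only and not part of the patch; every foo_* identifier is hypothetical, and counters the hardware does not track are simply left at the ETHTOOL_STAT_NOT_SET value the stats core pre-fills.

static void foo_get_eth_mac_stats(struct net_device *dev,
				  struct ethtool_eth_mac_stats *mac_stats)
{
	struct foo_priv *priv = netdev_priv(dev);	/* hypothetical private data */

	/* report only what the hardware actually counts */
	mac_stats->FramesTransmittedOK      = foo_read_counter(priv, FOO_TX_OK);
	mac_stats->FramesReceivedOK         = foo_read_counter(priv, FOO_RX_OK);
	mac_stats->FrameCheckSequenceErrors = foo_read_counter(priv, FOO_FCS_ERR);
	/* everything else stays at ETHTOOL_STAT_NOT_SET and is skipped by the core */
}

static const struct ethtool_ops foo_ethtool_ops = {
	.get_eth_mac_stats	= foo_get_eth_mac_stats,
};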
Callers must @@ -607,6 +636,8 @@ struct ethtool_ops { struct netlink_ext_ack *extack); void (*get_eth_phy_stats)(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats); + void (*get_eth_mac_stats)(struct net_device *dev, + struct ethtool_eth_mac_stats *mac_stats); }; int ethtool_check_ops(const struct ethtool_ops *ops); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 190ae6e03918..c227376d811a 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -671,6 +671,7 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types * @ETH_SS_STATS_STD: standardized stats * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics + * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -693,6 +694,7 @@ enum ethtool_stringset { ETH_SS_UDP_TUNNEL_TYPES, ETH_SS_STATS_STD, ETH_SS_STATS_ETH_PHY, + ETH_SS_STATS_ETH_MAC, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index a54cfe625f34..f0fbe8f4eb1b 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -698,6 +698,7 @@ enum { enum { ETHTOOL_STATS_ETH_PHY, + ETHTOOL_STATS_ETH_MAC, /* add new constants above here */ __ETHTOOL_STATS_CNT @@ -726,6 +727,58 @@ enum { ETHTOOL_A_STATS_ETH_PHY_MAX = (__ETHTOOL_A_STATS_ETH_PHY_CNT - 1) }; +enum { + /* 30.3.1.1.2 aFramesTransmittedOK */ + ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT, + /* 30.3.1.1.3 aSingleCollisionFrames */ + ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL, + /* 30.3.1.1.4 aMultipleCollisionFrames */ + ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL, + /* 30.3.1.1.5 aFramesReceivedOK */ + ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT, + /* 30.3.1.1.6 aFrameCheckSequenceErrors */ + ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR, + /* 30.3.1.1.7 aAlignmentErrors */ + ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR, + /* 30.3.1.1.8 aOctetsTransmittedOK */ + ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES, + /* 30.3.1.1.9 aFramesWithDeferredXmissions */ + ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER, + /* 30.3.1.1.10 aLateCollisions */ + ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL, + /* 30.3.1.1.11 aFramesAbortedDueToXSColls */ + ETHTOOL_A_STATS_ETH_MAC_11_XS_COL, + /* 30.3.1.1.12 aFramesLostDueToIntMACXmitError */ + ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR, + /* 30.3.1.1.13 aCarrierSenseErrors */ + ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR, + /* 30.3.1.1.14 aOctetsReceivedOK */ + ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES, + /* 30.3.1.1.15 aFramesLostDueToIntMACRcvError */ + ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR, + + /* 30.3.1.1.18 aMulticastFramesXmittedOK */ + ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST, + /* 30.3.1.1.19 aBroadcastFramesXmittedOK */ + ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST, + /* 30.3.1.1.20 aFramesWithExcessiveDeferral */ + ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER, + /* 30.3.1.1.21 aMulticastFramesReceivedOK */ + ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST, + /* 30.3.1.1.22 aBroadcastFramesReceivedOK */ + ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST, + /* 30.3.1.1.23 aInRangeLengthErrors */ + ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR, + /* 30.3.1.1.24 aOutOfRangeLengthField */ + ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN, + /* 30.3.1.1.25 aFrameTooLongErrors */ + ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR, + + /* add new constants above here */ + __ETHTOOL_A_STATS_ETH_MAC_CNT, + ETHTOOL_A_STATS_ETH_MAC_MAX = (__ETHTOOL_A_STATS_ETH_MAC_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git 
a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9d88983b6597..c70bac5329af 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -403,5 +403,6 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; +extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index fd8f47178c06..e80175872226 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -15,6 +15,7 @@ struct stats_req_info { struct stats_reply_data { struct ethnl_reply_data base; struct ethtool_eth_phy_stats phy_stats; + struct ethtool_eth_mac_stats mac_stats; }; #define STATS_REPDATA(__reply_base) \ @@ -22,12 +23,38 @@ struct stats_reply_data { const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_PHY] = "eth-phy", + [ETHTOOL_STATS_ETH_MAC] = "eth-mac", }; const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR] = "SymbolErrorDuringCarrier", }; +const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT] = "FramesTransmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL] = "SingleCollisionFrames", + [ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL] = "MultipleCollisionFrames", + [ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT] = "FramesReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR] = "FrameCheckSequenceErrors", + [ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR] = "AlignmentErrors", + [ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES] = "OctetsTransmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER] = "FramesWithDeferredXmissions", + [ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL] = "LateCollisions", + [ETHTOOL_A_STATS_ETH_MAC_11_XS_COL] = "FramesAbortedDueToXSColls", + [ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR] = "FramesLostDueToIntMACXmitError", + [ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR] = "CarrierSenseErrors", + [ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES] = "OctetsReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR] = "FramesLostDueToIntMACRcvError", + [ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST] = "MulticastFramesXmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST] = "BroadcastFramesXmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER] = "FramesWithExcessiveDeferral", + [ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST] = "MulticastFramesReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST] = "BroadcastFramesReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR] = "InRangeLengthErrors", + [ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN] = "OutOfRangeLengthField", + [ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR] = "FrameTooLongErrors", +}; + const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -70,10 +97,14 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, return ret; memset(&data->phy_stats, 0xff, sizeof(data->phy_stats)); + memset(&data->mac_stats, 0xff, sizeof(data->mac_stats)); if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); + if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) && + dev->ethtool_ops->get_eth_mac_stats) + dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats); ethnl_ops_complete(dev); return 0; @@ -90,6 +121,10 @@ static int 
stats_reply_size(const struct ethnl_req_info *req_base, n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64); n_grps++; } + if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64); + n_grps++; + } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ @@ -143,6 +178,57 @@ static int stats_put_phy_stats(struct sk_buff *skb, return 0; } +static int stats_put_mac_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT, + data->mac_stats.FramesTransmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL, + data->mac_stats.SingleCollisionFrames) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL, + data->mac_stats.MultipleCollisionFrames) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT, + data->mac_stats.FramesReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR, + data->mac_stats.FrameCheckSequenceErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR, + data->mac_stats.AlignmentErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES, + data->mac_stats.OctetsTransmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER, + data->mac_stats.FramesWithDeferredXmissions) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL, + data->mac_stats.LateCollisions) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_11_XS_COL, + data->mac_stats.FramesAbortedDueToXSColls) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR, + data->mac_stats.FramesLostDueToIntMACXmitError) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR, + data->mac_stats.CarrierSenseErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES, + data->mac_stats.OctetsReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR, + data->mac_stats.FramesLostDueToIntMACRcvError) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST, + data->mac_stats.MulticastFramesXmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST, + data->mac_stats.BroadcastFramesXmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER, + data->mac_stats.FramesWithExcessiveDeferral) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST, + data->mac_stats.MulticastFramesReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST, + data->mac_stats.BroadcastFramesReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR, + data->mac_stats.InRangeLengthErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN, + data->mac_stats.OutOfRangeLengthField) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR, + data->mac_stats.FrameTooLongErrors)) + return -EMSGSIZE; + return 0; +} + static int stats_put_stats(struct sk_buff *skb, const struct stats_reply_data *data, u32 id, u32 ss_id, @@ -182,6 +268,10 @@ static int stats_fill_reply(struct sk_buff *skb, ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY, ETH_SS_STATS_ETH_PHY, stats_put_phy_stats); + if (!ret && test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC, + ETH_SS_STATS_ETH_MAC, + stats_put_mac_stats); return ret; } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 5f3c73587ff4..a8aac7bcfcc9 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -90,6 +90,11 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_A_STATS_ETH_PHY_CNT, .strings = stats_eth_phy_names, }, + [ETH_SS_STATS_ETH_MAC] = { + .per_dev = false, + .count 
= __ETHTOOL_A_STATS_ETH_MAC_CNT, + .strings = stats_eth_mac_names, + }, }; struct strset_req_info { -- cgit v1.2.3 From bfad2b979ddcc330c08bb071eb3c3f7b3411a681 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Apr 2021 12:27:41 -0700 Subject: ethtool: add interface to read standard MAC Ctrl stats Number of devices maintains the standard-based MAC control counters for control frames. Add a API for those. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ethtool.h | 12 ++++++++++++ include/uapi/linux/ethtool.h | 2 ++ include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++ net/ethtool/netlink.h | 1 + net/ethtool/stats.c | 33 +++++++++++++++++++++++++++++++++ net/ethtool/strset.c | 5 +++++ 6 files changed, 67 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 3c689a13e679..1ca6b836f9fe 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -285,6 +285,15 @@ struct ethtool_eth_phy_stats { u64 SymbolErrorDuringCarrier; }; +/* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed + * via a more targeted API. + */ +struct ethtool_eth_ctrl_stats { + u64 MACControlFramesTransmitted; + u64 MACControlFramesReceived; + u64 UnsupportedOpcodesReceived; +}; + /** * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames * @tx_pause_frames: transmitted pause frame count. Reported to user space @@ -524,6 +533,7 @@ struct ethtool_module_eeprom { * read. * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics. + * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. 
Callers must @@ -638,6 +648,8 @@ struct ethtool_ops { struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats); + void (*get_eth_ctrl_stats)(struct net_device *dev, + struct ethtool_eth_ctrl_stats *ctrl_stats); }; int ethtool_check_ops(const struct ethtool_ops *ops); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index c227376d811a..9cb8df89d4f2 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -672,6 +672,7 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_STATS_STD: standardized stats * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics + * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -695,6 +696,7 @@ enum ethtool_stringset { ETH_SS_STATS_STD, ETH_SS_STATS_ETH_PHY, ETH_SS_STATS_ETH_MAC, + ETH_SS_STATS_ETH_CTRL, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index f0fbe8f4eb1b..2ea5f049df6a 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -699,6 +699,7 @@ enum { enum { ETHTOOL_STATS_ETH_PHY, ETHTOOL_STATS_ETH_MAC, + ETHTOOL_STATS_ETH_CTRL, /* add new constants above here */ __ETHTOOL_STATS_CNT @@ -779,6 +780,19 @@ enum { ETHTOOL_A_STATS_ETH_MAC_MAX = (__ETHTOOL_A_STATS_ETH_MAC_CNT - 1) }; +enum { + /* 30.3.3.3 aMACControlFramesTransmitted */ + ETHTOOL_A_STATS_ETH_CTRL_3_TX, + /* 30.3.3.4 aMACControlFramesReceived */ + ETHTOOL_A_STATS_ETH_CTRL_4_RX, + /* 30.3.3.5 aUnsupportedOpcodesReceived */ + ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP, + + /* add new constants above here */ + __ETHTOOL_A_STATS_ETH_CTRL_CNT, + ETHTOOL_A_STATS_ETH_CTRL_MAX = (__ETHTOOL_A_STATS_ETH_CTRL_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index c70bac5329af..febfa61e52e2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -404,5 +404,6 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; +extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index e80175872226..f4fded66731c 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -16,6 +16,7 @@ struct stats_reply_data { struct ethnl_reply_data base; struct ethtool_eth_phy_stats phy_stats; struct ethtool_eth_mac_stats mac_stats; + struct ethtool_eth_ctrl_stats ctrl_stats; }; #define STATS_REPDATA(__reply_base) \ @@ -24,6 +25,7 @@ struct stats_reply_data { const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_PHY] = "eth-phy", [ETHTOOL_STATS_ETH_MAC] = "eth-mac", + [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl", }; const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { @@ -55,6 +57,12 @@ const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = [ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR] = "FrameTooLongErrors", }; +const char 
stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_ETH_CTRL_3_TX] = "MACControlFramesTransmitted", + [ETHTOOL_A_STATS_ETH_CTRL_4_RX] = "MACControlFramesReceived", + [ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP] = "UnsupportedOpcodesReceived", +}; + const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -98,6 +106,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, memset(&data->phy_stats, 0xff, sizeof(data->phy_stats)); memset(&data->mac_stats, 0xff, sizeof(data->mac_stats)); + memset(&data->ctrl_stats, 0xff, sizeof(data->mac_stats)); if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) @@ -105,6 +114,9 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) && dev->ethtool_ops->get_eth_mac_stats) dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats); + if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) && + dev->ethtool_ops->get_eth_ctrl_stats) + dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats); ethnl_ops_complete(dev); return 0; @@ -125,6 +137,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base, n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64); n_grps++; } + if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64); + n_grps++; + } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ @@ -229,6 +245,19 @@ static int stats_put_mac_stats(struct sk_buff *skb, return 0; } +static int stats_put_ctrl_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_3_TX, + data->ctrl_stats.MACControlFramesTransmitted) || + stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_4_RX, + data->ctrl_stats.MACControlFramesReceived) || + stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP, + data->ctrl_stats.UnsupportedOpcodesReceived)) + return -EMSGSIZE; + return 0; +} + static int stats_put_stats(struct sk_buff *skb, const struct stats_reply_data *data, u32 id, u32 ss_id, @@ -272,6 +301,10 @@ static int stats_fill_reply(struct sk_buff *skb, ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC, ETH_SS_STATS_ETH_MAC, stats_put_mac_stats); + if (!ret && test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL, + ETH_SS_STATS_ETH_CTRL, + stats_put_ctrl_stats); return ret; } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index a8aac7bcfcc9..a33c603a7a02 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -95,6 +95,11 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_A_STATS_ETH_MAC_CNT, .strings = stats_eth_mac_names, }, + [ETH_SS_STATS_ETH_CTRL] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_ETH_CTRL_CNT, + .strings = stats_eth_ctrl_names, + }, }; struct strset_req_info { -- cgit v1.2.3 From a8b06e9d40d8b18c41c8ce060e8dc004fa59e708 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Apr 2021 12:27:42 -0700 Subject: ethtool: add interface to read RMON stats Most devices maintain RMON (RFC 2819) stats - particularly the "histogram" of packets received by size. Unlike other RFCs which duplicate IEEE stats, the short/oversized frame counters in RMON don't seem to match IEEE stats 1-to-1 either, so expose those, too. 
Do not expose basic packet, CRC errors etc - those are already otherwise covered. Because standard defines packet ranges only up to 1518, and everything above that should theoretically be "oversized" - devices often create their own ranges. Going beyond what the RFC defines - expose the "histogram" in the Tx direction (assume for now that the ranges will be the same). Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ethtool.h | 43 ++++++++++++++++++ include/uapi/linux/ethtool.h | 2 + include/uapi/linux/ethtool_netlink.h | 23 ++++++++++ net/ethtool/netlink.h | 1 + net/ethtool/stats.c | 87 ++++++++++++++++++++++++++++++++++++ net/ethtool/strset.c | 5 +++ 6 files changed, 161 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 1ca6b836f9fe..e030f7510cd3 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -346,6 +346,44 @@ struct ethtool_fec_stats { } corrected_blocks, uncorrectable_blocks, corrected_bits; }; +/** + * struct ethtool_rmon_hist_range - byte range for histogram statistics + * @low: low bound of the bucket (inclusive) + * @high: high bound of the bucket (inclusive) + */ +struct ethtool_rmon_hist_range { + u16 low; + u16 high; +}; + +#define ETHTOOL_RMON_HIST_MAX 10 + +/** + * struct ethtool_rmon_stats - selected RMON (RFC 2819) statistics + * @undersize_pkts: Equivalent to `etherStatsUndersizePkts` from the RFC. + * @oversize_pkts: Equivalent to `etherStatsOversizePkts` from the RFC. + * @fragments: Equivalent to `etherStatsFragments` from the RFC. + * @jabbers: Equivalent to `etherStatsJabbers` from the RFC. + * @hist: Packet counter for packet length buckets (e.g. + * `etherStatsPkts128to255Octets` from the RFC). + * @hist_tx: Tx counters in similar form to @hist, not defined in the RFC. + * + * Selection of RMON (RFC 2819) statistics which are not exposed via different + * APIs, primarily the packet-length-based counters. + * Unfortunately different designs choose different buckets beyond + * the 1024B mark (jumbo frame teritory), so the definition of the bucket + * ranges is left to the driver. + */ +struct ethtool_rmon_stats { + u64 undersize_pkts; + u64 oversize_pkts; + u64 fragments; + u64 jabbers; + + u64 hist[ETHTOOL_RMON_HIST_MAX]; + u64 hist_tx[ETHTOOL_RMON_HIST_MAX]; +}; + #define ETH_MODULE_EEPROM_PAGE_LEN 128 #define ETH_MODULE_MAX_I2C_ADDRESS 0x7f @@ -534,6 +572,8 @@ struct ethtool_module_eeprom { * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics. * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics. + * @get_rmon_stats: Query some of the RMON (RFC 2819) statistics. + * Set %ranges to a pointer to zero-terminated array of byte ranges. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. 
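The zero-terminated *ranges* contract is easiest to see in a sketch. The following is illustrative only (all foo_* names are hypothetical): a driver exposing four RX buckets, with the trailing all-zero entry marking the end of its histogram.

static const struct ethtool_rmon_hist_range foo_rmon_ranges[] = {
	{    0,   64 },
	{   65,  511 },
	{  512, 1023 },
	{ 1024, 1518 },
	{ },					/* zero-terminated, as required */
};

static void foo_get_rmon_stats(struct net_device *dev,
			       struct ethtool_rmon_stats *rmon_stats,
			       const struct ethtool_rmon_hist_range **ranges)
{
	struct foo_priv *priv = netdev_priv(dev);	/* hypothetical */
	int i;

	*ranges = foo_rmon_ranges;
	rmon_stats->undersize_pkts = foo_read_counter(priv, FOO_RX_UNDERSIZE);
	rmon_stats->jabbers        = foo_read_counter(priv, FOO_RX_JABBER);
	for (i = 0; i < 4; i++)			/* one u64 per bucket above */
		rmon_stats->hist[i] = foo_read_counter(priv, FOO_RX_HIST_BASE + i);
}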
Callers must @@ -650,6 +690,9 @@ struct ethtool_ops { struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats); + void (*get_rmon_stats)(struct net_device *dev, + struct ethtool_rmon_stats *rmon_stats, + const struct ethtool_rmon_hist_range **ranges); }; int ethtool_check_ops(const struct ethtool_ops *ops); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9cb8df89d4f2..cfef6b08169a 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -673,6 +673,7 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics + * @ETH_SS_STATS_RMON: names of RMON statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -697,6 +698,7 @@ enum ethtool_stringset { ETH_SS_STATS_ETH_PHY, ETH_SS_STATS_ETH_MAC, ETH_SS_STATS_ETH_CTRL, + ETH_SS_STATS_RMON, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 2ea5f049df6a..825cfda1c5d5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -700,6 +700,7 @@ enum { ETHTOOL_STATS_ETH_PHY, ETHTOOL_STATS_ETH_MAC, ETHTOOL_STATS_ETH_CTRL, + ETHTOOL_STATS_RMON, /* add new constants above here */ __ETHTOOL_STATS_CNT @@ -714,6 +715,13 @@ enum { ETHTOOL_A_STATS_GRP_STAT, /* nest */ + ETHTOOL_A_STATS_GRP_HIST_RX, /* nest */ + ETHTOOL_A_STATS_GRP_HIST_TX, /* nest */ + + ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, /* u32 */ + ETHTOOL_A_STATS_GRP_HIST_BKT_HI, /* u32 */ + ETHTOOL_A_STATS_GRP_HIST_VAL, /* u64 */ + /* add new constants above here */ __ETHTOOL_A_STATS_GRP_CNT, ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_CNT - 1) @@ -793,6 +801,21 @@ enum { ETHTOOL_A_STATS_ETH_CTRL_MAX = (__ETHTOOL_A_STATS_ETH_CTRL_CNT - 1) }; +enum { + /* etherStatsUndersizePkts */ + ETHTOOL_A_STATS_RMON_UNDERSIZE, + /* etherStatsOversizePkts */ + ETHTOOL_A_STATS_RMON_OVERSIZE, + /* etherStatsFragments */ + ETHTOOL_A_STATS_RMON_FRAG, + /* etherStatsJabbers */ + ETHTOOL_A_STATS_RMON_JABBER, + + /* add new constants above here */ + __ETHTOOL_A_STATS_RMON_CNT, + ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index febfa61e52e2..bed3afdf3656 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -405,5 +405,6 @@ extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; +extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index f4fded66731c..acb2b080c358 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -17,6 +17,8 @@ struct stats_reply_data { struct ethtool_eth_phy_stats phy_stats; struct ethtool_eth_mac_stats mac_stats; struct ethtool_eth_ctrl_stats ctrl_stats; + struct ethtool_rmon_stats rmon_stats; + const struct ethtool_rmon_hist_range *rmon_ranges; }; #define STATS_REPDATA(__reply_base) 
\ @@ -26,6 +28,7 @@ const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_PHY] = "eth-phy", [ETHTOOL_STATS_ETH_MAC] = "eth-mac", [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl", + [ETHTOOL_STATS_RMON] = "rmon", }; const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { @@ -63,6 +66,13 @@ const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] [ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP] = "UnsupportedOpcodesReceived", }; +const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_RMON_UNDERSIZE] = "etherStatsUndersizePkts", + [ETHTOOL_A_STATS_RMON_OVERSIZE] = "etherStatsOversizePkts", + [ETHTOOL_A_STATS_RMON_FRAG] = "etherStatsFragments", + [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers", +}; + const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -107,6 +117,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, memset(&data->phy_stats, 0xff, sizeof(data->phy_stats)); memset(&data->mac_stats, 0xff, sizeof(data->mac_stats)); memset(&data->ctrl_stats, 0xff, sizeof(data->mac_stats)); + memset(&data->rmon_stats, 0xff, sizeof(data->rmon_stats)); if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) @@ -117,6 +128,10 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) && dev->ethtool_ops->get_eth_ctrl_stats) dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats); + if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask) && + dev->ethtool_ops->get_rmon_stats) + dev->ethtool_ops->get_rmon_stats(dev, &data->rmon_stats, + &data->rmon_ranges); ethnl_ops_complete(dev); return 0; @@ -141,6 +156,16 @@ static int stats_reply_size(const struct ethnl_req_info *req_base, n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64); n_grps++; } + if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_rmon_stats) / sizeof(u64); + n_grps++; + /* Above includes the space for _A_STATS_GRP_HIST_VALs */ + + len += (nla_total_size(0) + /* _A_STATS_GRP_HIST */ + nla_total_size(4) + /* _A_STATS_GRP_HIST_BKT_LOW */ + nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */ + ETHTOOL_RMON_HIST_MAX * 2; + } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ @@ -258,6 +283,65 @@ static int stats_put_ctrl_stats(struct sk_buff *skb, return 0; } +static int stats_put_rmon_hist(struct sk_buff *skb, u32 attr, const u64 *hist, + const struct ethtool_rmon_hist_range *ranges) +{ + struct nlattr *nest; + int i; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_RMON_HIST_MAX; i++) { + if (!ranges[i].low && !ranges[i].high) + break; + if (hist[i] == ETHTOOL_STAT_NOT_SET) + continue; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_HI, + ranges[i].high) || + nla_put_u64_64bit(skb, ETHTOOL_A_STATS_GRP_HIST_VAL, + hist[i], ETHTOOL_A_STATS_GRP_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int stats_put_rmon_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_RX, + 
data->rmon_stats.hist, data->rmon_ranges) || + stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_TX, + data->rmon_stats.hist_tx, data->rmon_ranges)) + return -EMSGSIZE; + + if (stat_put(skb, ETHTOOL_A_STATS_RMON_UNDERSIZE, + data->rmon_stats.undersize_pkts) || + stat_put(skb, ETHTOOL_A_STATS_RMON_OVERSIZE, + data->rmon_stats.oversize_pkts) || + stat_put(skb, ETHTOOL_A_STATS_RMON_FRAG, + data->rmon_stats.fragments) || + stat_put(skb, ETHTOOL_A_STATS_RMON_JABBER, + data->rmon_stats.jabbers)) + return -EMSGSIZE; + + return 0; +} + static int stats_put_stats(struct sk_buff *skb, const struct stats_reply_data *data, u32 id, u32 ss_id, @@ -305,6 +389,9 @@ static int stats_fill_reply(struct sk_buff *skb, ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL, ETH_SS_STATS_ETH_CTRL, stats_put_ctrl_stats); + if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON, + ETH_SS_STATS_RMON, stats_put_rmon_stats); return ret; } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index a33c603a7a02..b3029fff715d 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -100,6 +100,11 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_A_STATS_ETH_CTRL_CNT, .strings = stats_eth_ctrl_names, }, + [ETH_SS_STATS_RMON] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_RMON_CNT, + .strings = stats_rmon_names, + }, }; struct strset_req_info { -- cgit v1.2.3 From 73807523f9a6612106582ab19217f280ed128f24 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 9 Apr 2021 12:40:25 +0300 Subject: nl80211/cfg80211: add a flag to negotiate for LMR feedback in NDP ranging Add a flag that indicates that the ISTA shall indicate support for LMR feedback in NDP ranging negotiation. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210409123755.eff546283504.I2606161e700ac24d94d0b50c8edcdedd4c0395c2@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 ++++- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 1 + net/wireless/pmsr.c | 12 +++++++++++- 4 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 73b17ea89248..5224f885a99a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3521,6 +3521,8 @@ struct cfg80211_pmsr_result { * @non_trigger_based: use non trigger based ranging for the measurement * If neither @trigger_based nor @non_trigger_based is set, * EDCA based ranging will be used. + * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either + * @trigger_based or @non_trigger_based is set. * * See also nl80211 for the respective attribute documentation. */ @@ -3532,7 +3534,8 @@ struct cfg80211_pmsr_ftm_request_peer { request_lci:1, request_civicloc:1, trigger_based:1, - non_trigger_based:1; + non_trigger_based:1, + lmr_feedback:1; u8 num_bursts_exp; u8 burst_duration; u8 ftms_per_burst; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 18dfe744bcb5..00f696d177e6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6896,6 +6896,9 @@ enum nl80211_peer_measurement_ftm_capa { * if neither %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED nor * %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set, EDCA based * ranging will be used. + * @NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK: negotiate for LMR feedback. 
Only + * valid if either %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED or + * %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set. * * @NUM_NL80211_PMSR_FTM_REQ_ATTR: internal * @NL80211_PMSR_FTM_REQ_ATTR_MAX: highest attribute number @@ -6914,6 +6917,7 @@ enum nl80211_peer_measurement_ftm_req { NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC, NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED, NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED, + NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK, /* keep last */ NUM_NL80211_PMSR_FTM_REQ_ATTR, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index adfa07c67b44..aad19348bb46 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -309,6 +309,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, + [NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG }, }; static const struct nla_policy diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index a95c79d18349..6bdd96408022 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation */ #ifndef __PMSR_H #define __PMSR_H @@ -158,6 +158,16 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } + out->ftm.lmr_feedback = + !!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK]; + if (!out->ftm.trigger_based && !out->ftm.non_trigger_based && + out->ftm.lmr_feedback) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK], + "FTM: LMR feedback set for EDCA based ranging"); + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From f12ce9f607ffa5c617cd86cb7a7a0aaefe58f127 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Fri, 9 Apr 2021 12:40:15 +0300 Subject: nl80211: Add new RSNXE related nl80211 extended features Draft P802.11ax_D2.5 defines the following capabilities that can be negotiated using RSNXE capabilities: - Secure LTF measurement exchange protocol. - Secure RTT measurement exchange protocol. - Management frame protection for all management frames exchanged during the negotiation and range measurement procedure. Extend the nl80211 API to allow drivers to declare support for these new capabilities as part of extended feature. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210409123755.8280e31d8091.Ifcb29f84f432290338f80c8378aa5c9e0a390c93@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 00f696d177e6..f962c06e9818 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5940,6 +5940,16 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_BEACON_RATE_HE: Driver supports beacon rate * configuration (AP/mesh) with HE rates. * + * @NL80211_EXT_FEATURE_SECURE_LTF: Device supports secure LTF measurement + * exchange protocol. + * + * @NL80211_EXT_FEATURE_SECURE_RTT: Device supports secure RTT measurement + * exchange protocol. + * + * @NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE: Device supports management + * frame protection for all management frames exchanged during the + * negotiation and range measurement procedure. 
+ * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6001,6 +6011,9 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_FILS_DISCOVERY, NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP, NL80211_EXT_FEATURE_BEACON_RATE_HE, + NL80211_EXT_FEATURE_SECURE_LTF, + NL80211_EXT_FEATURE_SECURE_RTT, + NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 7b15523a989b63927c2bb08e9b5b0bbc10b58bef Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:40 +0200 Subject: bpf: Add a bpf_snprintf helper The implementation takes inspiration from the existing bpf_trace_printk helper but there are a few differences: To allow for a large number of format-specifiers, parameters are provided in an array, like in bpf_seq_printf. Because the output string takes two arguments and the array of parameters also takes two arguments, the format string needs to fit in one argument. Thankfully, ARG_PTR_TO_CONST_STR is guaranteed to point to a zero-terminated read-only map so we don't need a format string length arg. Because the format-string is known at verification time, we also do a first pass of format string validation in the verifier logic. This makes debugging easier. Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210419155243.1632274-4-revest@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++ kernel/bpf/helpers.c | 50 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 41 ++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++ 6 files changed, 150 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c160526fc8bf..f8a45f109e96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1953,6 +1953,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df164a44bb41..ec6d85a81744 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4708,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. 
Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4875,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9ca57eb1fc0d..85b26ca5aacd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -925,6 +925,54 @@ out: return err; } +#define MAX_SNPRINTF_VARARGS 12 + +BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, + const void *, data, u32, data_len) +{ + enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS]; + u64 args[MAX_SNPRINTF_VARARGS]; + int err, num_args; + + if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; + + /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we + * can safely give an unbounded size. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args); + if (err < 0) + return err; + + /* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give + * all of them to snprintf(). + */ + err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); + + bpf_printf_cleanup(); + + return err + 1; +} + +const struct bpf_func_proto bpf_snprintf_proto = { + .func = bpf_snprintf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_CONST_STR, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -1013,6 +1061,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f46dd6f3383..994ef36c5f60 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5918,6 +5918,41 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? 
-EINVAL : 0; } +static int check_bpf_snprintf_call(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + struct bpf_reg_state *fmt_reg = ®s[BPF_REG_3]; + struct bpf_reg_state *data_len_reg = ®s[BPF_REG_5]; + struct bpf_map *fmt_map = fmt_reg->map_ptr; + int err, fmt_map_off, num_args; + u64 fmt_addr; + char *fmt; + + /* data must be an array of u64 */ + if (data_len_reg->var_off.value % 8) + return -EINVAL; + num_args = data_len_reg->var_off.value / 8; + + /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const + * and map_direct_value_addr is set. + */ + fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, + fmt_map_off); + if (err) + return err; + fmt = (char *)(long)fmt_addr + fmt_map_off; + + /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we + * can focus on validating the format specifiers. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args); + if (err < 0) + verbose(env, "Invalid format string\n"); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -6032,6 +6067,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_snprintf) { + err = check_bpf_snprintf_call(env, regs); + if (err < 0) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a13f8644b357..2a8bcdc927c7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1076,6 +1076,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_delete_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index df164a44bb41..ec6d85a81744 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4708,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. 
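A minimal usage sketch from the BPF program side may help here; it is not part of the patch, and the program name and section are arbitrary. The format string is a const global so it lands in the frozen read-only .rodata map (satisfying ARG_PTR_TO_CONST_STR), and data_len is passed in bytes, one u64 slot per format specifier.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

const char fmt[] = "execve: pid %d uid %u\n";	/* placed in the read-only .rodata map */

SEC("tracepoint/syscalls/sys_enter_execve")
int log_execve(void *ctx)
{
	__u64 args[2];		/* one u64 slot per format specifier */
	char out[64];

	args[0] = bpf_get_current_pid_tgid() >> 32;
	args[1] = (__u32)bpf_get_current_uid_gid();

	/* data_len is in bytes and must be a multiple of 8 */
	bpf_snprintf(out, sizeof(out), fmt, args, sizeof(args));
	/* 'out' could now be pushed to user space, e.g. through a ring buffer */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";	/* the helper is GPL-only */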
+ * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4875,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From e0bb96db96f8ca94349344a2ea7bebc6f8cefdae Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 21 Apr 2021 01:12:44 +0200 Subject: netfilter: nft_socket: add support for cgroupsv2 Allow to match on the cgroupsv2 id from ancestor level. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 +++ net/netfilter/nft_socket.c | 48 +++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 79bab7a36b30..467365ed59a7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1014,11 +1014,13 @@ enum nft_rt_attributes { * * @NFTA_SOCKET_KEY: socket key to match * @NFTA_SOCKET_DREG: destination register + * @NFTA_SOCKET_LEVEL: cgroups2 ancestor level (only for cgroupsv2) */ enum nft_socket_attributes { NFTA_SOCKET_UNSPEC, NFTA_SOCKET_KEY, NFTA_SOCKET_DREG, + NFTA_SOCKET_LEVEL, __NFTA_SOCKET_MAX }; #define NFTA_SOCKET_MAX (__NFTA_SOCKET_MAX - 1) @@ -1029,11 +1031,13 @@ enum nft_socket_attributes { * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option * @NFT_SOCKET_MARK: Value of the socket mark * @NFT_SOCKET_WILDCARD: Whether the socket is zero-bound (e.g. 0.0.0.0 or ::0) + * @NFT_SOCKET_CGROUPV2: Match on cgroups version 2 */ enum nft_socket_keys { NFT_SOCKET_TRANSPARENT, NFT_SOCKET_MARK, NFT_SOCKET_WILDCARD, + NFT_SOCKET_CGROUPV2, __NFT_SOCKET_MAX }; #define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index c9b8a2b03b71..9c169d100651 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -9,6 +9,7 @@ struct nft_socket { enum nft_socket_keys key:8; + u8 level; union { u8 dreg; }; @@ -33,6 +34,26 @@ static void nft_socket_wildcard(const struct nft_pktinfo *pkt, } } +#ifdef CONFIG_CGROUPS +static noinline bool +nft_sock_get_eval_cgroupv2(u32 *dest, const struct nft_pktinfo *pkt, u32 level) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + if (level > cgrp->level) + return false; + + memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64)); + + return true; +} +#endif + static void nft_socket_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -85,6 +106,14 @@ static void nft_socket_eval(const struct nft_expr *expr, } nft_socket_wildcard(pkt, regs, sk, dest); break; +#ifdef CONFIG_CGROUPS + case NFT_SOCKET_CGROUPV2: + if (!nft_sock_get_eval_cgroupv2(dest, pkt, priv->level)) { + regs->verdict.code = NFT_BREAK; + return; + } + break; +#endif default: WARN_ON(1); regs->verdict.code = NFT_BREAK; @@ -97,6 +126,7 @@ static void nft_socket_eval(const struct nft_expr *expr, static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, + [NFTA_SOCKET_LEVEL] = { .type = NLA_U32 }, }; static int nft_socket_init(const struct nft_ctx *ctx, @@ -104,7 +134,7 @@ 
static int nft_socket_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_socket *priv = nft_expr_priv(expr); - unsigned int len; + unsigned int len, level; if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY]) return -EINVAL; @@ -129,6 +159,19 @@ static int nft_socket_init(const struct nft_ctx *ctx, case NFT_SOCKET_MARK: len = sizeof(u32); break; +#ifdef CONFIG_CGROUPS + case NFT_SOCKET_CGROUPV2: + if (!tb[NFTA_SOCKET_LEVEL]) + return -EINVAL; + + level = ntohl(nla_get_u32(tb[NFTA_SOCKET_LEVEL])); + if (level > 255) + return -EOPNOTSUPP; + + priv->level = level; + len = sizeof(u64); + break; +#endif default: return -EOPNOTSUPP; } @@ -146,6 +189,9 @@ static int nft_socket_dump(struct sk_buff *skb, return -1; if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) return -1; + if (priv->key == NFT_SOCKET_CGROUPV2 && + nla_put_u32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level))) + return -1; return 0; } -- cgit v1.2.3 From 427f0c8c194b22edcafef1b0a42995ddc5c2227d Mon Sep 17 00:00:00 2001 From: Jethro Beekman Date: Sun, 25 Apr 2021 11:22:03 +0200 Subject: macvlan: Add nodst option to macvlan type source The default behavior for source MACVLAN is to duplicate packets to appropriate type source devices, and then do the normal destination MACVLAN flow. This patch adds an option to skip destination MACVLAN processing if any matching source MACVLAN device has the option set. This allows setting up a "catch all" device for source MACVLAN: create one or more devices with type source nodst, and one device with e.g. type vepa, and incoming traffic will be received on exactly one device. v2: netdev wants non-standard line length Signed-off-by: Jethro Beekman Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 19 ++++++++++++++----- include/uapi/linux/if_link.h | 1 + 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 9a9a5cf36a4b..7427b989607e 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -423,18 +423,24 @@ static void macvlan_forward_source_one(struct sk_buff *skb, macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } -static void macvlan_forward_source(struct sk_buff *skb, +static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; + bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { - if (ether_addr_equal_64bits(entry->addr, addr)) + if (ether_addr_equal_64bits(entry->addr, addr)) { + if (entry->vlan->flags & MACVLAN_FLAG_NODST) + consume = true; macvlan_forward_source_one(skb, entry->vlan); + } } + + return consume; } /* called under rcu_read_lock() from netif_receive_skb */ @@ -463,7 +469,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); - macvlan_forward_source(skb, port, eth->h_source); + if (macvlan_forward_source(skb, port, eth->h_source)) + return RX_HANDLER_CONSUMED; src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { @@ -482,7 +489,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; } - macvlan_forward_source(skb, port, eth->h_source); + if (macvlan_forward_source(skb, port, eth->h_source)) + return RX_HANDLER_CONSUMED; if 
(macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); @@ -1286,7 +1294,8 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], return 0; if (data[IFLA_MACVLAN_FLAGS] && - nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~MACVLAN_FLAG_NOPROMISC) + nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | + MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 91c8dda6d95d..cd5b382a4138 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -614,6 +614,7 @@ enum macvlan_macaddr_mode { }; #define MACVLAN_FLAG_NOPROMISC 1 +#define MACVLAN_FLAG_NODST 2 /* skip dst macvlan if matching src macvlan */ /* VRF section */ enum { -- cgit v1.2.3 From aaa31047a6d25da0fa101da1ed544e1247949b40 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 27 Apr 2021 18:05:55 +0200 Subject: netfilter: nftables: add catch-all set element support This patch extends the set infrastructure to add a special catch-all set element. If the lookup fails to find an element (or range) in the set, then the catch-all element is selected. Users can specify a mapping, expression(s) and timeout to be attached to the catch-all element. This patch adds a catchall list to the set; this list might contain more than one catch-all element (e.g. in case the catch-all element is removed and a new one is added in the same transaction). However, most of the time, there will be either one element or no elements at all in this list. The catch-all element is identified via the NFT_SET_ELEM_CATCHALL flag and such a special element has no NFTA_SET_ELEM_KEY attribute. There is a new nft_set_elem_catchall object that stores a reference to the dummy catch-all element (catchall->elem) whose layout is the same as that of the set element type to reuse the existing set element codebase. The set size does not apply to the catch-all element; users can define a catch-all element even if the set is full. The check for valid set element flags has been updated to report EOPNOTSUPP in case userspace requests flags that are not supported, which happens when new userspace nftables is used with an old kernel. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 + include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nf_tables_api.c | 480 +++++++++++++++++++++++++++---- net/netfilter/nft_lookup.c | 12 +- net/netfilter/nft_objref.c | 11 +- net/netfilter/nft_set_hash.c | 6 + net/netfilter/nft_set_pipapo.c | 6 +- net/netfilter/nft_set_rbtree.c | 6 + 8 files changed, 465 insertions(+), 63 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index eb708b77c4a5..27eeb613bb4e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -497,6 +497,7 @@ struct nft_set { u8 dlen; u8 num_exprs; struct nft_expr *exprs[NFT_SET_EXPR_MAX]; + struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; @@ -522,6 +523,10 @@ struct nft_set *nft_set_lookup_global(const struct net *net, const struct nlattr *nla_set_id, u8 genmask); +struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, + const struct nft_set *set); +void *nft_set_catchall_gc(const struct nft_set *set); + static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { return set->gc_int ?
msecs_to_jiffies(set->gc_int) : HZ; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 467365ed59a7..1fb4ca18ffbb 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -398,9 +398,11 @@ enum nft_set_attributes { * enum nft_set_elem_flags - nf_tables set element flags * * @NFT_SET_ELEM_INTERVAL_END: element ends the previous interval + * @NFT_SET_ELEM_CATCHALL: special catch-all element */ enum nft_set_elem_flags { NFT_SET_ELEM_INTERVAL_END = 0x1, + NFT_SET_ELEM_CATCHALL = 0x2, }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index faf0424375e8..0b7fe0a902ff 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4389,6 +4389,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, } INIT_LIST_HEAD(&set->bindings); + INIT_LIST_HEAD(&set->catchall_list); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -4434,6 +4435,24 @@ err_set_name: return err; } +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + void *elem; +}; + +static void nft_set_catchall_destroy(const struct nft_ctx *ctx, + struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_del_rcu(&catchall->list); + nft_set_elem_destroy(set, catchall->elem, true); + kfree_rcu(catchall); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { int i; @@ -4445,6 +4464,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) nft_expr_destroy(ctx, set->exprs[i]); set->ops->destroy(set); + nft_set_catchall_destroy(ctx, set); kfree(set->name); kvfree(set); } @@ -4521,6 +4541,29 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, return nft_setelem_data_validate(ctx, set, elem); } +static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + ret = nft_setelem_data_validate(ctx, set, &elem); + if (ret < 0) + break; + } + + return ret; +} + int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { @@ -4550,6 +4593,9 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, iter.fn = nf_tables_bind_check_setelem; set->ops->walk(ctx, set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_bind_check(ctx, set); + if (iter.err < 0) return iter.err; } @@ -4736,7 +4782,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext), + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && + nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext), NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; @@ -4825,6 +4872,29 @@ struct nft_set_dump_ctx { struct nft_ctx ctx; }; +static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, + const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_cur(net); + struct nft_set_elem elem; + struct nft_set_ext *ext; + int ret = 0; + + 
list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask) || + nft_set_elem_expired(ext)) + continue; + + elem.priv = catchall->elem; + ret = nf_tables_fill_setelem(skb, set, &elem); + break; + } + + return ret; +} + static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; @@ -4889,6 +4959,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&dump_ctx->ctx, set, &args.iter); + + if (!args.iter.err && args.iter.count == cb->args[0]) + args.iter.err = nft_set_catchall_dump(net, skb, set); rcu_read_unlock(); nla_nest_end(skb, nest); @@ -4968,8 +5041,8 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; *flags = ntohl(nla_get_be32(attr)); - if (*flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; + if (*flags & ~(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) + return -EOPNOTSUPP; if (!(set->flags & NFT_SET_INTERVAL) && *flags & NFT_SET_ELEM_INTERVAL_END) return -EINVAL; @@ -5014,6 +5087,46 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, return 0; } +static void *nft_setelem_catchall_get(const struct net *net, + const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_cur(net); + struct nft_set_ext *ext; + void *priv = NULL; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask) || + nft_set_elem_expired(ext)) + continue; + + priv = catchall->elem; + break; + } + + return priv; +} + +static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_elem *elem, u32 flags) +{ + void *priv; + + if (!(flags & NFT_SET_ELEM_CATCHALL)) { + priv = set->ops->get(ctx->net, set, elem, flags); + if (IS_ERR(priv)) + return PTR_ERR(priv); + } else { + priv = nft_setelem_catchall_get(ctx->net, set); + if (!priv) + return -ENOENT; + } + elem->priv = priv; + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -5021,7 +5134,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; - void *priv; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, @@ -5029,17 +5141,19 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (!nla[NFTA_SET_ELEM_KEY]) - return -EINVAL; - err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; - err = nft_setelem_parse_key(ctx, set, &elem.key.val, - nla[NFTA_SET_ELEM_KEY]); - if (err < 0) - return err; + if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + return -EINVAL; + + if (nla[NFTA_SET_ELEM_KEY]) { + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + return err; + } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, @@ -5048,11 +5162,9 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - priv = set->ops->get(ctx->net, set, &elem, flags); - if (IS_ERR(priv)) - return PTR_ERR(priv); - - elem.priv = priv; + err = nft_setelem_get(ctx, set, &elem, flags); + if (err < 0) + return err; err = -ENOMEM; skb = 
nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); @@ -5212,7 +5324,8 @@ void *nft_set_elem_init(const struct nft_set *set, ext = nft_set_elem_ext(set, elem); nft_set_ext_init(ext, tmpl); - memcpy(nft_set_ext_key(ext), key, set->klen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY)) + memcpy(nft_set_ext_key(ext), key, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) memcpy(nft_set_ext_key_end(ext), key_end, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) @@ -5343,6 +5456,192 @@ err_elem_expr_setup: return -ENOMEM; } +struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, + const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_cur(net); + struct nft_set_ext *ext; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (nft_set_elem_active(ext, genmask) && + !nft_set_elem_expired(ext)) + return ext; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); + +void *nft_set_catchall_gc(const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall, *next; + struct nft_set_ext *ext; + void *elem = NULL; + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext) || + nft_set_elem_mark_busy(ext)) + continue; + + elem = catchall->elem; + list_del_rcu(&catchall->list); + kfree_rcu(catchall, rcu); + break; + } + + return elem; +} +EXPORT_SYMBOL_GPL(nft_set_catchall_gc); + +static int nft_setelem_catchall_insert(const struct net *net, + struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **pext) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_next(net); + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (nft_set_elem_active(ext, genmask)) { + *pext = ext; + return -EEXIST; + } + } + + catchall = kmalloc(sizeof(*catchall), GFP_KERNEL); + if (!catchall) + return -ENOMEM; + + catchall->elem = elem->priv; + list_add_tail_rcu(&catchall->list, &set->catchall_list); + + return 0; +} + +static int nft_setelem_insert(const struct net *net, + struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **ext, unsigned int flags) +{ + int ret; + + if (flags & NFT_SET_ELEM_CATCHALL) + ret = nft_setelem_catchall_insert(net, set, elem, ext); + else + ret = set->ops->insert(net, set, elem, ext); + + return ret; +} + +static bool nft_setelem_is_catchall(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL) + return true; + + return false; +} + +static void nft_setelem_activate(struct net *net, struct nft_set *set, + struct nft_set_elem *elem) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_setelem_is_catchall(set, elem)) { + nft_set_elem_change_active(net, set, ext); + nft_set_elem_clear_busy(ext); + } else { + set->ops->activate(net, set, elem); + } +} + +static int nft_setelem_catchall_deactivate(const struct net *net, + struct nft_set *set, + struct nft_set_elem *elem) +{ + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_is_active(net, ext) || + nft_set_elem_mark_busy(ext)) 
+ continue; + + kfree(elem->priv); + elem->priv = catchall->elem; + nft_set_elem_change_active(net, set, ext); + return 0; + } + + return -ENOENT; +} + +static int __nft_setelem_deactivate(const struct net *net, + struct nft_set *set, + struct nft_set_elem *elem) +{ + void *priv; + + priv = set->ops->deactivate(net, set, elem); + if (!priv) + return -ENOENT; + + kfree(elem->priv); + elem->priv = priv; + set->ndeact++; + + return 0; +} + +static int nft_setelem_deactivate(const struct net *net, + struct nft_set *set, + struct nft_set_elem *elem, u32 flags) +{ + int ret; + + if (flags & NFT_SET_ELEM_CATCHALL) + ret = nft_setelem_catchall_deactivate(net, set, elem); + else + ret = __nft_setelem_deactivate(net, set, elem); + + return ret; +} + +static void nft_setelem_catchall_remove(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_set_elem_catchall *catchall, *next; + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + if (catchall->elem == elem->priv) { + list_del_rcu(&catchall->list); + kfree_rcu(catchall); + break; + } + } +} + +static void nft_setelem_remove(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + if (nft_setelem_is_catchall(set, elem)) + nft_setelem_catchall_remove(net, set, elem); + else + set->ops->remove(net, set, elem); +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { @@ -5369,14 +5668,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (nla[NFTA_SET_ELEM_KEY] == NULL) - return -EINVAL; - nft_set_ext_prepare(&tmpl); err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; + + if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + return -EINVAL; + if (flags != 0) nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); @@ -5481,12 +5781,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, num_exprs = set->num_exprs; } - err = nft_setelem_parse_key(ctx, set, &elem.key.val, - nla[NFTA_SET_ELEM_KEY]); - if (err < 0) - goto err_set_elem_expr; + if (nla[NFTA_SET_ELEM_KEY]) { + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + goto err_set_elem_expr; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, @@ -5603,7 +5905,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; - err = set->ops->insert(ctx->net, set, &elem, &ext2); + + err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags); if (err) { if (err == -EEXIST) { if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ @@ -5630,7 +5933,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_element_clash; } - if (set->size && + if (!(flags & NFT_SET_ELEM_CATCHALL) && set->size && !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { err = -ENFILE; goto err_set_full; @@ -5641,7 +5944,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return 0; err_set_full: - set->ops->remove(ctx->net, set, &elem); + nft_setelem_remove(ctx->net, set, &elem); err_element_clash: kfree(trans); err_elem_expr: @@ -5773,7 +6076,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set 
*set, struct nft_set_ext *ext; struct nft_trans *trans; u32 flags = 0; - void *priv; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, @@ -5781,23 +6083,26 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (nla[NFTA_SET_ELEM_KEY] == NULL) + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + + if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) return -EINVAL; nft_set_ext_prepare(&tmpl); - err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); - if (err < 0) - return err; if (flags != 0) nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - err = nft_setelem_parse_key(ctx, set, &elem.key.val, - nla[NFTA_SET_ELEM_KEY]); - if (err < 0) - return err; + if (nla[NFTA_SET_ELEM_KEY]) { + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + return err; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, @@ -5823,13 +6128,9 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (trans == NULL) goto fail_trans; - priv = set->ops->deactivate(ctx->net, set, &elem); - if (priv == NULL) { - err = -ENOENT; + err = nft_setelem_deactivate(ctx->net, set, &elem, flags); + if (err < 0) goto fail_ops; - } - kfree(elem.priv); - elem.priv = priv; nft_setelem_data_deactivate(ctx->net, set, &elem); @@ -5876,6 +6177,49 @@ err1: return err; } +static int __nft_set_catchall_flush(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_elem *elem) +{ + struct nft_trans *trans; + + trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, + sizeof(struct nft_trans_elem), GFP_KERNEL); + if (!trans) + return -ENOMEM; + + nft_setelem_data_deactivate(ctx->net, set, elem); + nft_trans_elem_set(trans) = set; + nft_trans_elem(trans) = *elem; + nft_trans_commit_list_add_tail(ctx->net, trans); + + return 0; +} + +static int nft_set_catchall_flush(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask) || + nft_set_elem_mark_busy(ext)) + continue; + + elem.priv = catchall->elem; + ret = __nft_set_catchall_flush(ctx, set, &elem); + if (ret < 0) + break; + } + + return ret; +} + static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { @@ -5884,6 +6228,8 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) }; set->ops->walk(ctx, set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_flush(ctx, set); return iter.err; } @@ -5918,8 +6264,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb, err = nft_del_setelem(&ctx, set, attr); if (err < 0) break; - - set->ndeact++; } return err; } @@ -8270,7 +8614,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->activate(net, te->set, &te->elem); + nft_setelem_activate(net, te->set, &te->elem); nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_NEWSETELEM, 0); @@ -8282,9 +8626,11 @@ static int nf_tables_commit(struct 
net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_DELSETELEM, 0); - te->set->ops->remove(net, te->set, &te->elem); - atomic_dec(&te->set->nelems); - te->set->ndeact--; + nft_setelem_remove(net, te->set, &te->elem); + if (!nft_setelem_is_catchall(te->set, &te->elem)) { + atomic_dec(&te->set->nelems); + te->set->ndeact--; + } break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { @@ -8485,15 +8831,17 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; } te = (struct nft_trans_elem *)trans->data; - te->set->ops->remove(net, te->set, &te->elem); - atomic_dec(&te->set->nelems); + nft_setelem_remove(net, te->set, &te->elem); + if (!nft_setelem_is_catchall(te->set, &te->elem)) + atomic_dec(&te->set->nelems); break; case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; nft_setelem_data_activate(net, te->set, &te->elem); - te->set->ops->activate(net, te->set, &te->elem); - te->set->ndeact--; + nft_setelem_activate(net, te->set, &te->elem); + if (!nft_setelem_is_catchall(te->set, &te->elem)) + te->set->ndeact--; nft_trans_destroy(trans); break; @@ -8672,6 +9020,27 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, return nft_check_loops(ctx, ext); } +static int nft_set_catchall_loops(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + ret = nft_check_loops(ctx, ext); + if (ret < 0) + return ret; + } + + return ret; +} + static int nf_tables_check_loops(const struct nft_ctx *ctx, const struct nft_chain *chain) { @@ -8731,6 +9100,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, iter.fn = nf_tables_loop_check_setelem; set->ops->walk(ctx, set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_loops(ctx, set); + if (iter.err < 0) return iter.err; } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index b0f558b4fea5..a479f8a1270c 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -30,13 +30,17 @@ void nft_lookup_eval(const struct nft_expr *expr, const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; const struct nft_set_ext *ext = NULL; + const struct net *net = nft_net(pkt); bool found; - found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg], - &ext) ^ priv->invert; + found = set->ops->lookup(net, set, &regs->data[priv->sreg], &ext) ^ + priv->invert; if (!found) { - regs->verdict.code = NFT_BREAK; - return; + ext = nft_set_catchall_lookup(net, set); + if (!ext) { + regs->verdict.code = NFT_BREAK; + return; + } } if (ext) { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index bc104d36d3bb..7e47edee88ee 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -105,15 +105,18 @@ static void nft_objref_map_eval(const struct nft_expr *expr, { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; + struct net *net = nft_net(pkt); const struct nft_set_ext *ext; struct nft_object *obj; bool found; - found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg], - &ext); + found = set->ops->lookup(net, set, &regs->data[priv->sreg], &ext); if (!found) { - regs->verdict.code = NFT_BREAK; - return; + ext =
nft_set_catchall_lookup(net, set); + if (!ext) { + regs->verdict.code = NFT_BREAK; + return; + } } obj = *nft_set_ext_obj(ext); obj->ops->eval(obj, regs, pkt); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index bf618b7ec1ae..58f576abcd4a 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -350,6 +350,12 @@ needs_gc_run: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); + he = nft_set_catchall_gc(set); + if (he) { + gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); + if (gcb) + nft_set_gc_batch_add(gcb, he); + } nft_set_gc_batch_complete(gcb); queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 9944523f5c2c..528a2d7ca991 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1529,11 +1529,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; struct nft_pipapo_field *f; - struct nft_pipapo_elem *e; int i, start, rules_fx; start = first_rule; @@ -1569,6 +1569,10 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } + e = nft_set_catchall_gc(set); + if (e) + nft_set_elem_destroy(set, e, true); + priv->last_gc = jiffies; } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 217ab3644c25..9e36eb4a7429 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -541,6 +541,12 @@ static void nft_rbtree_gc(struct work_struct *work) write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); + rbe = nft_set_catchall_gc(set); + if (rbe) { + gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); + if (gcb) + nft_set_gc_batch_add(gcb, rbe); + } nft_set_gc_batch_complete(gcb); queue_delayed_work(system_power_efficient_wq, &priv->gc_work, -- cgit v1.2.3 From e542d29ca81d005651680a0a697b72ca13ddc4cc Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Tue, 27 Apr 2021 10:36:35 -0500 Subject: icmp: standardize naming of RFC 8335 PROBE constants The current definitions of constants for PROBE, currently defined only in the net-next kernel branch, are inconsistent, with some beginning with ICMP and others with simply EXT. This patch attempts to standardize the naming conventions of the constants for PROBE before their release into a stable Kernel, and to update the relevant definitions in net/ipv4/icmp.c. Similarly, the definitions for the code field (previously ICMP_EXT_MAL_QUERY, etc) use the same prefixes as the type field. This patch adds _CODE_ to the prefix to clarify the distinction of these constants. 
Signed-off-by: Andreas Roeseler Acked-by: David Ahern Link: https://lore.kernel.org/r/20210427153635.2591-1-andreas.a.roeseler@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/icmp.h | 28 ++++++++++++++-------------- net/ipv4/icmp.c | 16 ++++++++-------- 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h index 222325d1d80e..c1da8244c5e1 100644 --- a/include/uapi/linux/icmp.h +++ b/include/uapi/linux/icmp.h @@ -70,22 +70,22 @@ #define ICMP_EXC_FRAGTIME 1 /* Fragment Reass time exceeded */ /* Codes for EXT_ECHO (PROBE) */ -#define ICMP_EXT_ECHO 42 -#define ICMP_EXT_ECHOREPLY 43 -#define ICMP_EXT_MAL_QUERY 1 /* Malformed Query */ -#define ICMP_EXT_NO_IF 2 /* No such Interface */ -#define ICMP_EXT_NO_TABLE_ENT 3 /* No such Table Entry */ -#define ICMP_EXT_MULT_IFS 4 /* Multiple Interfaces Satisfy Query */ +#define ICMP_EXT_ECHO 42 +#define ICMP_EXT_ECHOREPLY 43 +#define ICMP_EXT_CODE_MAL_QUERY 1 /* Malformed Query */ +#define ICMP_EXT_CODE_NO_IF 2 /* No such Interface */ +#define ICMP_EXT_CODE_NO_TABLE_ENT 3 /* No such Table Entry */ +#define ICMP_EXT_CODE_MULT_IFS 4 /* Multiple Interfaces Satisfy Query */ /* Constants for EXT_ECHO (PROBE) */ -#define EXT_ECHOREPLY_ACTIVE (1 << 2)/* active bit in reply message */ -#define EXT_ECHOREPLY_IPV4 (1 << 1)/* ipv4 bit in reply message */ -#define EXT_ECHOREPLY_IPV6 1 /* ipv6 bit in reply message */ -#define EXT_ECHO_CTYPE_NAME 1 -#define EXT_ECHO_CTYPE_INDEX 2 -#define EXT_ECHO_CTYPE_ADDR 3 -#define ICMP_AFI_IP 1 /* Address Family Identifier for ipv4 */ -#define ICMP_AFI_IP6 2 /* Address Family Identifier for ipv6 */ +#define ICMP_EXT_ECHOREPLY_ACTIVE (1 << 2)/* active bit in reply message */ +#define ICMP_EXT_ECHOREPLY_IPV4 (1 << 1)/* ipv4 bit in reply message */ +#define ICMP_EXT_ECHOREPLY_IPV6 1 /* ipv6 bit in reply message */ +#define ICMP_EXT_ECHO_CTYPE_NAME 1 +#define ICMP_EXT_ECHO_CTYPE_INDEX 2 +#define ICMP_EXT_ECHO_CTYPE_ADDR 3 +#define ICMP_AFI_IP 1 /* Address Family Identifier for ipv4 */ +#define ICMP_AFI_IP6 2 /* Address Family Identifier for ipv6 */ struct icmphdr { __u8 type; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 8bd988fbcb31..7b6931a4d775 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1033,7 +1033,7 @@ static bool icmp_echo(struct sk_buff *skb) status = 0; dev = NULL; switch (iio->extobj_hdr.class_type) { - case EXT_ECHO_CTYPE_NAME: + case ICMP_EXT_ECHO_CTYPE_NAME: iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); if (ident_len >= IFNAMSIZ) goto send_mal_query; @@ -1041,14 +1041,14 @@ static bool icmp_echo(struct sk_buff *skb) memcpy(buff, &iio->ident.name, ident_len); dev = dev_get_by_name(net, buff); break; - case EXT_ECHO_CTYPE_INDEX: + case ICMP_EXT_ECHO_CTYPE_INDEX: iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + sizeof(iio->ident.ifindex), &_iio); if (ident_len != sizeof(iio->ident.ifindex)) goto send_mal_query; dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); break; - case EXT_ECHO_CTYPE_ADDR: + case ICMP_EXT_ECHO_CTYPE_ADDR: if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + iio->ident.addr.ctype3_hdr.addrlen) goto send_mal_query; @@ -1080,23 +1080,23 @@ static bool icmp_echo(struct sk_buff *skb) goto send_mal_query; } if (!dev) { - icmp_param.data.icmph.code = ICMP_EXT_NO_IF; + icmp_param.data.icmph.code = ICMP_EXT_CODE_NO_IF; goto send_reply; } /* Fill bits in reply message */ if (dev->flags & IFF_UP) - status |= EXT_ECHOREPLY_ACTIVE; 
+ status |= ICMP_EXT_ECHOREPLY_ACTIVE; if (__in_dev_get_rcu(dev) && __in_dev_get_rcu(dev)->ifa_list) - status |= EXT_ECHOREPLY_IPV4; + status |= ICMP_EXT_ECHOREPLY_IPV4; if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list)) - status |= EXT_ECHOREPLY_IPV6; + status |= ICMP_EXT_ECHOREPLY_IPV6; dev_put(dev); icmp_param.data.icmph.un.echo.sequence |= htons(status); send_reply: icmp_reply(&icmp_param, skb); return true; send_mal_query: - icmp_param.data.icmph.code = ICMP_EXT_MAL_QUERY; + icmp_param.data.icmph.code = ICMP_EXT_CODE_MAL_QUERY; goto send_reply; } -- cgit v1.2.3
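For illustration only (a sketch, not part of the patches above): a minimal user-space fragment showing how a prober might decode an Extended Echo Reply using the renamed constants. The function name and the assumption that the caller has already located the ICMP header in the received packet are hypothetical; the status-bit layout follows what icmp_echo() above ORs into the echo sequence field.

#include <stdio.h>
#include <arpa/inet.h>		/* ntohs() */
#include <linux/icmp.h>		/* struct icmphdr, ICMP_EXT_* constants */

/* Decode an RFC 8335 Extended Echo Reply header: report the error code,
 * or print the active/ipv4/ipv6 status bits from the sequence field.
 */
static void decode_ext_echo_reply(const struct icmphdr *icmph)
{
	unsigned int status;

	if (icmph->type != ICMP_EXT_ECHOREPLY)
		return;

	switch (icmph->code) {
	case ICMP_EXT_CODE_MAL_QUERY:
		puts("malformed query");
		return;
	case ICMP_EXT_CODE_NO_IF:
		puts("no such interface");
		return;
	case ICMP_EXT_CODE_NO_TABLE_ENT:
		puts("no such table entry");
		return;
	case ICMP_EXT_CODE_MULT_IFS:
		puts("multiple interfaces satisfy query");
		return;
	}

	status = ntohs(icmph->un.echo.sequence);
	printf("active=%u ipv4=%u ipv6=%u\n",
	       !!(status & ICMP_EXT_ECHOREPLY_ACTIVE),
	       !!(status & ICMP_EXT_ECHOREPLY_IPV4),
	       !!(status & ICMP_EXT_ECHOREPLY_IPV6));
}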