From dee872e124e8d5de22b68c58f6f6c3f5e8889160 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:45 +0530 Subject: bpf: Populate kfunc BTF ID sets in struct btf This patch prepares the kernel to support putting all kinds of kfunc BTF ID sets in the struct btf itself. The various kernel subsystems will make the register_btf_kfunc_id_set call in their initcalls (for built-in code and modules). The 'hook' is one of the many program types, e.g. XDP, TC/SCHED_CLS, STRUCT_OPS, and the 'types' are check (allowed or not), acquire, release, and ret_null (with PTR_TO_BTF_ID_OR_NULL return type). A maximum of BTF_KFUNC_SET_MAX_CNT (32) kfunc BTF IDs are permitted in a set of a certain hook and type for vmlinux sets, since they are allocated on demand, and otherwise set as NULL. Module sets can only be registered once per hook and type, hence they are directly assigned. A new btf_kfunc_id_set_contains function is exposed for use in the verifier; this new method is faster than the existing list searching method and is self-contained. It also lets other code not care whether the set is unallocated or not. Note that module code can only do a single register_btf_kfunc_id_set call per hook. This is why sorting is only done for in-kernel vmlinux sets: there might be multiple sets for the same hook and type that must be concatenated, hence sorting them is required to ensure that bsearch in btf_id_set_contains continues to work correctly. The next commit will update the kernel users to make use of this infrastructure. Finally, add a __maybe_unused annotation for BTF ID macros for the !CONFIG_DEBUG_INFO_BTF case, so that they don't produce warnings at build time. The previous patch is also needed to provide synchronization against initialization for the module BTF's kfunc_set_tab introduced here, as described below: The kfunc_set_tab pointer in struct btf is write-once (if we consider the registration phase (comprised of multiple register_btf_kfunc_id_set calls) as a single operation). In this sense, once it has been fully prepared, it isn't modified, only used for lookup (from the verifier context). For btf_vmlinux, it is initialized fully during the do_initcalls phase, which happens fairly early in the boot process, before any processes are present. This also eliminates the possibility of bpf_check being called at that point, thus relieving us of ensuring any synchronization between the registration and lookup function (btf_kfunc_id_set_contains). However, the case for module BTF is a bit tricky. The BTF is parsed, prepared, and published from the MODULE_STATE_COMING notifier callback. After this, the module initcalls are invoked, where our registration function will be called to populate the kfunc_set_tab for the module BTF. At this point, the BTF may be available to userspace while its corresponding module is still initializing. A BTF fd can then be passed to the verifier using the bpf syscall (e.g. for a kfunc call insn). Hence, there is a race window where the verifier may concurrently try to look up the kfunc_set_tab. To prevent this race, we must ensure the operations are serialized, or wait for the __init functions to complete. In the earlier registration API, this race was alleviated as the verifier's bpf_check_mod_kfunc_call didn't find the kfunc BTF ID until it was added by the registration function (usually called at the end of the module __init function, after all module resources have been initialized).
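To make the new flow concrete, here is a minimal sketch of the registration pattern this patch enables (the set, kfunc, and initcall names are placeholders; the next commit converts the in-tree users to exactly this shape):

  /* Needs <linux/btf.h>, <linux/btf_ids.h>, <linux/init.h>, <linux/module.h> */
  BTF_SET_START(my_check_kfunc_ids)
  BTF_ID(func, my_kfunc)		/* hypothetical kfunc */
  BTF_SET_END(my_check_kfunc_ids)

  static const struct btf_kfunc_id_set my_kfunc_set = {
  	.owner     = THIS_MODULE,
  	.check_set = &my_check_kfunc_ids,
  };

  static int __init my_kfunc_init(void)
  {
  	/* Attach the set to the vmlinux (or module) BTF for this hook */
  	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &my_kfunc_set);
  }
  late_initcall(my_kfunc_init);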
If the verifier performed the check_kfunc_call before the kfunc BTF ID was added to the list, it would fail verification (saying the call isn't allowed). Access to the list was protected using a mutex. Now, it would still fail verification, but for a different reason (returning ENXIO due to the failed btf_try_get_module call in add_kfunc_call), because if the __init call is in progress the module will be in the middle of the MODULE_STATE_COMING -> MODULE_STATE_LIVE transition, and the BTF_MODULE_LIVE flag for the btf_module instance will not be set, so the btf_try_get_module call will fail. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/btf_ids.h | 13 +++++++------ 2 files changed, 46 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 0c74348cbc9d..c451f8e2612a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -12,11 +12,33 @@ #define BTF_TYPE_EMIT(type) ((void)(type *)0) #define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val) +enum btf_kfunc_type { + BTF_KFUNC_TYPE_CHECK, + BTF_KFUNC_TYPE_ACQUIRE, + BTF_KFUNC_TYPE_RELEASE, + BTF_KFUNC_TYPE_RET_NULL, + BTF_KFUNC_TYPE_MAX, +}; + struct btf; struct btf_member; struct btf_type; union bpf_attr; struct btf_show; +struct btf_id_set; + +struct btf_kfunc_id_set { + struct module *owner; + union { + struct { + struct btf_id_set *check_set; + struct btf_id_set *acquire_set; + struct btf_id_set *release_set; + struct btf_id_set *ret_null_set; + }; + struct btf_id_set *sets[BTF_KFUNC_TYPE_MAX]; + }; +}; extern const struct file_operations btf_fops; @@ -307,6 +329,11 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); +bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, u32 kfunc_btf_id); +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *s); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -318,6 +345,18 @@ static inline const char *btf_name_by_offset(const struct btf *btf, { return NULL; } +static inline bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, + u32 kfunc_btf_id) +{ + return false; +} +static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *s) +{ + return 0; +} #endif struct kfunc_btf_id_set { diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 919c0fde1c51..bc5d9cc34e4c 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -11,6 +11,7 @@ struct btf_id_set { #ifdef CONFIG_DEBUG_INFO_BTF #include <linux/compiler.h> /* for __PASTE */ +#include <linux/compiler_attributes.h> /* for __maybe_unused */ /* * Following macros help to define lists of BTF IDs placed @@ -146,14 +147,14 @@ extern struct btf_id_set name; #else -#define BTF_ID_LIST(name) static u32 name[5]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED -#define BTF_ID_LIST_GLOBAL(name, n) u32 name[n]; -#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; -#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1];
-#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; -#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; +#define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1]; +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 __maybe_unused name[1]; +#define BTF_SET_START(name) static struct btf_id_set __maybe_unused name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 }; #define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ -- cgit v1.2.3 From b202d84422223b7222cba5031d182f20b37e146e Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:46 +0530 Subject: bpf: Remove check_kfunc_call callback and old kfunc BTF ID API Completely remove the old code for check_kfunc_call to help it work with modules, and also the callback itself. The previous commit adds infrastructure to register all sets and put them in vmlinux or module BTF, and concatenates all related sets organized by the hook and the type. Once populated, these sets remain immutable for the lifetime of the struct btf. Also, since we don't need the 'owner' module anywhere when doing check_kfunc_call, drop the 'btf_modp' module parameter from find_kfunc_desc_btf. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ---- include/linux/btf.h | 44 --------------------- kernel/bpf/btf.c | 46 ---------------------- kernel/bpf/verifier.c | 20 ++++------ net/bpf/test_run.c | 23 ++++++----- net/core/filter.c | 1 - net/ipv4/bpf_tcp_ca.c | 22 ++++++----- net/ipv4/tcp_bbr.c | 18 +++++---- net/ipv4/tcp_cubic.c | 17 ++++---- net/ipv4/tcp_dctcp.c | 18 +++++---- .../selftests/bpf/bpf_testmod/bpf_testmod.c | 17 ++++---- 11 files changed, 73 insertions(+), 161 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e947cd91152..6d7346c54d83 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -573,7 +573,6 @@ struct bpf_verifier_ops { const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); - bool (*check_kfunc_call)(u32 kfunc_btf_id, struct module *owner); }; struct bpf_prog_offload_ops { @@ -1719,7 +1718,6 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); -bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1971,12 +1969,6 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, return -ENOTSUPP; } -static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, - struct module *owner) -{ - return false; -} - static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/include/linux/btf.h b/include/linux/btf.h index c451f8e2612a..b12cfe3b12bb 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -359,48 +359,4 @@ static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, } #endif -struct kfunc_btf_id_set { - struct list_head list; - struct btf_id_set *set; - struct module *owner; -}; - -struct kfunc_btf_id_list { - struct list_head list; - struct mutex mutex; -}; - -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES -void 
register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s); -void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s); -bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, - struct module *owner); - -extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; -extern struct kfunc_btf_id_list prog_test_kfunc_list; -#else -static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) -{ -} -static inline void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) -{ -} -static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, - u32 kfunc_id, struct module *owner) -{ - return false; -} - -static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused; -static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused; -#endif - -#define DEFINE_KFUNC_BTF_ID_SET(set, name) \ - struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ - THIS_MODULE } - #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 74037bd65d17..4be5cf629ca9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6682,52 +6682,6 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES - -void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) -{ - mutex_lock(&l->mutex); - list_add(&s->list, &l->list); - mutex_unlock(&l->mutex); -} -EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set); - -void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) -{ - mutex_lock(&l->mutex); - list_del_init(&s->list); - mutex_unlock(&l->mutex); -} -EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set); - -bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, - struct module *owner) -{ - struct kfunc_btf_id_set *s; - - mutex_lock(&klist->mutex); - list_for_each_entry(s, &klist->list, list) { - if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) { - mutex_unlock(&klist->mutex); - return true; - } - } - mutex_unlock(&klist->mutex); - return false; -} - -#define DEFINE_KFUNC_BTF_ID_LIST(name) \ - struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \ - __MUTEX_INITIALIZER(name.mutex) }; \ - EXPORT_SYMBOL_GPL(name) - -DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); -DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); - -#endif - int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bfb45381fb3f..72802c1eb5ac 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1741,7 +1741,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) } static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, - s16 offset, struct module **btf_modp) + s16 offset) { struct bpf_kfunc_btf kf_btf = { .offset = offset }; struct bpf_kfunc_btf_tab *tab; @@ -1795,8 +1795,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); } - if (btf_modp) - *btf_modp = b->module; return b->btf; } @@ -1813,8 +1811,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) } static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, - u32 func_id, s16 offset, - struct module **btf_modp) + u32 func_id, s16 offset) { if 
(offset) { if (offset < 0) { @@ -1825,7 +1822,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, return ERR_PTR(-EINVAL); } - return __find_kfunc_desc_btf(env, offset, btf_modp); + return __find_kfunc_desc_btf(env, offset); } return btf_vmlinux ?: ERR_PTR(-ENOENT); } @@ -1888,7 +1885,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL); + desc_btf = find_kfunc_desc_btf(env, func_id, offset); if (IS_ERR(desc_btf)) { verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(desc_btf); @@ -2349,7 +2346,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) return NULL; - desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL); + desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off); if (IS_ERR(desc_btf)) return ""; @@ -6820,7 +6817,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; - struct module *btf_mod = NULL; const struct btf_param *args; struct btf *desc_btf; int err; @@ -6829,7 +6825,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) if (!insn->imm) return 0; - desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod); + desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off); if (IS_ERR(desc_btf)) return PTR_ERR(desc_btf); @@ -6838,8 +6834,8 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) func_name = btf_name_by_offset(desc_btf, func->name_off); func_proto = btf_type_by_id(desc_btf, func->type); - if (!env->ops->check_kfunc_call || - !env->ops->check_kfunc_call(func_id, btf_mod)) { + if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_CHECK, func_id)) { verbose(env, "calling kernel function %s is not allowed\n", func_name); return -EACCES; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 46dd95755967..7796a8c747a0 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -236,18 +237,11 @@ __diag_pop(); ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); -BTF_SET_START(test_sk_kfunc_ids) +BTF_SET_START(test_sk_check_kfunc_ids) BTF_ID(func, bpf_kfunc_call_test1) BTF_ID(func, bpf_kfunc_call_test2) BTF_ID(func, bpf_kfunc_call_test3) -BTF_SET_END(test_sk_kfunc_ids) - -bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner) -{ - if (btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id)) - return true; - return bpf_check_mod_kfunc_call(&prog_test_kfunc_list, kfunc_id, owner); -} +BTF_SET_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) @@ -1067,3 +1061,14 @@ out: kfree(ctx); return err; } + +static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &test_sk_check_kfunc_ids, +}; + +static int __init bpf_prog_test_run_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); +} +late_initcall(bpf_prog_test_run_init); diff --git a/net/core/filter.c b/net/core/filter.c index 4603b7cd3cd1..f73a84c75970 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10062,7 +10062,6 @@ const struct bpf_verifier_ops 
tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, - .check_kfunc_call = bpf_prog_test_check_kfunc_call, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index de610cb83694..b60c9fd7147e 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ +#include #include #include #include @@ -212,26 +213,23 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, } } -BTF_SET_START(bpf_tcp_ca_kfunc_ids) +BTF_SET_START(bpf_tcp_ca_check_kfunc_ids) BTF_ID(func, tcp_reno_ssthresh) BTF_ID(func, tcp_reno_cong_avoid) BTF_ID(func, tcp_reno_undo_cwnd) BTF_ID(func, tcp_slow_start) BTF_ID(func, tcp_cong_avoid_ai) -BTF_SET_END(bpf_tcp_ca_kfunc_ids) +BTF_SET_END(bpf_tcp_ca_check_kfunc_ids) -static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id, struct module *owner) -{ - if (btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id)) - return true; - return bpf_check_mod_kfunc_call(&bpf_tcp_ca_kfunc_list, kfunc_btf_id, owner); -} +static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &bpf_tcp_ca_check_kfunc_ids, +}; static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { .get_func_proto = bpf_tcp_ca_get_func_proto, .is_valid_access = bpf_tcp_ca_is_valid_access, .btf_struct_access = bpf_tcp_ca_btf_struct_access, - .check_kfunc_call = bpf_tcp_ca_check_kfunc_call, }; static int bpf_tcp_ca_init_member(const struct btf_type *t, @@ -300,3 +298,9 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = { .init = bpf_tcp_ca_init, .name = "tcp_congestion_ops", }; + +static int __init bpf_tcp_ca_kfunc_init(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set); +} +late_initcall(bpf_tcp_ca_kfunc_init); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index ec5550089b4d..02e8626ccb27 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1154,7 +1154,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .set_state = bbr_set_state, }; -BTF_SET_START(tcp_bbr_kfunc_ids) +BTF_SET_START(tcp_bbr_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID(func, bbr_init) @@ -1167,25 +1167,27 @@ BTF_ID(func, bbr_min_tso_segs) BTF_ID(func, bbr_set_state) #endif #endif -BTF_SET_END(tcp_bbr_kfunc_ids) +BTF_SET_END(tcp_bbr_check_kfunc_ids) -static DEFINE_KFUNC_BTF_ID_SET(&tcp_bbr_kfunc_ids, tcp_bbr_kfunc_btf_set); +static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &tcp_bbr_check_kfunc_ids, +}; static int __init bbr_register(void) { int ret; BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); - ret = tcp_register_congestion_control(&tcp_bbr_cong_ops); - if (ret) + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set); + if (ret < 0) return ret; - register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set); - return 0; + return tcp_register_congestion_control(&tcp_bbr_cong_ops); } static void __exit bbr_unregister(void) { - unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set); tcp_unregister_congestion_control(&tcp_bbr_cong_ops); } diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index e07837e23b3f..24d562dd6225 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -485,7 +485,7 @@ static struct tcp_congestion_ops cubictcp __read_mostly 
= { .name = "cubic", }; -BTF_SET_START(tcp_cubic_kfunc_ids) +BTF_SET_START(tcp_cubic_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID(func, cubictcp_init) @@ -496,9 +496,12 @@ BTF_ID(func, cubictcp_cwnd_event) BTF_ID(func, cubictcp_acked) #endif #endif -BTF_SET_END(tcp_cubic_kfunc_ids) +BTF_SET_END(tcp_cubic_check_kfunc_ids) -static DEFINE_KFUNC_BTF_ID_SET(&tcp_cubic_kfunc_ids, tcp_cubic_kfunc_btf_set); +static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &tcp_cubic_check_kfunc_ids, +}; static int __init cubictcp_register(void) { @@ -534,16 +537,14 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); - ret = tcp_register_congestion_control(&cubictcp); - if (ret) + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set); + if (ret < 0) return ret; - register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set); - return 0; + return tcp_register_congestion_control(&cubictcp); } static void __exit cubictcp_unregister(void) { - unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set); tcp_unregister_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 0d7ab3cc7b61..1943a6630341 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -238,7 +238,7 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { .name = "dctcp-reno", }; -BTF_SET_START(tcp_dctcp_kfunc_ids) +BTF_SET_START(tcp_dctcp_check_kfunc_ids) #ifdef CONFIG_X86 #ifdef CONFIG_DYNAMIC_FTRACE BTF_ID(func, dctcp_init) @@ -249,25 +249,27 @@ BTF_ID(func, dctcp_cwnd_undo) BTF_ID(func, dctcp_state) #endif #endif -BTF_SET_END(tcp_dctcp_kfunc_ids) +BTF_SET_END(tcp_dctcp_check_kfunc_ids) -static DEFINE_KFUNC_BTF_ID_SET(&tcp_dctcp_kfunc_ids, tcp_dctcp_kfunc_btf_set); +static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &tcp_dctcp_check_kfunc_ids, +}; static int __init dctcp_register(void) { int ret; BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); - ret = tcp_register_congestion_control(&dctcp); - if (ret) + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set); + if (ret < 0) return ret; - register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set); - return 0; + return tcp_register_congestion_control(&dctcp); } static void __exit dctcp_unregister(void) { - unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set); tcp_unregister_congestion_control(&dctcp); } diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c index df3b292a8ffe..c0805d0d753f 100644 --- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c +++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c @@ -109,26 +109,27 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = { .write = bpf_testmod_test_write, }; -BTF_SET_START(bpf_testmod_kfunc_ids) +BTF_SET_START(bpf_testmod_check_kfunc_ids) BTF_ID(func, bpf_testmod_test_mod_kfunc) -BTF_SET_END(bpf_testmod_kfunc_ids) +BTF_SET_END(bpf_testmod_check_kfunc_ids) -static DEFINE_KFUNC_BTF_ID_SET(&bpf_testmod_kfunc_ids, bpf_testmod_kfunc_btf_set); +static const struct btf_kfunc_id_set bpf_testmod_kfunc_set = { + .owner = THIS_MODULE, + .check_set = &bpf_testmod_check_kfunc_ids, +}; static int bpf_testmod_init(void) { int ret; - ret = 
sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); - if (ret) + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_testmod_kfunc_set); + if (ret < 0) return ret; - register_kfunc_btf_id_set(&prog_test_kfunc_list, &bpf_testmod_kfunc_btf_set); - return 0; + return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } static void bpf_testmod_exit(void) { - unregister_kfunc_btf_id_set(&prog_test_kfunc_list, &bpf_testmod_kfunc_btf_set); return sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file); } -- cgit v1.2.3 From d583691c47dc0424ebe926000339a6d6cd590ff7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:47 +0530 Subject: bpf: Introduce mem, size argument pair support for kfunc BPF helpers can associate two adjacent arguments together to pass memory of a certain size, using ARG_PTR_TO_MEM and ARG_CONST_SIZE arguments. Since we don't use bpf_func_proto for kfunc, we need to leverage BTF to implement similar support. The ARG_CONST_SIZE processing for helpers is refactored into a common check_mem_size_reg helper that is shared with kfunc as well. kfunc ptr_to_mem support follows logic similar to global functions, where verification is done as if the pointer is not null, even when it may be null. This leads to a simple-to-follow rule for writing kfunc: always check the argument pointer for NULL, except when it is PTR_TO_CTX. The PTR_TO_CTX case is only safe when the helper expecting a pointer to the program ctx is not exposed to other programs where the same struct is not the ctx type. In that case, the type check will fall through to other cases and would permit passing other types of pointers, possibly NULL at runtime. Currently, we require the size argument to be suffixed with "__sz" in the parameter name. This information is then recorded in kernel BTF and verified during function argument checking. In the future we can use BTF tagging instead, and modify the kernel function definitions. This will be a purely kernel-side change. This allows us to have some form of backwards compatibility for structures that are passed in to the kernel function with their size, and allows variable-length structures to be passed in if they are accompanied by a size parameter.
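As an illustrative sketch of the "__sz" convention (the kfunc name is hypothetical), a kernel function taking such a mem, size pair would look like:

  /* The "__sz" suffix ties mem__sz to mem, so the verifier checks
   * accesses to [mem, mem + mem__sz) much like it does for the
   * ARG_PTR_TO_MEM/ARG_CONST_SIZE helper pair. Per the rule above,
   * the kfunc must still check the pointer for NULL itself.
   */
  noinline void bpf_kfunc_test_mem_zero(void *mem, int mem__sz)
  {
  	if (!mem)
  		return;
  	memset(mem, 0, mem__sz);
  }

The BTF ID of such a kfunc is then added to the check set of the relevant program type like any other kfunc.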
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 + kernel/bpf/btf.c | 48 +++++++++++++++-- kernel/bpf/verifier.c | 124 +++++++++++++++++++++++++++---------------- 3 files changed, 126 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 143401d4c9d9..857fd687bdc2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -521,6 +521,8 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4be5cf629ca9..cf46694cb266 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5654,6 +5654,32 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, return true; } +static bool is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + int len, sfx_len = sizeof("__sz") - 1; + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len < sfx_len) + return false; + param_name += len - sfx_len; + if (strncmp(param_name, "__sz", sfx_len)) + return false; + + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, @@ -5765,17 +5791,33 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, u32 type_size; if (is_kfunc) { + bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); + /* Permit pointer to mem, but only when argument * type is pointer to scalar, or struct composed * (recursively) of scalars. + * When arg_mem_size is true, the pointer can be + * void *. */ if (!btf_type_is_scalar(ref_t) && - !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) { + !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && + (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { bpf_log(log, - "arg#%d pointer type %s %s must point to scalar or struct with scalar\n", - i, btf_type_str(ref_t), ref_tname); + "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", + i, btf_type_str(ref_t), ref_tname, arg_mem_size ? 
"void, " : ""); return -EINVAL; } + + /* Check for mem, len pair */ + if (arg_mem_size) { + if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { + bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", + i, i + 1); + return -EINVAL; + } + i++; + continue; + } } resolve_ret = btf_resolve_size(btf, ref_t, &type_size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 72802c1eb5ac..2b186185b6b2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4864,6 +4864,62 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static int check_mem_size_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + int err; + + /* This is used to refine r0 return value bounds for helpers + * that enforce this value as an upper bound on return values. + * See do_refine_retval_range() for helpers that can refine + * the return value. C type of helper is u32 so we pull register + * bound from umax_value however, if negative verifier errors + * out. Only upper bounds can be learned because retval is an + * int type and negative retvals are allowed. + */ + if (meta) + meta->msize_max_value = reg->umax_value; + + /* The register is SCALAR_VALUE; the access check + * happens using its boundaries. + */ + if (!tnum_is_const(reg->var_off)) + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. + */ + meta = NULL; + + if (reg->smin_value < 0) { + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->umin_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->umax_value >= BPF_MAX_VAR_SIZ) { + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->umax_value, + zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); + return err; +} + int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { @@ -4887,6 +4943,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return check_helper_mem_access(env, regno, mem_size, true, NULL); } +int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno) +{ + struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; + bool may_be_null = type_may_be_null(mem_reg->type); + struct bpf_reg_state saved_reg; + int err; + + WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + + if (may_be_null) { + saved_reg = *mem_reg; + mark_ptr_not_null_reg(mem_reg); + } + + err = check_mem_size_reg(env, reg, regno, true, NULL); + + if (may_be_null) + *mem_reg = saved_reg; + return err; +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@ -5408,51 +5486,7 @@ skip_type_check: } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* This is used to refine r0 return value bounds for helpers - * that enforce this value as an upper bound on return values. - * See do_refine_retval_range() for helpers that can refine - * the return value. 
C type of helper is u32 so we pull register - * bound from umax_value however, if negative verifier errors - * out. Only upper bounds can be learned because retval is an - * int type and negative retvals are allowed. - */ - meta->msize_max_value = reg->umax_value; - - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. - */ - if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. - */ - meta = NULL; - - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); - return -EACCES; - } - - if (reg->umin_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; - } - - if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); - return -EACCES; - } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); - if (!err) - err = mark_chain_precision(env, regno); + err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", -- cgit v1.2.3 From 5c073f26f9dc78a6c8194b23eac7537c9692c7d7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:48 +0530 Subject: bpf: Add reference tracking support to kfunc This patch adds verifier support for the PTR_TO_BTF_ID return type of a kfunc to be a reference, by reusing acquire_reference_state/release_reference support for existing in-kernel bpf helpers. We make use of the three kfunc types: - BTF_KFUNC_TYPE_ACQUIRE Return true if kfunc_btf_id is an acquire kfunc. This will acquire_reference_state for the returned PTR_TO_BTF_ID (this is the only allowed return value). Note that an acquire kfunc must always return a PTR_TO_BTF_ID{_OR_NULL}, otherwise the program is rejected. - BTF_KFUNC_TYPE_RELEASE Return true if kfunc_btf_id is a release kfunc. This will release the reference to the passed in PTR_TO_BTF_ID which has a reference state (from an earlier acquire kfunc). The btf_check_func_arg_match returns the regno (of the argument register, hence > 0) if the kfunc is a release kfunc, and a proper referenced PTR_TO_BTF_ID is being passed to it. This is similar to how the helper call check uses bpf_call_arg_meta to store the ref_obj_id that is later used to release the reference. Similar to in-kernel helpers, we only allow passing one referenced PTR_TO_BTF_ID as an argument. It can also be passed in to a normal kfunc, but in the case of a release kfunc there must always be one PTR_TO_BTF_ID argument that is referenced. - BTF_KFUNC_TYPE_RET_NULL For a kfunc returning PTR_TO_BTF_ID, tells if it can be NULL, hence forcing the caller to check the pointer for NULL before accessing it. Note that taking into account the case fixed by commit 93c230e3f5bd ("bpf: Enforce id generation for all may-be-null register type") we assign a non-zero id for mark_ptr_or_null_reg logic. Later, if more return types are supported by kfunc, which have a _OR_NULL variant, it might be better to move this id generation under a common reg_type_may_be_null check, similar to the case in the commit. Referenced PTR_TO_BTF_ID is currently only limited to kfunc, but can be extended in the future to other BPF helpers as well.
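For illustration (the kfunc names here are hypothetical), an acquire/release pair whose acquire side may return NULL would populate the sets from earlier in the series like so, and then be registered with register_btf_kfunc_id_set() as usual:

  BTF_SET_START(test_check_kfunc_ids)
  BTF_ID(func, bpf_kfunc_test_acquire)
  BTF_ID(func, bpf_kfunc_test_release)
  BTF_SET_END(test_check_kfunc_ids)

  BTF_SET_START(test_acquire_kfunc_ids)
  BTF_ID(func, bpf_kfunc_test_acquire)
  BTF_SET_END(test_acquire_kfunc_ids)

  BTF_SET_START(test_release_kfunc_ids)
  BTF_ID(func, bpf_kfunc_test_release)
  BTF_SET_END(test_release_kfunc_ids)

  BTF_SET_START(test_ret_null_kfunc_ids)
  BTF_ID(func, bpf_kfunc_test_acquire)	/* returned pointer may be NULL */
  BTF_SET_END(test_ret_null_kfunc_ids)

  static const struct btf_kfunc_id_set test_kfunc_set = {
  	.owner        = THIS_MODULE,
  	.check_set    = &test_check_kfunc_ids,
  	.acquire_set  = &test_acquire_kfunc_ids,
  	.release_set  = &test_release_kfunc_ids,
  	.ret_null_set = &test_ret_null_kfunc_ids,
  };

A program calling bpf_kfunc_test_acquire() must then NULL-check the result and pass it to bpf_kfunc_test_release() before exiting, or verification fails.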
For now, we can rely on the btf_struct_ids_match check to ensure we get the pointer to the expected struct type. In the future, care needs to be taken to avoid ambiguity for reference PTR_TO_BTF_ID passed to release function, in case multiple candidates can release same BTF ID. e.g. there might be two release kfuncs (or kfunc and helper): foo(struct abc *p); bar(struct abc *p); ... such that both release a PTR_TO_BTF_ID with btf_id of struct abc. In this case we would need to track the acquire function corresponding to the release function to avoid type confusion, and store this information in the register state so that an incorrect program can be rejected. This is not a problem right now, hence it is left as an exercise for the future patch introducing such a case in the kernel. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ kernel/bpf/btf.c | 32 +++++++++++++++++++++++++-- kernel/bpf/verifier.c | 52 +++++++++++++++++++++++++++++++++++--------- 3 files changed, 77 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 857fd687bdc2..ac4797155412 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -566,4 +566,9 @@ static inline u32 type_flag(u32 type) return type & ~BPF_BASE_TYPE_MASK; } +static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +{ + return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; +} + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cf46694cb266..57f5fd5af2f9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5686,11 +5686,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, bool ptr_to_mem_ok) { struct bpf_verifier_log *log = &env->log; + u32 i, nargs, ref_id, ref_obj_id = 0; bool is_kfunc = btf_is_kernel(btf); const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - u32 i, nargs, ref_id; + int ref_regno = 0; + bool rel = false; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -5768,6 +5770,16 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; + /* Ensure only one argument is referenced PTR_TO_BTF_ID */ + if (reg->ref_obj_id) { + if (ref_obj_id) { + bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, ref_obj_id); + return -EFAULT; + } + ref_regno = regno; + ref_obj_id = reg->ref_obj_id; + } } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[reg->type]; @@ -5838,7 +5850,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, } } - return 0; + /* Either both are set, or neither */ + WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); + if (is_kfunc) { + rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RELEASE, func_id); + /* We already made sure ref_obj_id is set only for one argument */ + if (rel && !ref_obj_id) { + bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; + } + /* Allow (!rel && ref_obj_id), so that passing such referenced PTR_TO_BTF_ID to + * other kfuncs works + */ + } + /* returns argument register number > 0 in case of reference release kfunc */ + 
return rel ? ref_regno : 0; } /* Compare BTF of a function with given bpf_reg_state. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2b186185b6b2..8c5a46d41f28 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -452,7 +452,8 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { return base_type(type) == PTR_TO_SOCKET || base_type(type) == PTR_TO_TCP_SOCK || - base_type(type) == PTR_TO_MEM; + base_type(type) == PTR_TO_MEM || + base_type(type) == PTR_TO_BTF_ID; } static bool type_is_rdonly_mem(u32 type) @@ -3493,11 +3494,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, #define MAX_PACKET_OFF 0xffff -static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) -{ - return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; -} - static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_access_type t) @@ -6845,15 +6841,17 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } -static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; + int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; - int err; + bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) @@ -6875,16 +6873,36 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } + acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_ACQUIRE, func_id); + /* Check the arguments */ err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs); - if (err) + if (err < 0) return err; + /* In case of release function, we get register number of refcounted + * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now + */ + if (err) { + err = release_reference(env, regs[err].ref_obj_id); + if (err) { + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, func_id); + return err; + } + } for (i = 0; i < CALLER_SAVED_REGS; i++) mark_reg_not_init(env, regs, caller_saved[i]); /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); + + if (acq && !btf_type_is_ptr(t)) { + verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); + return -EINVAL; + } + if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); mark_btf_func_reg_size(env, BPF_REG_0, t->size); @@ -6903,7 +6921,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; + if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RET_NULL, func_id)) { + regs[BPF_REG_0].type |= PTR_MAYBE_NULL; + /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ + regs[BPF_REG_0].id = ++env->id_gen; + } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + if (acq) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + regs[BPF_REG_0].id = id; + regs[BPF_REG_0].ref_obj_id = id; + } } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ nargs = btf_type_vlen(func_proto); @@ -11548,7 
+11580,7 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) - err = check_kfunc_call(env, insn); + err = check_kfunc_call(env, insn, &env->insn_idx); else err = check_helper_call(env, insn, &env->insn_idx); if (err) -- cgit v1.2.3 From f10d059661968b01ef61a8b516775f95a18ab8ae Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:25 +0000 Subject: bpf: Make BPF_PROG_RUN_ARRAY return -err instead of allow boolean Right now BPF_PROG_RUN_ARRAY and related macros return 1 or 0 for whether the prog array allows or rejects whatever is being hooked. The callers of these macros then return -EPERM or continue processing based on the macro's return value. Unfortunately this is inflexible, since -EPERM is the only error that can be returned. This patch should be a no-op; it prepares for the next patch. The return of -EPERM is moved inside the macros, so the outer functions directly return what the macros returned when it is non-zero. Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/788abcdca55886d1f43274c918eaa9f792a9f33b.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++++++++------- kernel/bpf/cgroup.c | 41 +++++++++++++++-------------------------- security/device_cgroup.c | 2 +- 3 files changed, 25 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6d7346c54d83..83da1764fcfe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1277,7 +1277,7 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); -static __always_inline u32 +static __always_inline int BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog, u32 *ret_flags) @@ -1287,7 +1287,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - u32 ret = 1; + int ret = 0; u32 func_ret; migrate_disable(); @@ -1298,7 +1298,8 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); - ret &= (func_ret & 1); + if (!(func_ret & 1)) + ret = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; } @@ -1308,7 +1309,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, return ret; } -static __always_inline u32 +static __always_inline int BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog) { @@ -1317,7 +1318,7 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - u32 ret = 1; + int ret = 0; migrate_disable(); rcu_read_lock(); @@ -1326,7 +1327,8 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; - ret &= run_prog(prog, ctx); + if (!run_prog(prog, ctx)) + ret = -EPERM; item++; } bpf_reset_run_ctx(old_run_ctx); @@ -1394,7 +1396,7 @@ out: u32 _ret; \ _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \
_cn = _flags & BPF_RET_SET_CN; \ - if (_ret) \ + if (!_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 514b4681a90a..386155d279b3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1080,7 +1080,6 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, } else { ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); - ret = (ret == 1 ? 0 : -EPERM); } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1107,10 +1106,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, + bpf_prog_run); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1142,7 +1140,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, }; struct sockaddr_storage unspec; struct cgroup *cgrp; - int ret; /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). @@ -1156,10 +1153,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, flags); - - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1184,11 +1179,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run); - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, + bpf_prog_run); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1201,15 +1194,15 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, .major = major, .minor = minor, }; - int allow; + int ret; rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run); rcu_read_unlock(); - return !allow; + return ret; } static const struct bpf_func_proto * @@ -1350,7 +1343,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.new_val); } - return ret == 1 ? 
0 : -EPERM; + return ret; } #ifdef CONFIG_NET @@ -1455,10 +1448,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, &ctx, bpf_prog_run); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret) goto out; - } if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ @@ -1565,10 +1556,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, &ctx, bpf_prog_run); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret) goto out; - } if (ctx.optlen > max_optlen || ctx.optlen < 0) { ret = -EFAULT; @@ -1624,8 +1613,8 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], &ctx, bpf_prog_run); - if (!ret) - return -EPERM; + if (ret) + return ret; if (ctx.optlen > *optlen) return -EFAULT; diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 842889f3dcb7..a9f8c63a96d1 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -838,7 +838,7 @@ int devcgroup_check_permission(short type, u32 major, u32 minor, short access) int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access); if (rc) - return -EPERM; + return rc; #ifdef CONFIG_CGROUP_DEVICE return devcgroup_legacy_check_permission(type, major, minor, access); -- cgit v1.2.3 From c4dcfdd406aa2167396ac215e351e5e4dfd7efe3 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:26 +0000 Subject: bpf: Move getsockopt retval to struct bpf_cg_run_ctx The retval value is moved to struct bpf_cg_run_ctx for ease of access in different prog types with different context struct layouts. The helper implementation (to be added in a later patch in the series) can simply perform a container_of from current->bpf_ctx to retrieve bpf_cg_run_ctx. Unfortunately, there is no easy way to access the current task_struct via the verifier BPF bytecode rewrite, aside from possibly calling a helper, so a pointer to the current task is added to struct bpf_sockopt_kern so that the rewritten BPF bytecode can access struct bpf_cg_run_ctx with an indirection. For backward compatibility, if a getsockopt program rejects a syscall by returning 0, an -EPERM will be generated, by having the BPF_PROG_RUN_ARRAY_CG family macros automatically set the retval to -EPERM. Unlike before this patch, this -EPERM will be visible to ctx->retval for any other hooks down the line in the prog array. Additionally, the restriction that getsockopt filters can only set the retval to 0 is removed, considering that certain getsockopt implementations may return optlen. Filters are now able to set the value arbitrarily.
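A sketch of the container_of retrieval described above (the helper itself lands later in the series; the function name is illustrative):

  /* Valid only while a cgroup prog array is being run, since run_ctx
   * lives on the stack of the BPF_PROG_RUN_ARRAY_CG* caller.
   */
  static struct bpf_cg_run_ctx *bpf_current_cg_run_ctx(void)
  {
  	return container_of(current->bpf_ctx, struct bpf_cg_run_ctx,
  			    run_ctx);
  }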
Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/73b0325f5c29912ccea7ea57ec1ed4d388fc1d37.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 ++++++------ include/linux/filter.h | 5 ++- kernel/bpf/cgroup.c | 82 +++++++++++++++++++++++++++++--------------------- 3 files changed, 63 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83da1764fcfe..7b0c11f414d0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1245,6 +1245,7 @@ struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; const struct bpf_prog_array_item *prog_item; + int retval; }; struct bpf_trace_run_ctx { @@ -1280,16 +1281,16 @@ typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); static __always_inline int BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog, - u32 *ret_flags) + int retval, u32 *ret_flags) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - int ret = 0; u32 func_ret; + run_ctx.retval = retval; migrate_disable(); rcu_read_lock(); array = rcu_dereference(array_rcu); @@ -1299,27 +1300,28 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); if (!(func_ret & 1)) - ret = -EPERM; + run_ctx.retval = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; } bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); migrate_enable(); - return ret; + return run_ctx.retval; } static __always_inline int BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog) + const void *ctx, bpf_prog_run_fn run_prog, + int retval) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - int ret = 0; + run_ctx.retval = retval; migrate_disable(); rcu_read_lock(); array = rcu_dereference(array_rcu); @@ -1328,13 +1330,13 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; if (!run_prog(prog, ctx)) - ret = -EPERM; + run_ctx.retval = -EPERM; item++; } bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); migrate_enable(); - return ret; + return run_ctx.retval; } static __always_inline u32 @@ -1394,7 +1396,7 @@ out: u32 _flags = 0; \ bool _cn; \ u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \ + _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ if (!_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 71fa57b88bfc..d23e999dc032 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1356,7 +1356,10 @@ struct bpf_sockopt_kern { s32 level; s32 optname; s32 optlen; - s32 retval; + /* for retval in struct bpf_cg_run_ctx */ + struct task_struct *current_task; + /* Temporary "register" for indirect stores to ppos. 
*/ + u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 386155d279b3..b6fad0bbf5a7 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1079,7 +1079,7 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, - __bpf_prog_run_save_cb); + __bpf_prog_run_save_cb, 0); } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1108,7 +1108,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, - bpf_prog_run); + bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1154,7 +1154,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, flags); + bpf_prog_run, 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1181,7 +1181,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run); + bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1199,7 +1199,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, rcu_read_lock(); cgrp = task_dfl_cgroup(current); ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run); + bpf_prog_run, 0); rcu_read_unlock(); return ret; @@ -1330,7 +1330,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, 0); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1445,7 +1446,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run); + &ctx, bpf_prog_run, 0); release_sock(sk); if (ret) @@ -1509,7 +1510,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, + .current_task = current, }; int ret; @@ -1553,10 +1554,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + &ctx, bpf_prog_run, retval); release_sock(sk); - if (ret) + if (ret < 0) goto out; if (ctx.optlen > max_optlen || ctx.optlen < 0) { @@ -1564,14 +1565,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. 
- */ - if (ctx.retval != 0 && ctx.retval != retval) { - ret = -EFAULT; - goto out; - } - if (ctx.optlen != 0) { if (copy_to_user(optval, ctx.optval, ctx.optlen) || put_user(ctx.optlen, optlen)) { @@ -1580,8 +1573,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } } - ret = ctx.retval; - out: sockopt_free_buf(&ctx, &buf); return ret; @@ -1596,10 +1587,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, .optlen = *optlen, .optval = optval, .optval_end = optval + *optlen, + .current_task = current, }; int ret; @@ -1612,25 +1603,19 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, */ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); - if (ret) + &ctx, bpf_prog_run, retval); + if (ret < 0) return ret; if (ctx.optlen > *optlen) return -EFAULT; - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. - */ - if (ctx.retval != 0 && ctx.retval != retval) - return -EFAULT; - /* BPF programs can shrink the buffer, export the modifications. */ if (ctx.optlen != 0) *optlen = ctx.optlen; - return ctx.retval; + return ret; } #endif @@ -2046,10 +2031,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); break; case offsetof(struct bpf_sockopt, retval): - if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); - else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); + + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + treg, treg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + treg, si->src_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + si->dst_reg, si->dst_reg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + si->dst_reg, si->dst_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + } break; case offsetof(struct bpf_sockopt, optval): *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); -- cgit v1.2.3 From b44123b4a3dcad4664d3a0f72c011ffd4c9c4d93 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:27 +0000 Subject: bpf: Add cgroup helpers bpf_{get,set}_retval to get/set syscall return value The helpers continue to use int for retval because all the hooks are int-returning rather than long-returning. The return value of bpf_set_retval is int for future-proofing, in case in the future there may be errors trying to set the retval. 
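As a minimal sketch of the intended usage (a hypothetical program, not part of the patch, assuming a libbpf recent enough to define the two helpers; the semantics relied on here are described in the next paragraph):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/setsockopt")
int reject_with_einval(struct bpf_sockopt *ctx)
{
        /* Make userspace see -EINVAL instead of the default -EPERM. */
        bpf_set_retval(-EINVAL);
        return 0;       /* 0 == reject the syscall */
}

char _license[] SEC("license") = "GPL";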
After the previous patch, if a program rejects a syscall by returning 0, an -EPERM is generated no matter whether the retval is already set to -err. This patch changes that: -EPERM is now forced only if the retval is not already -err. This is because we want to support, for example, invoking bpf_set_retval(-EINVAL) and returning 0, and have the syscall return value be -EINVAL, not -EPERM. For BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY, the prior behavior is that, if the return value is NET_XMIT_DROP, the packet is silently dropped. We preserve this behavior for backward compatibility reasons, so even if an errno is set, the errno does not return to the caller. However, a non-error retval cannot be propagated this way, so it is not allowed and we return -EFAULT in that case. Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/b4013fd5d16bed0b01977c1fafdeae12e1de61fb.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++---- include/uapi/linux/bpf.h | 18 ++++++++++++++++++ kernel/bpf/cgroup.c | 38 +++++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 18 ++++++++++++++++++ 4 files changed, 79 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7b0c11f414d0..dce54eb0aae8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1299,7 +1299,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); - if (!(func_ret & 1)) + if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; @@ -1329,7 +1329,7 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; - if (!run_prog(prog, ctx)) + if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } @@ -1389,7 +1389,7 @@ out: * 0: NET_XMIT_SUCCESS skb should be transmitted * 1: NET_XMIT_DROP skb should be dropped and cn * 2: NET_XMIT_CN skb should be transmitted and cn - * 3: -EPERM skb should be dropped + * 3: -err skb should be dropped */ #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ ({ \ @@ -1398,10 +1398,12 @@ out: u32 _ret; \ _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ + if (_ret && !IS_ERR_VALUE((long)_ret)) \ + _ret = -EFAULT; \ if (!_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ - _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ + _ret = (_cn ? NET_XMIT_DROP : _ret); \ _ret; \ }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9c96c21330a..fe2272defcd9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5033,6 +5033,22 @@ union bpf_attr { * * Return * The number of arguments of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * The syscall's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * 0 on success, or a negative error in case of failure.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5221,6 +5237,8 @@ union bpf_attr { FN(get_func_arg), \ FN(get_func_ret), \ FN(get_func_arg_cnt), \ + FN(get_retval), \ + FN(set_retval), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b6fad0bbf5a7..279ebbed75a5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr * NET_XMIT_CN (2) - continue with packet output and notify TCP * to call cwr - * -EPERM - drop packet + * -err - drop packet * * For ingress packets, this function will return -EPERM if any * attached program was found and if it returned != 1 during execution. @@ -1080,6 +1080,8 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, } else { ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb, 0); + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1205,6 +1207,36 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, return ret; } +BPF_CALL_0(bpf_get_retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + return ctx->retval; +} + +static const struct bpf_func_proto bpf_get_retval_proto = { + .func = bpf_get_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_1(bpf_set_retval, int, retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + ctx->retval = retval; + return 0; +} + +static const struct bpf_func_proto bpf_set_retval_proto = { + .func = bpf_set_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1217,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; + case BPF_FUNC_get_retval: + return &bpf_get_retval_proto; + case BPF_FUNC_set_retval: + return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a9c96c21330a..fe2272defcd9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5033,6 +5033,22 @@ union bpf_attr { * * Return * The number of arguments of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * The syscall's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * 0 on success, or a negative error in case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5221,6 +5237,8 @@ union bpf_attr { FN(get_func_arg), \ FN(get_func_ret), \ FN(get_func_arg_cnt), \ + FN(get_retval), \ + FN(set_retval), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 748cd5729ac7421091316e32dcdffb0578563880 Mon Sep 17 00:00:00 2001 From: Di Zhu Date: Wed, 19 Jan 2022 09:40:04 +0800 Subject: bpf: support BPF_PROG_QUERY for progs attached to sockmap Right now there is no way to query whether BPF programs are attached to a sockmap or not. Add that ability, so that we can use the standard interface in libbpf to query, such as: bpf_prog_query(mapFd, BPF_SK_SKB_STREAM_PARSER, 0, NULL, ...); where mapFd is the fd of the sockmap. Signed-off-by: Di Zhu Acked-by: Yonghong Song Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220119014005.1209-1-zhudi2@huawei.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++++ kernel/bpf/syscall.c | 5 ++++ net/core/sock_map.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 84 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dce54eb0aae8..80e3387ea3af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2069,6 +2069,9 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog, int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); +int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); + void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else @@ -2122,6 +2125,12 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } + +static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index fa4505f9b611..9e0631f091a6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3318,6 +3318,11 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_FLOW_DISSECTOR: case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_MSG_VERDICT: + case BPF_SK_SKB_VERDICT: + return sock_map_bpf_prog_query(attr, uattr); default: return -EINVAL; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1827669eedd6..2d213c4011db 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1416,38 +1416,50 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog, + u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); - struct bpf_prog **pprog; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - pprog = &progs->msg_parser; + *pprog = &progs->msg_parser; break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->stream_parser; + *pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: if (progs->skb_verdict) return -EBUSY; - pprog = 
&progs->stream_verdict; + *pprog = &progs->stream_verdict; break; case BPF_SK_SKB_VERDICT: if (progs->stream_verdict) return -EBUSY; - pprog = &progs->skb_verdict; + *pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } + return 0; +} + +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) +{ + struct bpf_prog **pprog; + int ret; + + ret = sock_map_prog_lookup(map, &pprog, which); + if (ret) + return ret; + if (old) return psock_replace_prog(pprog, prog, old); @@ -1455,6 +1467,57 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, return 0; } +int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_cnt = 0, flags = 0, ufd = attr->target_fd; + struct bpf_prog **pprog; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + u32 id = 0; + int ret; + + if (attr->query.query_flags) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + rcu_read_lock(); + + ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type); + if (ret) + goto end; + + prog = *pprog; + prog_cnt = !prog ? 0 : 1; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + goto end; + + /* we do not hold the refcnt, the bpf prog may be released + * asynchronously and the id would be set to 0. + */ + id = data_race(prog->aux->id); + if (id == 0) + prog_cnt = 0; + +end: + rcu_read_unlock(); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) || + (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) || + copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + ret = -EFAULT; + + fdput(f); + return ret; +} + static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link) { switch (link->map->map_type) { -- cgit v1.2.3 From d16697cb6261d4cc23422e6b1cb2759df8aa76d0 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:44 +0100 Subject: net: skbuff: add size metadata to skb_shared_info for xdp Introduce an xdp_frags_size field in the skb_shared_info data structure to store the paged size of an xdp_buff/xdp_frame (xdp_frags_size will be used in xdp frags support). In order not to increase the skb_shared_info size, we use a hole left by skb_shared_info alignment. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/8a849819a3e0a143d540f78a3a5add76e17e980d.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bf11e1fbd69b..8131d0de7559 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -557,6 +557,7 @@ struct skb_shared_info { * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; + unsigned int xdp_frags_size; /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ -- cgit v1.2.3 From c2f2cdbeffda7b153c19e0f3d73149c41026c0db Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:52 +0100 Subject: bpf: introduce BPF_F_XDP_HAS_FRAGS flag in prog_flags loading the ebpf program Introduce BPF_F_XDP_HAS_FRAGS and the related field in bpf_prog_aux in order to notify the driver that the loaded program supports xdp frags.
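For illustration (a sketch, not part of the patch; the function name and insn buffer are made up), a userspace loader could request the flag through the raw bpf(2) syscall, using only the uapi added here:

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Load an XDP program that declares xdp frags (multi-buffer) support. */
static int load_frags_aware_xdp(const struct bpf_insn *insns, __u32 insn_cnt)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.prog_type = BPF_PROG_TYPE_XDP;
        attr.insns = (__u64)(unsigned long)insns;
        attr.insn_cnt = insn_cnt;
        attr.license = (__u64)(unsigned long)"GPL";
        attr.prog_flags = BPF_F_XDP_HAS_FRAGS;  /* the new flag */

        return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

The driver can then inspect prog->aux->xdp_has_frags before deciding whether the program can be attached.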
Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/db2e8075b7032a356003f407d1b0deb99adaa0ed.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/syscall.c | 4 +++- tools/include/uapi/linux/bpf.h | 5 +++++ 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80e3387ea3af..e93ed028a030 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -933,6 +933,7 @@ struct bpf_prog_aux { bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; + bool xdp_has_frags; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fe2272defcd9..945649c67e03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1113,6 +1113,11 @@ enum bpf_link_type { */ #define BPF_F_SLEEPABLE (1U << 4) +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. + */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9e0631f091a6..f29090643c6e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2217,7 +2217,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | - BPF_F_TEST_RND_HI32)) + BPF_F_TEST_RND_HI32 | + BPF_F_XDP_HAS_FRAGS)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2303,6 +2304,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; + prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; err = security_bpf_prog_alloc(prog->aux); if (err) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fe2272defcd9..945649c67e03 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1113,6 +1113,11 @@ enum bpf_link_type { */ #define BPF_F_SLEEPABLE (1U << 4) +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. + */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * -- cgit v1.2.3 From f45d5b6ce2e835834c94b8b700787984f02cd662 Mon Sep 17 00:00:00 2001 From: Toke Hoiland-Jorgensen Date: Fri, 21 Jan 2022 11:10:02 +0100 Subject: bpf: generalise tail call map compatibility check The check for tail call map compatibility ensures that tail calls only happen between maps of the same type. To ensure backwards compatibility for XDP frags we need a similar type of check for cpumap and devmap programs, so move the state from bpf_array_aux into bpf_map, add xdp_has_frags to the check, and apply the same check to cpumap and devmap. 
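To illustrate the effect (a hypothetical userspace sketch; the fds and the qsize value are made up), once a cpumap entry records an owner, a later program that disagrees on xdp_has_frags is rejected:

#include <linux/bpf.h>
#include <bpf/bpf.h>

static void cpumap_owner_demo(int cpumap_fd, int frags_prog_fd,
                              int plain_prog_fd)
{
        struct bpf_cpumap_val val = { .qsize = 192 };
        __u32 key = 0;

        /* The first update claims ownership: prog type, JITed flag and
         * xdp_has_frags are recorded in map->owner.
         */
        val.bpf_prog.fd = frags_prog_fd;        /* loaded with BPF_F_XDP_HAS_FRAGS */
        bpf_map_update_elem(cpumap_fd, &key, &val, 0);

        /* This program was loaded without the flag, so it fails the
         * bpf_prog_map_compatible() check and the update returns -EINVAL.
         */
        key = 1;
        val.bpf_prog.fd = plain_prog_fd;
        bpf_map_update_elem(cpumap_fd, &key, &val, 0);
}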
Acked-by: John Fastabend Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Toke Hoiland-Jorgensen Link: https://lore.kernel.org/r/f19fd97c0328a39927f3ad03e1ca6b43fd53cdfd.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 30 +++++++++++++++++++----------- kernel/bpf/arraymap.c | 4 +--- kernel/bpf/core.c | 28 ++++++++++++++-------------- kernel/bpf/cpumap.c | 8 +++++--- kernel/bpf/devmap.c | 3 ++- kernel/bpf/syscall.c | 15 +++++++-------- 6 files changed, 48 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e93ed028a030..e8ec8d2f2fe3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -194,6 +194,17 @@ struct bpf_map { struct work_struct work; struct mutex freeze_mutex; atomic64_t writecnt; + /* 'Ownership' of program-containing map is claimed by the first program + * that is going to use this map or by the first program which FD is + * stored in the map to make sure that all callers and callees have the + * same prog type, JITed flag and xdp_has_frags flag. + */ + struct { + spinlock_t lock; + enum bpf_prog_type type; + bool jited; + bool xdp_has_frags; + } owner; }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -994,16 +1005,6 @@ struct bpf_prog_aux { }; struct bpf_array_aux { - /* 'Ownership' of prog array is claimed by the first program that - * is going to use this map or by the first program which FD is - * stored in the map to make sure that all callers and callees have - * the same prog type and JITed flag. - */ - struct { - spinlock_t lock; - enum bpf_prog_type type; - bool jited; - } owner; /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; @@ -1178,7 +1179,14 @@ struct bpf_event_entry { struct rcu_head rcu; }; -bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +static inline bool map_type_contains_progs(struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_CPUMAP; +} + +bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c7a5be3bf8be..7f145aefbff8 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -837,13 +837,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); if (IS_ERR(prog)) return prog; - if (!bpf_prog_array_compatible(array, prog)) { + if (!bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } @@ -1071,7 +1070,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); - spin_lock_init(&aux->owner.lock); map = array_map_alloc(attr); if (IS_ERR(map)) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index de3e5bc6781f..0a1cfd8544b9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1829,28 +1829,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, } #endif -bool bpf_prog_array_compatible(struct 
bpf_array *array, - const struct bpf_prog *fp) +bool bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { bool ret; if (fp->kprobe_override) return false; - spin_lock(&array->aux->owner.lock); - - if (!array->aux->owner.type) { + spin_lock(&map->owner.lock); + if (!map->owner.type) { /* There's no owner yet where we could check for * compatibility. */ - array->aux->owner.type = fp->type; - array->aux->owner.jited = fp->jited; + map->owner.type = fp->type; + map->owner.jited = fp->jited; + map->owner.xdp_has_frags = fp->aux->xdp_has_frags; ret = true; } else { - ret = array->aux->owner.type == fp->type && - array->aux->owner.jited == fp->jited; + ret = map->owner.type == fp->type && + map->owner.jited == fp->jited && + map->owner.xdp_has_frags == fp->aux->xdp_has_frags; } - spin_unlock(&array->aux->owner.lock); + spin_unlock(&map->owner.lock); + return ret; } @@ -1862,13 +1864,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; - struct bpf_array *array; - if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + if (!map_type_contains_progs(map)) continue; - array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) { + if (!bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b3e6b9422238..650e5d21f90d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -397,7 +397,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, + struct bpf_map *map, int fd) { struct bpf_prog *prog; @@ -405,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + if (prog->expected_attach_type != BPF_XDP_CPUMAP || + !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return -EINVAL; } @@ -457,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; /* Setup kthread */ diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index fe019dbdb3f0..038f6d7a83e4 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -858,7 +858,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; - if (prog->expected_attach_type != BPF_XDP_DEVMAP) + if (prog->expected_attach_type != BPF_XDP_DEVMAP || + !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f29090643c6e..72ce1edde950 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -556,16 +556,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { - const struct bpf_map *map = filp->private_data; - const struct bpf_array *array; + struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { - array = container_of(map, struct bpf_array, map); - spin_lock(&array->aux->owner.lock); - type = 
array->aux->owner.type; - jited = array->aux->owner.jited; - spin_unlock(&array->aux->owner.lock); + if (map_type_contains_progs(map)) { + spin_lock(&map->owner.lock); + type = map->owner.type; + jited = map->owner.jited; + spin_unlock(&map->owner.lock); } seq_printf(m, @@ -874,6 +872,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); + spin_lock_init(&map->owner.lock); map->spin_lock_off = -EINVAL; map->timer_off = -EINVAL; -- cgit v1.2.3 From 376040e47334c6dc6a939a32197acceb00fe4acf Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Mon, 24 Jan 2022 10:54:01 -0800 Subject: bpf: Add bpf_copy_from_user_task() helper This adds a helper for bpf programs to read the memory of other tasks. As an example use case at Meta, we are using a bpf task iterator program and this new helper to print C++ async stack traces for all threads of a given process. Signed-off-by: Kenny Yu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220124185403.468466-3-kennyyu@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/helpers.c | 34 ++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 5 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8c92c974bd12..394305a5e02f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2243,6 +2243,7 @@ extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_strncmp_proto; +extern const struct bpf_func_proto bpf_copy_from_user_task_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 16a7574292a5..4a2f7041ebae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5076,6 +5076,16 @@ union bpf_attr { * associated to *xdp_md*, at *offset*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5269,6 +5279,7 @@ union bpf_attr { FN(xdp_get_buff_len), \ FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ + FN(copy_from_user_task), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 01cfdf40c838..ed2780b76cc1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -671,6 +672,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, + const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) +{ + int ret; + + /* flags is not used yet */ + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(!size)) + return 0; + + ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); + if (ret == size) + return 0; + + memset(dst, 0, size); + /* Return -EFAULT for partial read */ + return ret < 0 ? ret : -EFAULT; +} + +const struct bpf_func_proto bpf_copy_from_user_task_proto = { + .func = bpf_copy_from_user_task, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_BTF_ID, + .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg5_type = ARG_ANYTHING +}; + BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 06a9e220069e..a2024ba32a20 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1235,6 +1235,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; + case BPF_FUNC_copy_from_user_task: + return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_per_cpu_ptr: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 16a7574292a5..4a2f7041ebae 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5076,6 +5076,16 @@ union bpf_attr { * associated to *xdp_md*, at *offset*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5269,6 +5279,7 @@ union bpf_attr { FN(xdp_get_buff_len), \ FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ + FN(copy_from_user_task), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From be6ec5b7026620b931e0fa9287d24ad2cd2ab9b6 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 26 Jan 2022 10:25:55 +0000 Subject: net: xpcs: add support for retrieving supported interface modes Add a function to the xpcs driver to retrieve the supported PHY interface modes, which can be used by drivers to fill in phylink's supported_interfaces mask. We validate the interface bit index to ensure that it fits within the bitmap as xpcs lists PHY_INTERFACE_MODE_MAX in an entry. Tested-by: Wong Vee Khee # Intel EHL Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/pcs/pcs-xpcs.c | 14 ++++++++++++++ include/linux/pcs/pcs-xpcs.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index cd6742e6ba8b..f45821524fab 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -662,6 +662,20 @@ void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported, } EXPORT_SYMBOL_GPL(xpcs_validate); +void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces) +{ + int i, j; + + for (i = 0; i < DW_XPCS_INTERFACE_MAX; i++) { + const struct xpcs_compat *compat = &xpcs->id->compat[i]; + + for (j = 0; j < compat->num_interfaces; j++) + if (compat->interface[j] < PHY_INTERFACE_MODE_MAX) + __set_bit(compat->interface[j], interfaces); + } +} +EXPORT_SYMBOL_GPL(xpcs_get_interfaces); + int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable) { int ret; diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index add077a81b21..3126a4924d92 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -33,6 +33,7 @@ int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface, unsigned int mode); void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported, struct phylink_link_state *state); +void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev, -- cgit v1.2.3 From fe70fb74b56407c1a5be347258082f8abbe7956d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 26 Jan 2022 10:26:10 +0000 Subject: net: stmmac/xpcs: convert to pcs_validate() stmmac explicitly calls the xpcs driver to validate the ethtool linkmodes. This is no longer necessary as phylink now supports validation through a PCS method. Convert both drivers to use this new mechanism. Tested-by: Wong Vee Khee # Intel EHL Signed-off-by: Russell King (Oracle) Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 ----- drivers/net/pcs/pcs-xpcs.c | 27 +++++++++-------------- include/linux/pcs/pcs-xpcs.h | 2 -- 3 files changed, 11 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index bd20920daf7b..029f21b9d452 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -940,7 +940,6 @@ static void stmmac_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state) { - struct stmmac_priv *priv = netdev_priv(to_net_dev(config->dev)); __ETHTOOL_DECLARE_LINK_MODE_MASK(mac_supported) = { 0, }; /* This is very similar to phylink_generic_validate() except that @@ -958,10 +957,6 @@ static void stmmac_validate(struct phylink_config *config, linkmode_and(supported, supported, mac_supported); linkmode_and(state->advertising, state->advertising, mac_supported); - - /* If PCS is supported, check which modes it supports. */ - if (priv->hw->xpcs) - xpcs_validate(priv->hw->xpcs, supported, state); } static void stmmac_mac_config(struct phylink_config *config, unsigned int mode, diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index f45821524fab..61418d4dc0cd 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -632,35 +632,29 @@ static void xpcs_resolve_pma(struct dw_xpcs *xpcs, } } -void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported, - struct phylink_link_state *state) +static int xpcs_validate(struct phylink_pcs *pcs, unsigned long *supported, + const struct phylink_link_state *state) { - __ETHTOOL_DECLARE_LINK_MODE_MASK(xpcs_supported); + __ETHTOOL_DECLARE_LINK_MODE_MASK(xpcs_supported) = { 0, }; const struct xpcs_compat *compat; + struct dw_xpcs *xpcs; int i; - /* phylink expects us to report all supported modes with - * PHY_INTERFACE_MODE_NA, just don't limit the supported and - * advertising masks and exit. - */ - if (state->interface == PHY_INTERFACE_MODE_NA) - return; - - linkmode_zero(xpcs_supported); - + xpcs = phylink_pcs_to_xpcs(pcs); compat = xpcs_find_compat(xpcs->id, state->interface); - /* Populate the supported link modes for this - * PHY interface type + /* Populate the supported link modes for this PHY interface type. + * FIXME: what about the port modes and autoneg bit? This masks + * all those away. 
*/ if (compat) for (i = 0; compat->supported[i] != __ETHTOOL_LINK_MODE_MASK_NBITS; i++) set_bit(compat->supported[i], xpcs_supported); linkmode_and(supported, supported, xpcs_supported); - linkmode_and(state->advertising, state->advertising, xpcs_supported); + + return 0; } -EXPORT_SYMBOL_GPL(xpcs_validate); void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces) { @@ -1120,6 +1114,7 @@ static const struct xpcs_id xpcs_id_list[] = { }; static const struct phylink_pcs_ops xpcs_phylink_ops = { + .pcs_validate = xpcs_validate, .pcs_config = xpcs_config, .pcs_get_state = xpcs_get_state, .pcs_link_up = xpcs_link_up, diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 3126a4924d92..266eb26fb029 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -31,8 +31,6 @@ void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode, phy_interface_t interface, int speed, int duplex); int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface, unsigned int mode); -void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported, - struct phylink_link_state *state); void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); -- cgit v1.2.3 From 8033c6c2fed235b3d571b5a5ede302b752bc5c7d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 10:54:12 -0800 Subject: bpf: remove unused static inlines Remove two dead stubs, sk_msg_clear_meta() was never used, use of xskq_cons_is_full() got replaced by xsk_tx_writeable() in v5.10. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220126185412.2776254-1-kuba@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ---------- include/linux/skmsg.h | 5 ----- net/xdp/xsk_queue.h | 7 ------- 3 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 394305a5e02f..2344f793c4dc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1875,11 +1875,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline bool dev_map_can_have_prog(struct bpf_map *map) -{ - return false; -} - static inline void __dev_flush(void) { } @@ -1943,11 +1938,6 @@ static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, return -EOPNOTSUPP; } -static inline bool cpu_map_prog_allowed(struct bpf_map *map) -{ - return false; -} - static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 18a717fe62eb..ddde5f620901 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -171,11 +171,6 @@ static inline u32 sk_msg_iter_dist(u32 start, u32 end) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) -static inline void sk_msg_clear_meta(struct sk_msg *msg) -{ - memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy)); -} - static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index e9aa2c236356..79be3e53ddf1 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -304,13 +304,6 @@ static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) q->cached_cons += cnt; } -static inline bool xskq_cons_is_full(struct xsk_queue *q) -{ - /* No barriers needed since data is not accessed */ - return 
READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer) == - q->nentries; -} - static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ -- cgit v1.2.3 From bd5daba2d02479729526d35de85807aadf6fba20 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:10:55 -0800 Subject: mii: remove mii_lpa_to_linkmode_lpa_sgmii() The only caller of mii_lpa_to_linkmode_lpa_sgmii() disappeared in v5.10. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/mii.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index 12ea29e04293..b8a1a17a87dd 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -387,23 +387,6 @@ mii_lpa_mod_linkmode_lpa_sgmii(unsigned long *lp_advertising, u32 lpa) speed_duplex == LPA_SGMII_10FULL); } -/** - * mii_lpa_to_linkmode_adv_sgmii - * @advertising: pointer to destination link mode. - * @lpa: value of the MII_LPA register - * - * A small helper function that translates MII_ADVERTISE bits - * to linkmode advertisement settings when in SGMII mode. - * Clears the old value of advertising. - */ -static inline void mii_lpa_to_linkmode_lpa_sgmii(unsigned long *lp_advertising, - u32 lpa) -{ - linkmode_zero(lp_advertising); - - mii_lpa_mod_linkmode_lpa_sgmii(lp_advertising, lpa); -} - /** * mii_adv_mod_linkmode_adv_t * @advertising:pointer to destination link mode. -- cgit v1.2.3 From b1755400b4be33dbd286272b153579631be2e2ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:10:57 -0800 Subject: net: remove net_invalid_timestamp() No callers since v3.15. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8131d0de7559..60bd2347708c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3899,11 +3899,6 @@ static inline ktime_t net_timedelta(ktime_t t) return ktime_sub(ktime_get_real(), t); } -static inline ktime_t net_invalid_timestamp(void) -{ - return 0; -} - static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; -- cgit v1.2.3 From 08dfa5a19e1f4344ce5d3a5eed4c5529adafe0dc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:10:58 -0800 Subject: net: remove linkmode_change_bit() No callers since v5.7, the initial use case seems pretty esoteric so removing this should not harm the completeness of the API. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/linkmode.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index f8397f300fcd..15e0e0209da4 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -66,11 +66,6 @@ static inline void linkmode_mod_bit(int nr, volatile unsigned long *addr, linkmode_clear_bit(nr, addr); } -static inline void linkmode_change_bit(int nr, volatile unsigned long *addr) -{ - __change_bit(nr, addr); -} - static inline int linkmode_test_bit(int nr, const volatile unsigned long *addr) { return test_bit(nr, addr); -- cgit v1.2.3 From 8b2d546e23bb3588f897089368b5f09e49f7762d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:02 -0800 Subject: ipv6: remove inet6_rsk() and tcp_twsk_ipv6only() The stubs under !CONFIG_IPV6 were missed when real functions got deleted ca. v3.13. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ipv6.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a59d25f19385..1e0f8a31f3de 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -371,19 +371,12 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) return NULL; } -static inline struct inet6_request_sock * - inet6_rsk(const struct request_sock *rsk) -{ - return NULL; -} - static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return NULL; } #define inet6_rcv_saddr(__sk) NULL -#define tcp_twsk_ipv6only(__sk) 0 #define inet_v6_ipv6only(__sk) 0 #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _IPV6_H */ -- cgit v1.2.3 From cc81df835c25b108248bad24021a21e77cbb84ac Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:04 -0800 Subject: udp: remove inner_udp_hdr() Not used since added in v3.8. Signed-off-by: Jakub Kicinski Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index ae66dadd8543..254a2654400f 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -23,11 +23,6 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) return (struct udphdr *)skb_transport_header(skb); } -static inline struct udphdr *inner_udp_hdr(const struct sk_buff *skb) -{ - return (struct udphdr *)skb_inner_transport_header(skb); -} - #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) -- cgit v1.2.3 From d59a67f2f3f39012271ed3d11c338706a011c5c2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:06 -0800 Subject: netlink: remove nl_set_extack_cookie_u32() Not used since v5.10. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1ec631838af9..bda1c385cffb 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -135,15 +135,6 @@ static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, extack->cookie_len = sizeof(cookie); } -static inline void nl_set_extack_cookie_u32(struct netlink_ext_ack *extack, - u32 cookie) -{ - if (!extack) - return; - memcpy(extack->cookie, &cookie, sizeof(cookie)); - extack->cookie_len = sizeof(cookie); -} - void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); -- cgit v1.2.3 From 46531a30364bd483bfa1b041c15d42a196e77e93 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jan 2022 14:09:13 +0000 Subject: cgroup/bpf: fast path skb BPF filtering Even though there is a static key protecting against the overhead of cgroup-bpf skb filtering when nothing is attached, in many cases it's not enough, as registering a filter for one type will ruin the fast path for all others. This is observed in production servers I've looked at, but also in laptops, where registration is done during init by systemd or something else. Add a per-socket fast path check guarding against such overhead. This affects both receive and transmit paths of TCP, UDP and other protocols. It showed ~1% tx/s improvement in small-payload UDP send benchmarks using a real NIC in a server environment, and the number jumps to 2-3% for preemptible kernels. Reviewed-by: Stanislav Fomichev Signed-off-by: Pavel Begunkov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/d8c58857113185a764927a46f4b5a058d36d3ec3.1643292455.git.asml.silence@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 24 ++++++++++++++++++++---- include/linux/bpf.h | 13 +++++++++++++ kernel/bpf/cgroup.c | 30 ------------------------------ kernel/bpf/core.c | 16 ++++------------ 4 files changed, 37 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index b525d8cdc25b..88a51b242adc 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -8,6 +8,7 @@ #include #include #include +#include #include struct sock; @@ -165,11 +166,23 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); +/* Opportunistic check to see whether we have any BPF program attached*/ +static inline bool cgroup_bpf_sock_enabled(struct sock *sk, + enum cgroup_bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog_array *array; + + array = rcu_access_pointer(cgrp->bpf.effective[type]); + return array != &bpf_empty_prog_array.hdr; +} + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled.
*/ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \ + if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && \ + cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ CGROUP_INET_INGRESS); \ \ @@ -181,7 +194,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ - if (sk_fullsock(__sk)) \ + if (sk_fullsock(__sk) && \ + cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ } \ @@ -347,7 +361,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT) && \ + cgroup_bpf_sock_enabled(sock, CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -367,7 +382,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT) && \ + cgroup_bpf_sock_enabled(sock, CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2344f793c4dc..e3b82ce51445 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1233,6 +1233,19 @@ struct bpf_prog_array { struct bpf_prog_array_item items[]; }; +struct bpf_empty_prog_array { + struct bpf_prog_array hdr; + struct bpf_prog *null_prog; +}; + +/* to avoid allocating empty bpf_prog_array for cgroups that + * don't have bpf program attached use one global 'bpf_empty_prog_array' + * It will not be modified the caller of bpf_prog_array_alloc() + * (since caller requested prog_cnt == 0) + * that pointer should be 'freed' by bpf_prog_array_free() + */ +extern struct bpf_empty_prog_array bpf_empty_prog_array; + struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 279ebbed75a5..098632fdbc45 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1384,20 +1384,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, } #ifdef CONFIG_NET -static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum cgroup_bpf_attach_type attach_type) -{ - struct bpf_prog_array *prog_array; - bool empty; - - rcu_read_lock(); - prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); - empty = bpf_prog_array_is_empty(prog_array); - rcu_read_unlock(); - - return empty; -} - static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, struct bpf_sockopt_buf *buf) { @@ -1456,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, }; int ret, max_optlen; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) - return 0; - /* Allocate a bit more than the initial user buffer for * BPF program. 
The canonical use case is overriding * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1550,15 +1528,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, }; int ret; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) - return retval; - ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 0a1cfd8544b9..04a8d5bea552 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1968,18 +1968,10 @@ static struct bpf_prog_dummy { }, }; -/* to avoid allocating empty bpf_prog_array for cgroups that - * don't have bpf program attached use one global 'empty_prog_array' - * It will not be modified the caller of bpf_prog_array_alloc() - * (since caller requested prog_cnt == 0) - * that pointer should be 'freed' by bpf_prog_array_free() - */ -static struct { - struct bpf_prog_array hdr; - struct bpf_prog *null_prog; -} empty_prog_array = { +struct bpf_empty_prog_array bpf_empty_prog_array = { .null_prog = NULL, }; +EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { @@ -1989,12 +1981,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) (prog_cnt + 1), flags); - return &empty_prog_array.hdr; + return &bpf_empty_prog_array.hdr; } void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || progs == &empty_prog_array.hdr) + if (!progs || progs == &bpf_empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } -- cgit v1.2.3 From 7472d5a642c94a0ee1882ff3038de72ffe803a01 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Jan 2022 07:46:00 -0800 Subject: compiler_types: define __user as __attribute__((btf_type_tag("user"))) The __user attribute is currently mainly used by sparse for type checking. The attribute indicates whether a memory access is in user memory address space or not. Such information is important when tracing kernel internal functions or data structures, as accessing user memory often has different mechanisms compared to accessing kernel memory. For example, the perf-probe needs explicit command line specification to indicate a particular argument or string in user-space memory ([1], [2], [3]). Currently, vmlinux BTF is available in the kernel with many distributions. If __user attribute information is available in vmlinux BTF, the explicit user memory access information from users will not be necessary, as the kernel can figure it out by itself with vmlinux BTF. Besides the above possible use for perf/probe, another use case is the bpf verifier. Currently, for the bpf BPF_PROG_TYPE_TRACING type of bpf programs, users can write direct code like p->m1->m2 and "p" could be a function parameter. Without __user information in BTF, the verifier will assume p->m1 is accessing kernel memory and will generate normal loads. Let us say "p" is actually tagged with __user in the source code. In such cases, p->m1 is actually accessing user memory and a direct load is not right and may produce an incorrect result. For such cases, bpf_probe_read_user() will be the correct way to read p->m1.
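As a sketch of the difference in a BPF_PROG_TYPE_TRACING program (the traced function and struct are made up; assumes the usual libbpf headers bpf_helpers.h and bpf_tracing.h):

struct req { int m1; };                 /* hypothetical example type */

SEC("fentry/some_kernel_func")          /* hypothetical attach point */
int BPF_PROG(on_enter, struct req *p)   /* p is __user-tagged in kernel BTF */
{
        int m1 = 0;

        /* A direct p->m1 would be compiled into a kernel load, which is
         * wrong for user memory; copy the value with the user-space read
         * helper instead.
         */
        bpf_probe_read_user(&m1, sizeof(m1), &p->m1);
        return 0;
}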
To support encoding __user information in BTF, a new attribute __attribute__((btf_type_tag(""))) is implemented in clang ([4]). For example, if we have #define __user __attribute__((btf_type_tag("user"))) during kernel compilation, the attribute "user" information will be preserved in dwarf. After pahole converting dwarf to BTF, __user information will be available in vmlinux BTF. The following is an example with latest upstream clang (clang14) and pahole 1.23: [$ ~] cat test.c #define __user __attribute__((btf_type_tag("user"))) int foo(int __user *arg) { return *arg; } [$ ~] clang -O2 -g -c test.c [$ ~] pahole -JV test.o ... [1] INT int size=4 nr_bits=32 encoding=SIGNED [2] TYPE_TAG user type_id=1 [3] PTR (anon) type_id=2 [4] FUNC_PROTO (anon) return=1 args=(3 arg) [5] FUNC foo type_id=4 [$ ~] You can see for the function argument "int __user *arg", its type is described as PTR -> TYPE_TAG(user) -> INT The kernel can use this information for bpf verification or other use cases. Current btf_type_tag is only supported in clang (>= clang14) and pahole (>= 1.23). gcc support is also proposed and under development ([5]). [1] http://lkml.kernel.org/r/155789874562.26965.10836126971405890891.stgit@devnote2 [2] http://lkml.kernel.org/r/155789872187.26965.4468456816590888687.stgit@devnote2 [3] http://lkml.kernel.org/r/155789871009.26965.14167558859557329331.stgit@devnote2 [4] https://reviews.llvm.org/D111199 [5] https://lore.kernel.org/bpf/0cbeb2fb-1a18-f690-e360-24b1c90c2a91@fb.com/ Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220127154600.652613-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/compiler_types.h | 3 +++ lib/Kconfig.debug | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3c1795fdb568..3f31ff400432 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -31,6 +31,9 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __kernel # ifdef STRUCTLEAK_PLUGIN # define __user __attribute__((user)) +# elif defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ + __has_attribute(btf_type_tag) +# define __user __attribute__((btf_type_tag("user"))) # else # define __user # endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 14b89aa37c5c..6159859769fa 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -325,6 +325,14 @@ config DEBUG_INFO_BTF config PAHOLE_HAS_SPLIT_BTF def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "119") +config PAHOLE_HAS_BTF_TAG + def_bool $(success, test `$(PAHOLE) --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/'` -ge "123") + depends on CC_IS_CLANG + help + Decide whether pahole emits btf_tag attributes (btf_type_tag and + btf_decl_tag) or not. Currently only clang compiler implements + these attributes, so make the config depend on CC_IS_CLANG. + config DEBUG_INFO_BTF_MODULES def_bool y depends on DEBUG_INFO_BTF && MODULES && PAHOLE_HAS_SPLIT_BTF -- cgit v1.2.3 From c6f1bfe89ac95dc829dcb4ed54780da134ac5fce Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Jan 2022 07:46:06 -0800 Subject: bpf: reject program if a __user tagged memory accessed in kernel way BPF verifier supports direct memory access for BPF_PROG_TYPE_TRACING type of bpf programs, e.g., a->b. 
If "a" is a pointer pointing to kernel memory, bpf verifier will allow user to write code in C like a->b and the verifier will translate it to a kernel load properly. If "a" is a pointer to user memory, it is expected that bpf developer should be bpf_probe_read_user() helper to get the value a->b. Without utilizing BTF __user tagging information, current verifier will assume that a->b is a kernel memory access and this may generate incorrect result. Now BTF contains __user information, it can check whether the pointer points to a user memory or not. If it is, the verifier can reject the program and force users to use bpf_probe_read_user() helper explicitly. In the future, we can easily extend btf_add_space for other address space tagging, for example, rcu/percpu etc. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220127154606.654961-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++++--- include/linux/btf.h | 5 +++++ kernel/bpf/btf.c | 34 ++++++++++++++++++++++++++++------ kernel/bpf/verifier.c | 35 ++++++++++++++++++++++++----------- net/bpf/bpf_dummy_struct_ops.c | 6 ++++-- net/ipv4/bpf_tcp_ca.c | 6 ++++-- 6 files changed, 71 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e3b82ce51445..6eb0b180d33b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -332,7 +332,10 @@ enum bpf_type_flag { */ MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_ALLOC, + /* MEM is in user address space. */ + MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_USER, }; /* Max number of base types. */ @@ -588,7 +591,7 @@ struct bpf_verifier_ops { const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id); + u32 *next_btf_id, enum bpf_type_flag *flag); }; struct bpf_prog_offload_ops { @@ -1780,7 +1783,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id); + u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id); diff --git a/include/linux/btf.h b/include/linux/btf.h index b12cfe3b12bb..f6c43dd513fa 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -238,6 +238,11 @@ static inline bool btf_type_is_var(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; } +static inline bool btf_type_is_type_tag(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG; +} + /* union is only a special case of struct: * all its offsetof(member) == 0 */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b2a248956100..b983cee8d196 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4886,6 +4886,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; + const char *tag_value; u32 nr_args, arg; int i, ret; @@ -5038,6 +5039,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); + + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + if (strcmp(tag_value, "user") == 0) + info->reg_type 
|= MEM_USER; + } + /* skip modifiers */ while (btf_type_is_modifier(t)) { info->btf_id = t->type; @@ -5064,12 +5072,12 @@ enum bpf_struct_walk_result { static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; - const char *tname, *mname; + const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; again: @@ -5253,7 +5261,8 @@ error: } if (btf_type_is_ptr(mtype)) { - const struct btf_type *stype; + const struct btf_type *stype, *t; + enum bpf_type_flag tmp_flag = 0; u32 id; if (msize != size || off != moff) { @@ -5262,9 +5271,19 @@ error: mname, moff, tname, off, size); return -EACCES; } + + /* check __user tag */ + t = btf_type_by_id(btf, mtype->type); + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + if (strcmp(tag_value, "user") == 0) + tmp_flag = MEM_USER; + } + stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; + *flag = tmp_flag; return WALK_PTR; } } @@ -5291,13 +5310,14 @@ error: int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype __maybe_unused, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { + enum bpf_type_flag tmp_flag = 0; int err; u32 id; do { - err = btf_struct_walk(log, btf, t, off, size, &id); + err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); switch (err) { case WALK_PTR: @@ -5305,6 +5325,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, * we're done. */ *next_btf_id = id; + *flag = tmp_flag; return PTR_TO_BTF_ID; case WALK_SCALAR: return SCALAR_VALUE; @@ -5349,6 +5370,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *need_btf, u32 need_type_id) { const struct btf_type *type; + enum bpf_type_flag flag; int err; /* Are we already done? 
*/ @@ -5359,7 +5381,7 @@ again: type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, btf, type, off, 1, &id); + err = btf_struct_walk(log, btf, type, off, 1, &id, &flag); if (err != WALK_STRUCT) return false; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dcf065ec2774..1ae41d0cf96c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -536,7 +536,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}, prefix[16] = {0}; + char postfix[16] = {0}, prefix[32] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "inv", @@ -570,9 +570,11 @@ static const char *reg_type_str(struct bpf_verifier_env *env, } if (type & MEM_RDONLY) - strncpy(prefix, "rdonly_", 16); + strncpy(prefix, "rdonly_", 32); if (type & MEM_ALLOC) - strncpy(prefix, "alloc_", 16); + strncpy(prefix, "alloc_", 32); + if (type & MEM_USER) + strncpy(prefix, "user_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -1547,14 +1549,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, enum bpf_reg_type reg_type, - struct btf *btf, u32 btf_id) + struct btf *btf, u32 btf_id, + enum bpf_type_flag flag) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); return; } mark_reg_known_zero(env, regs, regno); - regs[regno].type = PTR_TO_BTF_ID; + regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; } @@ -4152,6 +4155,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); + enum bpf_type_flag flag = 0; u32 btf_id; int ret; @@ -4171,9 +4175,16 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + if (reg->type & MEM_USER) { + verbose(env, + "R%d is ptr_%s access user memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (env->ops->btf_struct_access) { ret = env->ops->btf_struct_access(&env->log, reg->btf, t, - off, size, atype, &btf_id); + off, size, atype, &btf_id, &flag); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); @@ -4181,14 +4192,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } ret = btf_struct_access(&env->log, reg->btf, t, off, size, - atype, &btf_id); + atype, &btf_id, &flag); } if (ret < 0) return ret; if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); return 0; } @@ -4201,6 +4212,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; + enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; u32 btf_id; @@ -4238,12 +4250,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); if (ret < 0) return ret; if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); + 
mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); return 0; } @@ -4444,7 +4456,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, &btf_id); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, + &btf_id); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index fbc896323bec..d0e54e30658a 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -145,7 +145,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id) + u32 *next_btf_id, + enum bpf_type_flag *flag) { const struct btf_type *state; s32 type_id; @@ -162,7 +163,8 @@ static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, return -EACCES; } - err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id); + err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); if (err < 0) return err; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index b60c9fd7147e..f79ab942f03b 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -96,12 +96,14 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id) + u32 *next_btf_id, + enum bpf_type_flag *flag) { size_t end; if (atype == BPF_READ) - return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); -- cgit v1.2.3 From 9059b04b4108be71397941f4665d5aa79783125a Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 9 Jan 2022 21:46:34 +0200 Subject: net/mlx5: Remove unused TIR modify bitmask enums struct mlx5_ifc_modify_tir_bitmask_bits is used for the bitmask of MODIFY_TIR operations. Remove the unused bitmask enums. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 598ac3bcc901..27145c4d6820 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -63,13 +63,6 @@ enum { MLX5_EVENT_TYPE_CODING_FPGA_QP_ERROR = 0x21 }; -enum { - MLX5_MODIFY_TIR_BITMASK_LRO = 0x0, - MLX5_MODIFY_TIR_BITMASK_INDIRECT_TABLE = 0x1, - MLX5_MODIFY_TIR_BITMASK_HASH = 0x2, - MLX5_MODIFY_TIR_BITMASK_TUNNELED_OFFLOAD_EN = 0x3 -}; - enum { MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0, MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, -- cgit v1.2.3 From b5b3d10ef638fcadc56609961875bf157a92aaa4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Jan 2022 08:33:49 -0800 Subject: net: mii: remove mii_lpa_mod_linkmode_lpa_sgmii() Vladimir points out that since we removed mii_lpa_to_linkmode_lpa_sgmii(), mii_lpa_mod_linkmode_lpa_sgmii() is also no longer called. Suggested-by: Vladimir Oltean Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/mii.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index b8a1a17a87dd..5ee13083cec7 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -354,39 +354,6 @@ static inline u32 mii_adv_to_ethtool_adv_x(u32 adv) return result; } -/** - * mii_lpa_mod_linkmode_adv_sgmii - * @lp_advertising: pointer to destination link mode. - * @lpa: value of the MII_LPA register - * - * A small helper function that translates MII_LPA bits to - * linkmode advertisement settings for SGMII. - * Leaves other bits unchanged. - */ -static inline void -mii_lpa_mod_linkmode_lpa_sgmii(unsigned long *lp_advertising, u32 lpa) -{ - u32 speed_duplex = lpa & LPA_SGMII_DPX_SPD_MASK; - - linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, lp_advertising, - speed_duplex == LPA_SGMII_1000HALF); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, lp_advertising, - speed_duplex == LPA_SGMII_1000FULL); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, lp_advertising, - speed_duplex == LPA_SGMII_100HALF); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, lp_advertising, - speed_duplex == LPA_SGMII_100FULL); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, lp_advertising, - speed_duplex == LPA_SGMII_10HALF); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, lp_advertising, - speed_duplex == LPA_SGMII_10FULL); -} - /** * mii_adv_mod_linkmode_adv_t * @advertising:pointer to destination link mode. -- cgit v1.2.3 From 9690ae60429020f38e4aa2540c306f27eb021bc0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Jan 2022 10:42:59 -0800 Subject: ethtool: add header/data split indication For applications running on a mix of platforms it's useful to have a clear indication whether host's NIC supports the geometry requirements of TCP zero-copy. TCP zero-copy Rx requires data to be neatly placed into memory pages. Most NICs can't do that. This patch is adding GET support only, since the NICs I work with either always have the feature enabled or enable it whenever MTU is set to jumbo. In other words I don't need SET. But adding set should be trivial. (The only note on SET is that we will likely want the setting to be "sticky" and use 0 / `unknown` to reset it back to driver default.) Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 8 ++++++++ include/linux/ethtool.h | 2 ++ include/uapi/linux/ethtool_netlink.h | 7 +++++++ net/ethtool/rings.c | 15 ++++++++++----- 4 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 9d98e0511249..cae28af7a476 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -860,8 +860,16 @@ Kernel response contents: ``ETHTOOL_A_RINGS_RX_JUMBO`` u32 size of RX jumbo ring ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring + ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` u8 TCP header / data split ==================================== ====== =========================== +``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` indicates whether the device is usable with +page-flipping TCP zero-copy receive (``getsockopt(TCP_ZEROCOPY_RECEIVE)``). +If enabled the device is configured to place frame headers and data into +separate buffers. 
The device configuration must make it possible to receive +full memory pages of data, for example because MTU is high enough or through +HW-GRO. + RINGS_SET ========= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 11efc45de66a..e0853f48b75e 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -70,9 +70,11 @@ enum { /** * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring. + * @tcp_data_split: Scatter packet headers and data to separate buffers */ struct kernel_ethtool_ringparam { u32 rx_buf_len; + u8 tcp_data_split; }; /** diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index cca6e474a085..417d4280d7b5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -318,6 +318,12 @@ enum { /* RINGS */ +enum { + ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, + ETHTOOL_TCP_DATA_SPLIT_DISABLED, + ETHTOOL_TCP_DATA_SPLIT_ENABLED, +}; + enum { ETHTOOL_A_RINGS_UNSPEC, ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ @@ -330,6 +336,7 @@ enum { ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ ETHTOOL_A_RINGS_TX, /* u32 */ ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ + ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ /* add new constants above here */ __ETHTOOL_A_RINGS_CNT, diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index c1d5f5e0fdc9..18a5035d3bee 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -53,7 +53,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RINGS_RX_MINI */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ - nla_total_size(sizeof(u32)); /* _RINGS_RX_BUF_LEN */ + nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ + nla_total_size(sizeof(u8)); /* _RINGS_TCP_DATA_SPLIT */ } static int rings_fill_reply(struct sk_buff *skb, @@ -61,9 +62,11 @@ static int rings_fill_reply(struct sk_buff *skb, const struct ethnl_reply_data *reply_base) { const struct rings_reply_data *data = RINGS_REPDATA(reply_base); - const struct kernel_ethtool_ringparam *kernel_ringparam = &data->kernel_ringparam; + const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam; const struct ethtool_ringparam *ringparam = &data->ringparam; + WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED); + if ((ringparam->rx_max_pending && (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX, ringparam->rx_max_pending) || @@ -84,9 +87,11 @@ static int rings_fill_reply(struct sk_buff *skb, ringparam->tx_max_pending) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX, ringparam->tx_pending))) || - (kernel_ringparam->rx_buf_len && - (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, - kernel_ringparam->rx_buf_len)))) + (kr->rx_buf_len && + (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || + (kr->tcp_data_split && + (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, + kr->tcp_data_split)))) return -EMSGSIZE; return 0; -- cgit v1.2.3 From 6cdef8a6ee748ec522aabee049633b16b4115f73 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Jan 2022 12:09:35 -0800 Subject: SUNRPC: add netns refcount tracker to struct svc_xprt struct svc_xprt holds a long lived reference to a netns, it is worth tracking it. Signed-off-by: Eric Dumazet Acked-by: Chuck Lever Signed-off-by: David S. 
Miller --- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svc_xprt.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 571f605bc91e..382af90320ac 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -88,6 +88,7 @@ struct svc_xprt { struct list_head xpt_users; /* callbacks on free */ struct net *xpt_net; + netns_tracker ns_tracker; const struct cred *xpt_cred; struct rpc_xprt *xpt_bc_xprt; /* NFSv4.1 backchannel */ struct rpc_xprt_switch *xpt_bc_xps; /* NFSv4.1 backchannel */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b21ad7994147..db878e833b67 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -162,7 +162,7 @@ static void svc_xprt_free(struct kref *kref) if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) svcauth_unix_info_release(xprt); put_cred(xprt->xpt_cred); - put_net(xprt->xpt_net); + put_net_track(xprt->xpt_net, &xprt->ns_tracker); /* See comment on corresponding get in xs_setup_bc_tcp(): */ if (xprt->xpt_bc_xprt) xprt_put(xprt->xpt_bc_xprt); @@ -198,7 +198,7 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); set_bit(XPT_BUSY, &xprt->xpt_flags); - xprt->xpt_net = get_net(net); + xprt->xpt_net = get_net_track(net, &xprt->ns_tracker, GFP_ATOMIC); strcpy(xprt->xpt_remotebuf, "uninitialized"); } EXPORT_SYMBOL_GPL(svc_xprt_init); -- cgit v1.2.3 From b9a0d6d143ec63132beff04802578c499083f87c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Jan 2022 12:09:37 -0800 Subject: SUNRPC: add netns refcount tracker to struct rpc_xprt Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 955ea4d7af0b..3cdc8d878d81 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -284,6 +284,7 @@ struct rpc_xprt { } stat; struct net *xprt_net; + netns_tracker ns_tracker; const char *servername; const char *address_strings[RPC_DISPLAY_MAX]; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a02de2bddb28..5af484d6ba5e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1835,7 +1835,7 @@ EXPORT_SYMBOL_GPL(xprt_alloc); void xprt_free(struct rpc_xprt *xprt) { - put_net(xprt->xprt_net); + put_net_track(xprt->xprt_net, &xprt->ns_tracker); xprt_free_all_slots(xprt); xprt_free_id(xprt); rpc_sysfs_xprt_destroy(xprt); @@ -2027,7 +2027,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) xprt_init_xid(xprt); - xprt->xprt_net = get_net(net); + xprt->xprt_net = get_net_track(net, &xprt->ns_tracker, GFP_KERNEL); } /** -- cgit v1.2.3 From 73c105ad2a3e142d81fc59761ce8353d0b211b8f Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Fri, 28 Jan 2022 21:32:40 +0300 Subject: phy: make phy_set_max_speed() *void* After following the call tree of phy_set_max_speed(), it became clear that this function never returns anything but 0, so we can change its result type to *void* and drop the result checks from the three drivers that actually bothered to do it... Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Signed-off-by: Sergey Shtylyov Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/ethernet/renesas/ravb_main.c | 8 +------- drivers/net/ethernet/renesas/sh_eth.c | 10 ++-------- drivers/net/phy/aquantia_main.c | 4 +--- drivers/net/phy/phy-core.c | 22 ++++++++-------------- include/linux/phy.h | 2 +- 5 files changed, 13 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index b215cde68e10..80366661a361 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1432,11 +1432,7 @@ static int ravb_phy_init(struct net_device *ndev) * at this time. */ if (soc_device_match(r8a7795es10)) { - err = phy_set_max_speed(phydev, SPEED_100); - if (err) { - netdev_err(ndev, "failed to limit PHY to 100Mbit/s\n"); - goto err_phy_disconnect; - } + phy_set_max_speed(phydev, SPEED_100); netdev_info(ndev, "limited PHY to 100Mbit/s\n"); } @@ -1457,8 +1453,6 @@ static int ravb_phy_init(struct net_device *ndev) return 0; -err_phy_disconnect: - phy_disconnect(phydev); err_deregister_fixed_link: if (of_phy_is_fixed_link(np)) of_phy_deregister_fixed_link(np); diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index d947a628e166..8aa91e99227d 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2026,14 +2026,8 @@ static int sh_eth_phy_init(struct net_device *ndev) } /* mask with MAC supported features */ - if (mdp->cd->register_type != SH_ETH_REG_GIGABIT) { - int err = phy_set_max_speed(phydev, SPEED_100); - if (err) { - netdev_err(ndev, "failed to limit PHY to 100 Mbit/s\n"); - phy_disconnect(phydev); - return err; - } - } + if (mdp->cd->register_type != SH_ETH_REG_GIGABIT) + phy_set_max_speed(phydev, SPEED_100); phy_attached_info(phydev); diff --git a/drivers/net/phy/aquantia_main.c b/drivers/net/phy/aquantia_main.c index 968dd43a2b1e..a8db1a19011b 100644 --- a/drivers/net/phy/aquantia_main.c +++ b/drivers/net/phy/aquantia_main.c @@ -533,9 +533,7 @@ static int aqcs109_config_init(struct phy_device *phydev) * PMA speed ability bits are the same for all members of the family, * AQCS109 however supports speeds up to 2.5G only. */ - ret = phy_set_max_speed(phydev, SPEED_2500); - if (ret) - return ret; + phy_set_max_speed(phydev, SPEED_2500); return aqr107_set_downshift(phydev, MDIO_AN_VEND_PROV_DOWNSHIFT_DFLT); } diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 271fc01f7f7f..2001f3329133 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -243,7 +243,7 @@ size_t phy_speeds(unsigned int *speeds, size_t size, return count; } -static int __set_linkmode_max_speed(u32 max_speed, unsigned long *addr) +static void __set_linkmode_max_speed(u32 max_speed, unsigned long *addr) { const struct phy_setting *p; int i; @@ -254,13 +254,11 @@ static int __set_linkmode_max_speed(u32 max_speed, unsigned long *addr) else break; } - - return 0; } -static int __set_phy_supported(struct phy_device *phydev, u32 max_speed) +static void __set_phy_supported(struct phy_device *phydev, u32 max_speed) { - return __set_linkmode_max_speed(max_speed, phydev->supported); + __set_linkmode_max_speed(max_speed, phydev->supported); } /** @@ -273,17 +271,11 @@ static int __set_phy_supported(struct phy_device *phydev, u32 max_speed) * is connected to a 1G PHY. This function allows the MAC to indicate its * maximum speed, and so limit what the PHY will advertise. 
*/ -int phy_set_max_speed(struct phy_device *phydev, u32 max_speed) +void phy_set_max_speed(struct phy_device *phydev, u32 max_speed) { - int err; - - err = __set_phy_supported(phydev, max_speed); - if (err) - return err; + __set_phy_supported(phydev, max_speed); phy_advertise_supported(phydev); - - return 0; } EXPORT_SYMBOL(phy_set_max_speed); @@ -440,7 +432,9 @@ int phy_speed_down_core(struct phy_device *phydev) if (min_common_speed == SPEED_UNKNOWN) return -EINVAL; - return __set_linkmode_max_speed(min_common_speed, phydev->advertising); + __set_linkmode_max_speed(min_common_speed, phydev->advertising); + + return 0; } static void mmd_phy_indirect(struct mii_bus *bus, int phy_addr, int devad, diff --git a/include/linux/phy.h b/include/linux/phy.h index 6de8d7a90d78..cd08cf1a8b0d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1661,7 +1661,7 @@ int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); -int phy_set_max_speed(struct phy_device *phydev, u32 max_speed); +void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); -- cgit v1.2.3 From 6d3ac94bae21de6b6a1d81596752428960012816 Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Fri, 14 Jan 2022 08:11:02 +0800 Subject: ssb: fix boolreturn.cocci warning The coccinelle report: ./include/linux/ssb/ssb_driver_gige.h:98:8-9: WARNING: return of 0/1 in function 'ssb_gige_must_flush_posted_writes' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Reported-by: Zeal Robot Signed-off-by: Yang Guang Signed-off-by: David Yang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/fa4f1fa737e715eb62a85229ac5f12bae21145cf.1642065490.git.davidcomponentone@gmail.com --- include/linux/ssb/ssb_driver_gige.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb_driver_gige.h b/include/linux/ssb/ssb_driver_gige.h index 15ba0df1ee0d..28c145a51e57 100644 --- a/include/linux/ssb/ssb_driver_gige.h +++ b/include/linux/ssb/ssb_driver_gige.h @@ -95,7 +95,7 @@ static inline bool ssb_gige_must_flush_posted_writes(struct pci_dev *pdev) struct ssb_gige *dev = pdev_to_ssb_gige(pdev); if (dev) return (dev->dev->bus->chip_id == 0x4785); - return 0; + return false; } /* Get the device MAC address */ -- cgit v1.2.3 From 3ec762fb13c7e7273800b94c80db1c2cc37590d1 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Wed, 2 Feb 2022 01:03:23 +0100 Subject: net: dsa: tag_qca: move define to include linux/dsa Move the tag_qca defines to the include dir linux/dsa, as the qca8k driver requires access to the tagger defines to support in-band mdio read/write using Ethernet packets. Signed-off-by: Ansuel Smith Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S.
Miller --- include/linux/dsa/tag_qca.h | 21 +++++++++++++++++++++ net/dsa/tag_qca.c | 16 +--------------- 2 files changed, 22 insertions(+), 15 deletions(-) create mode 100644 include/linux/dsa/tag_qca.h (limited to 'include/linux') diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h new file mode 100644 index 000000000000..c02d2d39ff4a --- /dev/null +++ b/include/linux/dsa/tag_qca.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __TAG_QCA_H +#define __TAG_QCA_H + +#define QCA_HDR_LEN 2 +#define QCA_HDR_VERSION 0x2 + +#define QCA_HDR_RECV_VERSION GENMASK(15, 14) +#define QCA_HDR_RECV_PRIORITY GENMASK(13, 11) +#define QCA_HDR_RECV_TYPE GENMASK(10, 6) +#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3) +#define QCA_HDR_RECV_SOURCE_PORT GENMASK(2, 0) + +#define QCA_HDR_XMIT_VERSION GENMASK(15, 14) +#define QCA_HDR_XMIT_PRIORITY GENMASK(13, 11) +#define QCA_HDR_XMIT_CONTROL GENMASK(10, 8) +#define QCA_HDR_XMIT_FROM_CPU BIT(7) +#define QCA_HDR_XMIT_DP_BIT GENMASK(6, 0) + +#endif /* __TAG_QCA_H */ diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 55fa6b96b4eb..34e565e00ece 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -5,24 +5,10 @@ #include #include +#include #include "dsa_priv.h" -#define QCA_HDR_LEN 2 -#define QCA_HDR_VERSION 0x2 - -#define QCA_HDR_RECV_VERSION GENMASK(15, 14) -#define QCA_HDR_RECV_PRIORITY GENMASK(13, 11) -#define QCA_HDR_RECV_TYPE GENMASK(10, 6) -#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3) -#define QCA_HDR_RECV_SOURCE_PORT GENMASK(2, 0) -#define QCA_HDR_XMIT_VERSION GENMASK(15, 14) -#define QCA_HDR_XMIT_PRIORITY GENMASK(13, 11) -#define QCA_HDR_XMIT_CONTROL GENMASK(10, 8) -#define QCA_HDR_XMIT_FROM_CPU BIT(7) -#define QCA_HDR_XMIT_DP_BIT GENMASK(6, 0) - static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); -- cgit v1.2.3 From c2ee8181fddb293d296477f60b3eb4fa3ce4e1a6 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Wed, 2 Feb 2022 01:03:25 +0100 Subject: net: dsa: tag_qca: add define for handling mgmt Ethernet packet Add all the required defines to prepare support for mgmt read/write in Ethernet packets. Any packet of this type has to be dropped, as the only use of these special packets is to receive the ack for an mgmt write request or the data for an mgmt read request. A struct is used that emulates the Ethernet header but is used for a different purpose. Signed-off-by: Ansuel Smith Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/tag_qca.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ net/dsa/tag_qca.c | 15 ++++++++++++--- 2 files changed, 56 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index c02d2d39ff4a..f366422ab7a0 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -12,10 +12,54 @@ #define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3) #define QCA_HDR_RECV_SOURCE_PORT GENMASK(2, 0) +/* Packet type for recv */ +#define QCA_HDR_RECV_TYPE_NORMAL 0x0 +#define QCA_HDR_RECV_TYPE_MIB 0x1 +#define QCA_HDR_RECV_TYPE_RW_REG_ACK 0x2 + #define QCA_HDR_XMIT_VERSION GENMASK(15, 14) #define QCA_HDR_XMIT_PRIORITY GENMASK(13, 11) #define QCA_HDR_XMIT_CONTROL GENMASK(10, 8) #define QCA_HDR_XMIT_FROM_CPU BIT(7) #define QCA_HDR_XMIT_DP_BIT GENMASK(6, 0) +/* Packet type for xmit */ +#define QCA_HDR_XMIT_TYPE_NORMAL 0x0 +#define QCA_HDR_XMIT_TYPE_RW_REG 0x1 + +/* Check code for a valid mgmt packet.
Switch will ignore the packet + * with this wrong. + */ +#define QCA_HDR_MGMT_CHECK_CODE_VAL 0x5 + +/* Specific define for in-band MDIO read/write with Ethernet packet */ +#define QCA_HDR_MGMT_SEQ_LEN 4 /* 4 byte for the seq */ +#define QCA_HDR_MGMT_COMMAND_LEN 4 /* 4 byte for the command */ +#define QCA_HDR_MGMT_DATA1_LEN 4 /* First 4 byte for the mdio data */ +#define QCA_HDR_MGMT_HEADER_LEN (QCA_HDR_MGMT_SEQ_LEN + \ + QCA_HDR_MGMT_COMMAND_LEN + \ + QCA_HDR_MGMT_DATA1_LEN) + +#define QCA_HDR_MGMT_DATA2_LEN 12 /* Other 12 byte for the mdio data */ +#define QCA_HDR_MGMT_PADDING_LEN 34 /* Padding to reach the min Ethernet packet */ + +#define QCA_HDR_MGMT_PKT_LEN (QCA_HDR_MGMT_HEADER_LEN + \ + QCA_HDR_LEN + \ + QCA_HDR_MGMT_DATA2_LEN + \ + QCA_HDR_MGMT_PADDING_LEN) + +#define QCA_HDR_MGMT_SEQ_NUM GENMASK(31, 0) /* 63, 32 */ +#define QCA_HDR_MGMT_CHECK_CODE GENMASK(31, 29) /* 31, 29 */ +#define QCA_HDR_MGMT_CMD BIT(28) /* 28 */ +#define QCA_HDR_MGMT_LENGTH GENMASK(23, 20) /* 23, 20 */ +#define QCA_HDR_MGMT_ADDR GENMASK(18, 0) /* 18, 0 */ + +/* Special struct emulating a Ethernet header */ +struct qca_mgmt_ethhdr { + u32 command; /* command bit 31:0 */ + u32 seq; /* seq 63:32 */ + u32 mdio_data; /* first 4byte mdio */ + __be16 hdr; /* qca hdr */ +} __packed; + #endif /* __TAG_QCA_H */ diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index f8df49d5956f..f17ed5be20c2 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -32,10 +32,12 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) { - u8 ver; - u16 hdr; - int port; + u8 ver, pk_type; __be16 *phdr; + int port; + u16 hdr; + + BUILD_BUG_ON(sizeof(struct qca_mgmt_ethhdr) != QCA_HDR_MGMT_HEADER_LEN + QCA_HDR_LEN); if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) return NULL; @@ -48,6 +50,13 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) if (unlikely(ver != QCA_HDR_VERSION)) return NULL; + /* Get pk type */ + pk_type = FIELD_GET(QCA_HDR_RECV_TYPE, hdr); + + /* Ethernet MDIO read/write packet */ + if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) + return NULL; + /* Remove QCA tag and recalculate checksum */ skb_pull_rcsum(skb, QCA_HDR_LEN); dsa_strip_etype_header(skb, QCA_HDR_LEN); -- cgit v1.2.3 From 18be654a4345f7d937b4bfbad74bea8093e3a93c Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Wed, 2 Feb 2022 01:03:26 +0100 Subject: net: dsa: tag_qca: add define for handling MIB packet Add struct to correctly parse a mib Ethernet packet. Signed-off-by: Ansuel Smith Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/dsa/tag_qca.h | 10 ++++++++++ net/dsa/tag_qca.c | 4 ++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index f366422ab7a0..1fff57f2937b 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -62,4 +62,14 @@ struct qca_mgmt_ethhdr { __be16 hdr; /* qca hdr */ } __packed; +enum mdio_cmd { + MDIO_WRITE = 0x0, + MDIO_READ +}; + +struct mib_ethhdr { + u32 data[3]; /* first 3 mib counter */ + __be16 hdr; /* qca hdr */ +} __packed; + #endif /* __TAG_QCA_H */ diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index f17ed5be20c2..be792cf687d9 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -57,6 +57,10 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) return NULL; + /* Ethernet MIB counter packet */ + if (pk_type == QCA_HDR_RECV_TYPE_MIB) + return NULL; + /* Remove QCA tag and recalculate checksum */ skb_pull_rcsum(skb, QCA_HDR_LEN); dsa_strip_etype_header(skb, QCA_HDR_LEN); -- cgit v1.2.3 From 31eb6b4386ad91930417e3f5c8157a4b5e31cbd5 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Wed, 2 Feb 2022 01:03:27 +0100 Subject: net: dsa: tag_qca: add support for handling mgmt and MIB Ethernet packet Add connect/disconnect helpers to assign the private struct to the DSA switch. Add support for Ethernet mgmt and MIB if the DSA driver provides a handler to correctly parse and elaborate the data. Signed-off-by: Ansuel Smith Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/tag_qca.h | 7 +++++++ net/dsa/tag_qca.c | 39 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index 1fff57f2937b..4359fb0221cf 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -72,4 +72,11 @@ struct mib_ethhdr { __be16 hdr; /* qca hdr */ } __packed; +struct qca_tagger_data { + void (*rw_reg_ack_handler)(struct dsa_switch *ds, + struct sk_buff *skb); + void (*mib_autocast_handler)(struct dsa_switch *ds, + struct sk_buff *skb); +}; + #endif /* __TAG_QCA_H */ diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index be792cf687d9..57d2e00f1e5d 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -5,6 +5,7 @@ #include #include +#include #include #include "dsa_priv.h" @@ -32,6 +33,9 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) { + struct qca_tagger_data *tagger_data; + struct dsa_port *dp = dev->dsa_ptr; + struct dsa_switch *ds = dp->ds; u8 ver, pk_type; __be16 *phdr; int port; @@ -39,6 +43,8 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) BUILD_BUG_ON(sizeof(struct qca_mgmt_ethhdr) != QCA_HDR_MGMT_HEADER_LEN + QCA_HDR_LEN); + tagger_data = ds->tagger_data; + if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) return NULL; @@ -53,13 +59,19 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) /* Get pk type */ pk_type = FIELD_GET(QCA_HDR_RECV_TYPE, hdr); - /* Ethernet MDIO read/write packet */ - if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) + /* Ethernet mgmt read/write packet */ + if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) { + if (likely(tagger_data->rw_reg_ack_handler)) + tagger_data->rw_reg_ack_handler(ds, skb); return
NULL; + } /* Ethernet MIB counter packet */ - if (pk_type == QCA_HDR_RECV_TYPE_MIB) + if (pk_type == QCA_HDR_RECV_TYPE_MIB) { + if (likely(tagger_data->mib_autocast_handler)) + tagger_data->mib_autocast_handler(ds, skb); return NULL; + } /* Remove QCA tag and recalculate checksum */ skb_pull_rcsum(skb, QCA_HDR_LEN); @@ -75,9 +87,30 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) return skb; } +static int qca_tag_connect(struct dsa_switch *ds) +{ + struct qca_tagger_data *tagger_data; + + tagger_data = kzalloc(sizeof(*tagger_data), GFP_KERNEL); + if (!tagger_data) + return -ENOMEM; + + ds->tagger_data = tagger_data; + + return 0; +} + +static void qca_tag_disconnect(struct dsa_switch *ds) +{ + kfree(ds->tagger_data); + ds->tagger_data = NULL; +} + static const struct dsa_device_ops qca_netdev_ops = { .name = "qca", .proto = DSA_TAG_PROTO_QCA, + .connect = qca_tag_connect, + .disconnect = qca_tag_disconnect, .xmit = qca_tag_xmit, .rcv = qca_tag_rcv, .needed_headroom = QCA_HDR_LEN, -- cgit v1.2.3 From 00edb2bac29f2e9c1674e85479fa5e3aa8cc648f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 10:55:44 -0800 Subject: i40e: remove enum i40e_client_state It's not used. Signed-off-by: Jakub Kicinski Reviewed-by: Jesse Brandeburg Tested-by: Gurucharan G Signed-off-by: Tony Nguyen --- include/linux/net/intel/i40e_client.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net/intel/i40e_client.h b/include/linux/net/intel/i40e_client.h index 6b3267b49755..ed42bd5f639f 100644 --- a/include/linux/net/intel/i40e_client.h +++ b/include/linux/net/intel/i40e_client.h @@ -26,11 +26,6 @@ struct i40e_client_version { u8 rsvd; }; -enum i40e_client_state { - __I40E_CLIENT_NULL, - __I40E_CLIENT_REGISTERED -}; - enum i40e_client_instance_state { __I40E_CLIENT_INSTANCE_NONE, __I40E_CLIENT_INSTANCE_OPENED, @@ -190,11 +185,6 @@ struct i40e_client { const struct i40e_client_ops *ops; /* client ops provided by the client */ }; -static inline bool i40e_client_is_registered(struct i40e_client *client) -{ - return test_bit(__I40E_CLIENT_REGISTERED, &client->state); -} - void i40e_client_device_register(struct i40e_info *ldev, struct i40e_client *client); void i40e_client_device_unregister(struct i40e_info *ldev); -- cgit v1.2.3 From b794eecb2af77145d36b9c28f056e242add9c4b2 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 23 Nov 2021 10:25:36 -0800 Subject: ice: add support for DSCP QoS for IDC The ice driver provides QoS information to auxiliary drivers through the exported function ice_get_qos_params. This function doesn't currently support L3 DSCP QoS. Add the necessary defines, structure elements and code to support DSCP QoS through the IIDC functions. 
Signed-off-by: Dave Ertman Signed-off-by: Shiraz Saleem Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_idc.c | 5 +++++ include/linux/net/intel/iidc.h | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/ice/ice_idc.c b/drivers/net/ethernet/intel/ice/ice_idc.c index fc3580167e7b..263a2e7577a2 100644 --- a/drivers/net/ethernet/intel/ice/ice_idc.c +++ b/drivers/net/ethernet/intel/ice/ice_idc.c @@ -227,6 +227,11 @@ void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos) for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) qos->tc_info[i].rel_bw = dcbx_cfg->etscfg.tcbwtable[i]; + + qos->pfc_mode = dcbx_cfg->pfc_mode; + if (qos->pfc_mode == IIDC_DSCP_PFC_MODE) + for (i = 0; i < IIDC_MAX_DSCP_MAPPING; i++) + qos->dscp_map[i] = dcbx_cfg->dscp_map[i]; } EXPORT_SYMBOL_GPL(ice_get_qos_params); diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h index 1289593411d3..1c1332e4df26 100644 --- a/include/linux/net/intel/iidc.h +++ b/include/linux/net/intel/iidc.h @@ -32,6 +32,8 @@ enum iidc_rdma_protocol { }; #define IIDC_MAX_USER_PRIORITY 8 +#define IIDC_MAX_DSCP_MAPPING 64 +#define IIDC_DSCP_PFC_MODE 0x1 /* Struct to hold per RDMA Qset info */ struct iidc_rdma_qset_params { @@ -60,6 +62,8 @@ struct iidc_qos_params { u8 vport_relative_bw; u8 vport_priority_type; u8 num_tc; + u8 pfc_mode; + u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; }; struct iidc_event { -- cgit v1.2.3 From 1bc91a5ddf3eaea0e0ea957cccf3abdcfcace00e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 13:07:01 +0100 Subject: netfilter: conntrack: handle ->destroy hook via nat_ops instead The nat module already exposes a few functions to the conntrack core. Move the nat extension destroy hook to it. After this, no conntrack extension needs a destroy hook. 'struct nf_ct_ext_type' and the register/unregister api can be removed in a followup patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 1 + include/net/netfilter/nf_conntrack_extend.h | 3 --- net/netfilter/nf_conntrack_core.c | 14 ++++++++++++-- net/netfilter/nf_conntrack_extend.c | 21 --------------------- net/netfilter/nf_nat_core.c | 13 +++---------- 5 files changed, 16 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 15e71bfff726..c2c6f332fb90 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -379,6 +379,7 @@ struct nf_nat_hook { unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir); + void (*remove_nat_bysrc)(struct nf_conn *ct); }; extern const struct nf_nat_hook __rcu *nf_nat_hook; diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 87d818414092..343f9194423a 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -79,9 +79,6 @@ void nf_ct_ext_destroy(struct nf_conn *ct); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); struct nf_ct_ext_type { - /* Destroys relationships (can be NULL). 
*/ - void (*destroy)(struct nf_conn *ct); - enum nf_ct_ext_id id; }; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9edd3ae8d62e..8f0c0c0fd329 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -594,7 +594,7 @@ EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); void nf_ct_tmpl_free(struct nf_conn *tmpl) { - nf_ct_ext_destroy(tmpl); + kfree(tmpl->ext); if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) kfree((char *)tmpl - tmpl->proto.tmpl_padto); @@ -1597,7 +1597,17 @@ void nf_conntrack_free(struct nf_conn *ct) */ WARN_ON(refcount_read(&ct->ct_general.use) != 0); - nf_ct_ext_destroy(ct); + if (ct->status & IPS_SRC_NAT_DONE) { + const struct nf_nat_hook *nat_hook; + + rcu_read_lock(); + nat_hook = rcu_dereference(nf_nat_hook); + if (nat_hook) + nat_hook->remove_nat_bysrc(ct); + rcu_read_unlock(); + } + + kfree(ct->ext); kmem_cache_free(nf_conntrack_cachep, ct); cnet = nf_ct_pernet(net); diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 69a6cafcb045..6b772b804ee2 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -89,27 +89,6 @@ static __always_inline unsigned int total_extension_size(void) ; } -void nf_ct_ext_destroy(struct nf_conn *ct) -{ - unsigned int i; - struct nf_ct_ext_type *t; - - for (i = 0; i < NF_CT_EXT_NUM; i++) { - rcu_read_lock(); - t = rcu_dereference(nf_ct_ext_types[i]); - - /* Here the nf_ct_ext_type might have been unregisterd. - * I.e., it has responsible to cleanup private - * area in all conntracks when it is unregisterd. - */ - if (t && t->destroy) - t->destroy(ct); - rcu_read_unlock(); - } - - kfree(ct->ext); -} - void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { unsigned int newlen, newoff, oldlen, alloc; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 2ff20d6a5afb..8cc31d695e36 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -838,7 +838,7 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) return i->status & IPS_NAT_MASK ? 1 : 0; } -static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) +static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; @@ -860,7 +860,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) - __nf_nat_cleanup_conntrack(ct); + nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. @@ -868,15 +868,7 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) return 0; } -/* No one using conntrack by the time this called. 
*/ -static void nf_nat_cleanup_conntrack(struct nf_conn *ct) -{ - if (ct->status & IPS_SRC_NAT_DONE) - __nf_nat_cleanup_conntrack(ct); -} - static struct nf_ct_ext_type nat_extend __read_mostly = { - .destroy = nf_nat_cleanup_conntrack, .id = NF_CT_EXT_NAT, }; @@ -1171,6 +1163,7 @@ static const struct nf_nat_hook nat_hook = { .decode_session = __nf_nat_decode_session, #endif .manip_pkt = nf_nat_manip_pkt, + .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; static int __init nf_nat_init(void) -- cgit v1.2.3 From 20ff32024624102596f2b4083a17a97ca71d6cd8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 16:09:13 +0100 Subject: netfilter: conntrack: pptp: use single option structure Instead of exposing the four hooks individually use a single hook ops structure. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_pptp.h | 38 ++++++++---------- net/ipv4/netfilter/nf_nat_pptp.c | 24 +++++------- net/netfilter/nf_conntrack_pptp.c | 60 +++++++++-------------------- 3 files changed, 45 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_pptp.h b/include/linux/netfilter/nf_conntrack_pptp.h index a28aa289afdc..c3bdb4370938 100644 --- a/include/linux/netfilter/nf_conntrack_pptp.h +++ b/include/linux/netfilter/nf_conntrack_pptp.h @@ -300,26 +300,22 @@ union pptp_ctrl_union { struct PptpSetLinkInfo setlink; }; -extern int -(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb, - struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned int protoff, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq); - -extern int -(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb, - struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned int protoff, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq); - -extern void -(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *exp_orig, - struct nf_conntrack_expect *exp_reply); - -extern void -(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct, - struct nf_conntrack_expect *exp); +struct nf_nat_pptp_hook { + int (*outbound)(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq); + int (*inbound)(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq); + void (*exp_gre)(struct nf_conntrack_expect *exp_orig, + struct nf_conntrack_expect *exp_reply); + void (*expectfn)(struct nf_conn *ct, + struct nf_conntrack_expect *exp); +}; +extern const struct nf_nat_pptp_hook __rcu *nf_nat_pptp_hook; #endif /* _NF_CONNTRACK_PPTP_H */ diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 3f248a19faa3..fab357cc8559 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -295,28 +295,24 @@ pptp_inbound_pkt(struct sk_buff *skb, return NF_ACCEPT; } +static const struct nf_nat_pptp_hook pptp_hooks = { + .outbound = pptp_outbound_pkt, + .inbound = pptp_inbound_pkt, + .exp_gre = pptp_exp_gre, + .expectfn = pptp_nat_expected, +}; + static int __init nf_nat_helper_pptp_init(void) { - BUG_ON(nf_nat_pptp_hook_outbound != NULL); - RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt); - - BUG_ON(nf_nat_pptp_hook_inbound != NULL); - RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt); - - BUG_ON(nf_nat_pptp_hook_exp_gre != NULL); -
RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre); + WARN_ON(nf_nat_pptp_hook != NULL); + RCU_INIT_POINTER(nf_nat_pptp_hook, &pptp_hooks); - BUG_ON(nf_nat_pptp_hook_expectfn != NULL); - RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected); return 0; } static void __exit nf_nat_helper_pptp_fini(void) { - RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL); - RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL); - RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL); - RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL); + RCU_INIT_POINTER(nf_nat_pptp_hook, NULL); synchronize_rcu(); } diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 7d5708b92138..f3fa367b455f 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -45,30 +45,8 @@ MODULE_ALIAS_NFCT_HELPER("pptp"); static DEFINE_SPINLOCK(nf_pptp_lock); -int -(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb, - struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned int protoff, struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_outbound); - -int -(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb, - struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned int protoff, struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_inbound); - -void -(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *expect_orig, - struct nf_conntrack_expect *expect_reply) - __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_exp_gre); - -void -(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct, - struct nf_conntrack_expect *exp) __read_mostly; -EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn); +const struct nf_nat_pptp_hook *nf_nat_pptp_hook; +EXPORT_SYMBOL_GPL(nf_nat_pptp_hook); #if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) /* PptpControlMessageType names */ @@ -111,8 +89,8 @@ EXPORT_SYMBOL(pptp_msg_name); static void pptp_expectfn(struct nf_conn *ct, struct nf_conntrack_expect *exp) { + const struct nf_nat_pptp_hook *hook; struct net *net = nf_ct_net(ct); - typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn; pr_debug("increasing timeouts\n"); /* increase timeout of GRE data channel conntrack entry */ @@ -122,9 +100,9 @@ static void pptp_expectfn(struct nf_conn *ct, /* Can you see how rusty this code is, compared with the pre-2.6.11 * one? 
That's what happened to my shiny newnat of 2002 ;( -HW */ - nf_nat_pptp_expectfn = rcu_dereference(nf_nat_pptp_hook_expectfn); - if (nf_nat_pptp_expectfn && ct->master->status & IPS_NAT_MASK) - nf_nat_pptp_expectfn(ct, exp); + hook = rcu_dereference(nf_nat_pptp_hook); + if (hook && ct->master->status & IPS_NAT_MASK) + hook->expectfn(ct, exp); else { struct nf_conntrack_tuple inv_t; struct nf_conntrack_expect *exp_other; @@ -209,9 +187,9 @@ static void pptp_destroy_siblings(struct nf_conn *ct) static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) { struct nf_conntrack_expect *exp_orig, *exp_reply; + const struct nf_nat_pptp_hook *hook; enum ip_conntrack_dir dir; int ret = 1; - typeof(nf_nat_pptp_hook_exp_gre) nf_nat_pptp_exp_gre; exp_orig = nf_ct_expect_alloc(ct); if (exp_orig == NULL) @@ -239,9 +217,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) IPPROTO_GRE, &callid, &peer_callid); exp_reply->expectfn = pptp_expectfn; - nf_nat_pptp_exp_gre = rcu_dereference(nf_nat_pptp_hook_exp_gre); - if (nf_nat_pptp_exp_gre && ct->status & IPS_NAT_MASK) - nf_nat_pptp_exp_gre(exp_orig, exp_reply); + hook = rcu_dereference(nf_nat_pptp_hook); + if (hook && ct->status & IPS_NAT_MASK) + hook->exp_gre(exp_orig, exp_reply); if (nf_ct_expect_related(exp_orig, 0) != 0) goto out_put_both; if (nf_ct_expect_related(exp_reply, 0) != 0) @@ -279,9 +257,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, enum ip_conntrack_info ctinfo) { struct nf_ct_pptp_master *info = nfct_help_data(ct); + const struct nf_nat_pptp_hook *hook; u_int16_t msg; __be16 cid = 0, pcid = 0; - typeof(nf_nat_pptp_hook_inbound) nf_nat_pptp_inbound; msg = ntohs(ctlh->messageType); pr_debug("inbound control message %s\n", pptp_msg_name(msg)); @@ -383,10 +361,9 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, goto invalid; } - nf_nat_pptp_inbound = rcu_dereference(nf_nat_pptp_hook_inbound); - if (nf_nat_pptp_inbound && ct->status & IPS_NAT_MASK) - return nf_nat_pptp_inbound(skb, ct, ctinfo, - protoff, ctlh, pptpReq); + hook = rcu_dereference(nf_nat_pptp_hook); + if (hook && ct->status & IPS_NAT_MASK) + return hook->inbound(skb, ct, ctinfo, protoff, ctlh, pptpReq); return NF_ACCEPT; invalid: @@ -407,9 +384,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, enum ip_conntrack_info ctinfo) { struct nf_ct_pptp_master *info = nfct_help_data(ct); + const struct nf_nat_pptp_hook *hook; u_int16_t msg; __be16 cid = 0, pcid = 0; - typeof(nf_nat_pptp_hook_outbound) nf_nat_pptp_outbound; msg = ntohs(ctlh->messageType); pr_debug("outbound control message %s\n", pptp_msg_name(msg)); @@ -479,10 +456,9 @@ pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, goto invalid; } - nf_nat_pptp_outbound = rcu_dereference(nf_nat_pptp_hook_outbound); - if (nf_nat_pptp_outbound && ct->status & IPS_NAT_MASK) - return nf_nat_pptp_outbound(skb, ct, ctinfo, - protoff, ctlh, pptpReq); + hook = rcu_dereference(nf_nat_pptp_hook); + if (hook && ct->status & IPS_NAT_MASK) + return hook->outbound(skb, ct, ctinfo, protoff, ctlh, pptpReq); return NF_ACCEPT; invalid: -- cgit v1.2.3 From b93235e68921b9acd38ee309953a3a9808105289 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Feb 2022 14:20:31 -0800 Subject: tls: cap the output scatter list to something reasonable TLS recvmsg() passes user pages as destination for decrypt. The decrypt operation is repeated record by record, each record being 16kB, max. 
TLS allocates an sg_table and uses iov_iter_get_pages() to populate it with enough pages to fit the decrypted record. Even though we decrypt a single message at a time we size the sg_table based on the entire length of the iovec. This leads to unnecessarily large allocations, risking triggering OOM conditions. Use iov_iter_truncate() / iov_iter_reexpand() to construct a "capped" version of iov_iter_npages(). Alternatively we could parametrize iov_iter_npages() to take the size as arg instead of using i->count, or do something else.. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/uio.h | 17 +++++++++++++++++ net/tls/tls_sw.c | 3 ++- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 1198a2bfc9bf..739285fe5a2f 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -273,6 +273,23 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) i->count = count; } +static inline int +iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) +{ + size_t shorted = 0; + int npages; + + if (iov_iter_count(i) > max_bytes) { + shorted = iov_iter_count(i) - max_bytes; + iov_iter_truncate(i, max_bytes); + } + npages = iov_iter_npages(i, INT_MAX); + if (shorted) + iov_iter_reexpand(i, iov_iter_count(i) + shorted); + + return npages; +} + struct csum_state { __wsum csum; size_t off; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index efc84845bb6b..0024a692f0f8 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1433,7 +1433,8 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, if (*zc && (out_iov || out_sg)) { if (out_iov) - n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1; + n_sgout = 1 + + iov_iter_npages_cap(out_iov, INT_MAX, data_len); else n_sgout = sg_nents(out_sg); n_sgin = skb_nsg(skb, rxm->offset + prot->prepend_size, -- cgit v1.2.3 From bed89478934a9803d1a4d97574dcc30e509aa972 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Feb 2022 10:49:38 +0200 Subject: ieee80211: fix -Wcast-qual warnings When enabling -Wcast-qual e.g. via W=3, we get a lot of warnings from this file, whenever it's included. Since the fixes are simple, just do that. 
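To make the warning class concrete, here is a minimal standalone sketch (illustrative only, not taken from the patch; the helper name is invented). Under gcc -Wcast-qual, casting a pointer-to-const through a plain (void *) discards the qualifier and triggers the warning, while (const void *) preserves it:

/* Hedged sketch, not from the patch: compile with gcc -Wcast-qual -c.
 * first_payload_byte() is a made-up helper for illustration only.
 */
static inline const unsigned char *first_payload_byte(const unsigned char *ie)
{
	/* (void *)ie here would warn: "cast discards 'const' qualifier" */
	const unsigned char *p = (const void *)ie;	/* qualifier kept, no warning */

	return p + 1;
}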
Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220202104617.79ec4a4bab29.I8177a0c79d656c552e22c88931d8da06f2977896@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 559b6c644938..60ee7b3f58e7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2427,7 +2427,7 @@ struct ieee80211_tx_pwr_env { static inline u8 ieee80211_he_oper_size(const u8 *he_oper_ie) { - struct ieee80211_he_operation *he_oper = (void *)he_oper_ie; + const struct ieee80211_he_operation *he_oper = (const void *)he_oper_ie; u8 oper_len = sizeof(struct ieee80211_he_operation); u32 he_oper_params; @@ -2460,7 +2460,7 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) static inline const struct ieee80211_he_6ghz_oper * ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) { - const u8 *ret = (void *)&he_oper->optional; + const u8 *ret = (const void *)&he_oper->optional; u32 he_oper_params; if (!he_oper) @@ -2475,7 +2475,7 @@ ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) ret++; - return (void *)ret; + return (const void *)ret; } /* HE Spatial Reuse defines */ @@ -2496,7 +2496,7 @@ ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) static inline u8 ieee80211_he_spr_size(const u8 *he_spr_ie) { - struct ieee80211_he_spr *he_spr = (void *)he_spr_ie; + const struct ieee80211_he_spr *he_spr = (const void *)he_spr_ie; u8 spr_len = sizeof(struct ieee80211_he_spr); u8 he_spr_params; -- cgit v1.2.3 From e70e13e7d4ab8f932f49db1c9500b30a34a6d420 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 4 Feb 2022 01:55:18 +0100 Subject: bpf: Implement bpf_core_types_are_compat(). Adopt libbpf's bpf_core_types_are_compat() for kernel duty by adding an explicit recursion limit of 2, which is enough to handle 2 levels of function prototypes.
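To make the "2 levels of function prototypes" limit concrete, here is a hypothetical sketch (the names below are invented, not from the patch). Comparing the prototype of a function that takes a callback consumes one recursion level for the function's own parameter list and a second for the callback's parameter list; a callback parameter that was itself yet another function pointer would exhaust the limit and make the check return -EINVAL:

/* Hypothetical illustration, not from the patch. */
struct elem;
typedef int (*cmp_fn)(const struct elem *a, const struct elem *b);	/* checked at level 2 */
int sort_elems(struct elem *base, unsigned int nr, cmp_fn cmp);		/* checked at level 1 */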
Signed-off-by: Matteo Croce Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204005519.60361-2-mcroce@linux.microsoft.com --- include/linux/btf.h | 5 +++ kernel/bpf/btf.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 109 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f6c43dd513fa..36bc09b8e890 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -327,6 +327,11 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo( return (const struct btf_var_secinfo *)(t + 1); } +static inline struct btf_param *btf_params(const struct btf_type *t) +{ + return (struct btf_param *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL struct bpf_prog; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9b47972c57a4..11740b300de9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6798,10 +6798,113 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, } EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +#define MAX_TYPES_ARE_COMPAT_DEPTH 2 + +static +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, + int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); + targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + btf_type_skip_modifiers(local_btf, local_p->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + btf_type_skip_modifiers(local_btf, local_type->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + return 0; + } +} + +/* Check local and target types for compatibility. This check is used for + * type-based CO-RE relocations and follow slightly different rules than + * field-based relocations. 
This function assumes that root types were already + * checked for name match. Beyond that initial root-level name check, names + * are completely ignored. Compatibility rules are as follows: + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * kind should match for local and target types (i.e., STRUCT is not + * compatible with UNION); + * - for ENUMs, the size is ignored; + * - for INT, size and signedness are ignored; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - CONST/VOLATILE/RESTRICT modifiers are ignored; + * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; + * - FUNC_PROTOs are compatible if they have compatible signature: same + * number of input args and compatible return and argument types. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + */ int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - return -EOPNOTSUPP; + return __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + MAX_TYPES_ARE_COMPAT_DEPTH); } static bool bpf_core_is_flavor_sep(const char *s) -- cgit v1.2.3 From 0463e320421b313db320bbcf2928ebf49f556b67 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 4 Feb 2022 11:42:10 +0000 Subject: net: phylink: remove phylink_set_10g_modes() phylink_set_10g_modes() is no longer used with the conversion of drivers to phylink_generic_validate(), so we can remove it. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 11 ----------- include/linux/phylink.h | 1 - 2 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 420201858564..5b53a3e23c89 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -132,17 +132,6 @@ void phylink_set_port_modes(unsigned long *mask) } EXPORT_SYMBOL_GPL(phylink_set_port_modes); -void phylink_set_10g_modes(unsigned long *mask) -{ - phylink_set(mask, 10000baseT_Full); - phylink_set(mask, 10000baseCR_Full); - phylink_set(mask, 10000baseSR_Full); - phylink_set(mask, 10000baseLR_Full); - phylink_set(mask, 10000baseLRM_Full); - phylink_set(mask, 10000baseER_Full); -} -EXPORT_SYMBOL_GPL(phylink_set_10g_modes); - static int phylink_is_empty_linkmode(const unsigned long *linkmode) { __ETHTOOL_DECLARE_LINK_MODE_MASK(tmp) = { 0, }; diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 713a0c928b7c..cca149f78d35 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -582,7 +582,6 @@ int phylink_speed_up(struct phylink *pl); #define phylink_test(bm, mode) __phylink_do_bit(test_bit, bm, mode) void phylink_set_port_modes(unsigned long *bits); -void phylink_set_10g_modes(unsigned long *mask); void phylink_helper_basex_speed(struct phylink_link_state *state); void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, -- cgit v1.2.3 From 145c7a793838add5e004e7d49a67654dc7eba147 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 12:15:45 -0800 Subject: ipv6: make mc_forwarding atomic This fixes minor data-races in ip6_mc_input() and batadv_mcast_mla_rtr_flags_softif_get_ipv6() Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/ipv6.h | 2 +- net/batman-adv/multicast.c | 2 +- net/ipv6/addrconf.c | 4 ++-- net/ipv6/ip6_input.c | 2 +- net/ipv6/ip6mr.c | 8 ++++---- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1e0f8a31f3de..16870f86c74d 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -51,7 +51,7 @@ struct ipv6_devconf { __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE - __s32 mc_forwarding; + atomic_t mc_forwarding; #endif __s32 disable_ipv6; __s32 drop_unicast_in_l2_multicast; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index f4004cf0ff6f..9f311fddfaf9 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -134,7 +134,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) { struct inet6_dev *in6_dev = __in6_dev_get(dev); - if (in6_dev && in6_dev->cnf.mc_forwarding) + if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR6; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f927c199a93c..ff1b2484b8ed 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -554,7 +554,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, #ifdef CONFIG_IPV6_MROUTE if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, - devconf->mc_forwarding) < 0) + atomic_read(&devconf->mc_forwarding)) < 0) goto nla_put_failure; #endif if ((all || type == NETCONFA_PROXY_NEIGH) && @@ -5533,7 +5533,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE - array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding; + array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding); #endif array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 80256717868e..d4b1e2c5aa76 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -508,7 +508,7 @@ int ip6_mc_input(struct sk_buff *skb) /* * IPv6 multicast router mode is now supported ;) */ - if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding && + if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) && !(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 7cf73e60e619..541cd0887129 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -732,7 +732,7 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, in6_dev = __in6_dev_get(dev); if (in6_dev) { - in6_dev->cnf.mc_forwarding--; + atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); @@ -900,7 +900,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt, in6_dev = __in6_dev_get(dev); if (in6_dev) { - in6_dev->cnf.mc_forwarding++; + atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); @@ -1551,7 +1551,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); - net->ipv6.devconf_all->mc_forwarding++; + atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } 
write_unlock_bh(&mrt_lock); @@ -1584,7 +1584,7 @@ int ip6mr_sk_done(struct sock *sk) * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ - net->ipv6.devconf_all->mc_forwarding--; + atomic_dec(&net->ipv6.devconf_all->mc_forwarding); write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, -- cgit v1.2.3 From e3ececfe668facd87d920b608349a32607060e66 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 14:42:35 -0800 Subject: ref_tracker: implement use-after-free detection Whenever ref_tracker_dir_init() is called, mark the struct ref_tracker_dir as dead. Test the dead status from ref_tracker_alloc() and ref_tracker_free() This should detect buggy dev_put()/dev_hold() happening too late in netdevice dismantle process. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/ref_tracker.h | 2 ++ lib/ref_tracker.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h index 60f3453be23e..a443abda937d 100644 --- a/include/linux/ref_tracker.h +++ b/include/linux/ref_tracker.h @@ -13,6 +13,7 @@ struct ref_tracker_dir { spinlock_t lock; unsigned int quarantine_avail; refcount_t untracked; + bool dead; struct list_head list; /* List of active trackers */ struct list_head quarantine; /* List of dead trackers */ #endif @@ -26,6 +27,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, INIT_LIST_HEAD(&dir->quarantine); spin_lock_init(&dir->lock); dir->quarantine_avail = quarantine_count; + dir->dead = false; refcount_set(&dir->untracked, 1); stack_depot_init(); } diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c index a6789c0c626b..32ff6bd497f8 100644 --- a/lib/ref_tracker.c +++ b/lib/ref_tracker.c @@ -20,6 +20,7 @@ void ref_tracker_dir_exit(struct ref_tracker_dir *dir) unsigned long flags; bool leak = false; + dir->dead = true; spin_lock_irqsave(&dir->lock, flags); list_for_each_entry_safe(tracker, n, &dir->quarantine, head) { list_del(&tracker->head); @@ -72,6 +73,8 @@ int ref_tracker_alloc(struct ref_tracker_dir *dir, gfp_t gfp_mask = gfp; unsigned long flags; + WARN_ON_ONCE(dir->dead); + if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); @@ -100,6 +103,8 @@ int ref_tracker_free(struct ref_tracker_dir *dir, unsigned int nr_entries; unsigned long flags; + WARN_ON_ONCE(dir->dead); + if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; -- cgit v1.2.3 From 8fd5522f44dcd7f05454ddc4f16d0f821b676cd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 14:42:36 -0800 Subject: ref_tracker: add a count of untracked references We are still chasing a netdev refcount imbalance, and we suspect we have one rogue dev_put() that is consuming a reference taken from a dev_hold_track() To detect this case, allow ref_tracker_alloc() and ref_tracker_free() to be called with a NULL @trackerp parameter, and use a dedicated refcount_t just for them. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/ref_tracker.h | 2 ++ lib/ref_tracker.c | 12 +++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h index a443abda937d..9ca353ab712b 100644 --- a/include/linux/ref_tracker.h +++ b/include/linux/ref_tracker.h @@ -13,6 +13,7 @@ struct ref_tracker_dir { spinlock_t lock; unsigned int quarantine_avail; refcount_t untracked; + refcount_t no_tracker; bool dead; struct list_head list; /* List of active trackers */ struct list_head quarantine; /* List of dead trackers */ @@ -29,6 +30,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, dir->quarantine_avail = quarantine_count; dir->dead = false; refcount_set(&dir->untracked, 1); + refcount_set(&dir->no_tracker, 1); stack_depot_init(); } diff --git a/lib/ref_tracker.c b/lib/ref_tracker.c index 32ff6bd497f8..9c0c2e09df66 100644 --- a/lib/ref_tracker.c +++ b/lib/ref_tracker.c @@ -38,6 +38,7 @@ void ref_tracker_dir_exit(struct ref_tracker_dir *dir) spin_unlock_irqrestore(&dir->lock, flags); WARN_ON_ONCE(leak); WARN_ON_ONCE(refcount_read(&dir->untracked) != 1); + WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1); } EXPORT_SYMBOL(ref_tracker_dir_exit); @@ -75,6 +76,10 @@ int ref_tracker_alloc(struct ref_tracker_dir *dir, WARN_ON_ONCE(dir->dead); + if (!trackerp) { + refcount_inc(&dir->no_tracker); + return 0; + } if (gfp & __GFP_DIRECT_RECLAIM) gfp_mask |= __GFP_NOFAIL; *trackerp = tracker = kzalloc(sizeof(*tracker), gfp_mask); @@ -98,13 +103,18 @@ int ref_tracker_free(struct ref_tracker_dir *dir, struct ref_tracker **trackerp) { unsigned long entries[REF_TRACKER_STACK_ENTRIES]; - struct ref_tracker *tracker = *trackerp; depot_stack_handle_t stack_handle; + struct ref_tracker *tracker; unsigned int nr_entries; unsigned long flags; WARN_ON_ONCE(dir->dead); + if (!trackerp) { + refcount_dec(&dir->no_tracker); + return 0; + } + tracker = *trackerp; if (!tracker) { refcount_dec(&dir->untracked); return -EEXIST; -- cgit v1.2.3 From 4c6c11ea0f7b00a1894803efe980dfaf3b074886 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 14:42:37 -0800 Subject: net: refine dev_put()/dev_hold() debugging We are still chasing some syzbot reports where we think a rogue dev_put() is called with no corresponding prior dev_hold(). Unfortunately it eats a reference on dev->dev_refcnt taken by innocent dev_hold_track(), meaning that the refcount saturation splat comes too late to be useful. Make sure that 'not tracked' dev_put() and dev_hold() better use CONFIG_NET_DEV_REFCNT_TRACKER=y debug infrastructure: Prior patch in the series allowed ref_tracker_alloc() and ref_tracker_free() to be called with a NULL @trackerp parameter, and to use a separate refcount only to detect too many put() even in the following case: dev_hold_track(dev, tracker_1, GFP_ATOMIC); dev_hold(dev); dev_put(dev); dev_put(dev); // Should complain loudly here. dev_put_track(dev, tracker_1); // instead of here Add clarification about netdev_tracker_alloc() role. v2: I replaced the dev_put() in linkwatch_do_dev() with __dev_put() because callers called netdev_tracker_free(). Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 69 +++++++++++++++++++++++++++++++---------------- net/core/dev.c | 2 +- net/core/link_watch.c | 6 ++--- 3 files changed, 50 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e490b84732d1..3fb6fb67ed77 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3817,14 +3817,7 @@ extern unsigned int netdev_budget_usecs; /* Called by rtnetlink.c:rtnl_unlock() */ void netdev_run_todo(void); -/** - * dev_put - release reference to device - * @dev: network device - * - * Release reference to device to allow it to be freed. - * Try using dev_put_track() instead. - */ -static inline void dev_put(struct net_device *dev) +static inline void __dev_put(struct net_device *dev) { if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT @@ -3835,14 +3828,7 @@ static inline void dev_put(struct net_device *dev) } } -/** - * dev_hold - get reference to device - * @dev: network device - * - * Hold reference to device to keep it from being freed. - * Try using dev_hold_track() instead. - */ -static inline void dev_hold(struct net_device *dev) +static inline void __dev_hold(struct net_device *dev) { if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT @@ -3853,11 +3839,24 @@ static inline void dev_hold(struct net_device *dev) } } +static inline void __netdev_tracker_alloc(struct net_device *dev, + netdevice_tracker *tracker, + gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); +#endif +} + +/* netdev_tracker_alloc() can upgrade a prior untracked reference + * taken by dev_get_by_name()/dev_get_by_index() to a tracked one. + */ static inline void netdev_tracker_alloc(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER - ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); + refcount_dec(&dev->refcnt_tracker.no_tracker); + __netdev_tracker_alloc(dev, tracker, gfp); #endif } @@ -3873,8 +3872,8 @@ static inline void dev_hold_track(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { if (dev) { - dev_hold(dev); - netdev_tracker_alloc(dev, tracker, gfp); + __dev_hold(dev); + __netdev_tracker_alloc(dev, tracker, gfp); } } @@ -3883,10 +3882,34 @@ static inline void dev_put_track(struct net_device *dev, { if (dev) { netdev_tracker_free(dev, tracker); - dev_put(dev); + __dev_put(dev); } } +/** + * dev_hold - get reference to device + * @dev: network device + * + * Hold reference to device to keep it from being freed. + * Try using dev_hold_track() instead. + */ +static inline void dev_hold(struct net_device *dev) +{ + dev_hold_track(dev, NULL, GFP_ATOMIC); +} + +/** + * dev_put - release reference to device + * @dev: network device + * + * Release reference to device to allow it to be freed. + * Try using dev_put_track() instead. + */ +static inline void dev_put(struct net_device *dev) +{ + dev_put_track(dev, NULL); +} + static inline void dev_replace_track(struct net_device *odev, struct net_device *ndev, netdevice_tracker *tracker, @@ -3895,11 +3918,11 @@ static inline void dev_replace_track(struct net_device *odev, if (odev) netdev_tracker_free(odev, tracker); - dev_hold(ndev); - dev_put(odev); + __dev_hold(ndev); + __dev_put(odev); if (ndev) - netdev_tracker_alloc(ndev, tracker, gfp); + __netdev_tracker_alloc(ndev, tracker, gfp); } /* Carrier loss detection, dial on demand. 
The functions netif_carrier_on diff --git a/net/core/dev.c b/net/core/dev.c index f79744d99413..1eaa0b88e3ba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10172,7 +10172,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; - dev_hold(dev); + __dev_hold(dev); #else refcount_set(&dev->dev_refcnt, 1); #endif diff --git a/net/core/link_watch.c b/net/core/link_watch.c index b0f5344d1185..95098d1a49bd 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -166,10 +166,10 @@ static void linkwatch_do_dev(struct net_device *dev) netdev_state_change(dev); } - /* Note: our callers are responsible for - * calling netdev_tracker_free(). + /* Note: our callers are responsible for calling netdev_tracker_free(). + * This is the reason we use __dev_put() instead of dev_put(). */ - dev_put(dev); + __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) -- cgit v1.2.3 From 5a8fb33e530512ee67a11b30f3451a4f030f4b01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 20:56:14 -0800 Subject: skmsg: convert struct sk_msg_sg::copy to a bitmap We have plans for increasing MAX_SKB_FRAGS, but sk_msg_sg::copy is currently an unsigned long, limiting MAX_SKB_FRAGS to 30 on 32bit arches. Convert it to a bitmap, as Jakub suggested. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skmsg.h | 11 +++++------ net/core/filter.c | 4 ++-- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 18a717fe62eb..1ff68a88c58d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -29,7 +29,7 @@ struct sk_msg_sg { u32 end; u32 size; u32 copybreak; - unsigned long copy; + DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2); /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the @@ -38,7 +38,6 @@ struct sk_msg_sg { */ struct scatterlist data[MAX_MSG_FRAGS + 2]; }; -static_assert(BITS_PER_LONG >= NR_MSG_FRAG_IDS); /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { @@ -234,7 +233,7 @@ static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); - if (test_bit(msg->sg.start, &msg->sg.copy)) { + if (test_bit(msg->sg.start, msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { @@ -253,7 +252,7 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, sg_set_page(sge, page, len, offset); sg_unmark_end(sge); - __set_bit(msg->sg.end, &msg->sg.copy); + __set_bit(msg->sg.end, msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } @@ -262,9 +261,9 @@ static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) - __set_bit(i, &msg->sg.copy); + __set_bit(i, msg->sg.copy); else - __clear_bit(i, &msg->sg.copy); + __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; diff --git a/net/core/filter.c b/net/core/filter.c index 9615ae1ab530..f497ca7a16d2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2603,7 +2603,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start, * account for the headroom. 
*/ bytes_sg_total = start - offset + bytes; - if (!test_bit(i, &msg->sg.copy) && bytes_sg_total <= len) + if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist @@ -2809,7 +2809,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, /* Place newly allocated data buffer */ sk_mem_charge(msg->sk, len); msg->sg.size += len; - __clear_bit(new, &msg->sg.copy); + __clear_bit(new, msg->sg.copy); sg_set_page(&msg->sg.data[new], page, len + copy, 0); if (rsge.length) { get_page(sg_page(&rsge)); -- cgit v1.2.3 From 88590b369354092183bcba04e2368010c462557f Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:33 +0800 Subject: net: skb_drop_reason: add document for drop reasons Add document for following existing drop reasons: SKB_DROP_REASON_NOT_SPECIFIED SKB_DROP_REASON_NO_SOCKET SKB_DROP_REASON_PKT_TOO_SMALL SKB_DROP_REASON_TCP_CSUM SKB_DROP_REASON_SOCKET_FILTER SKB_DROP_REASON_UDP_CSUM Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a27bcc4f7e9a..f04e3a1f4455 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -314,12 +314,12 @@ struct sk_buff; * used to translate the reason to string. */ enum skb_drop_reason { - SKB_DROP_REASON_NOT_SPECIFIED, - SKB_DROP_REASON_NO_SOCKET, - SKB_DROP_REASON_PKT_TOO_SMALL, - SKB_DROP_REASON_TCP_CSUM, - SKB_DROP_REASON_SOCKET_FILTER, - SKB_DROP_REASON_UDP_CSUM, + SKB_DROP_REASON_NOT_SPECIFIED, /* drop reason is not specified */ + SKB_DROP_REASON_NO_SOCKET, /* socket not found */ + SKB_DROP_REASON_PKT_TOO_SMALL, /* packet size is too small */ + SKB_DROP_REASON_TCP_CSUM, /* TCP checksum error */ + SKB_DROP_REASON_SOCKET_FILTER, /* dropped by socket filter */ + SKB_DROP_REASON_UDP_CSUM, /* UDP checksum error */ SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 2df3041ba3be950376e8c25a8f6da22f7fcc765c Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:34 +0800 Subject: net: netfilter: use kfree_drop_reason() for NF_DROP Replace kfree_skb() with kfree_skb_reason() in nf_hook_slow() when skb is dropped by reason of NF_DROP. Following new drop reasons are introduced: SKB_DROP_REASON_NETFILTER_DROP Signed-off-by: Menglong Dong Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + net/netfilter/core.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f04e3a1f4455..9060159b4375 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -320,6 +320,7 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_CSUM, /* TCP checksum error */ SKB_DROP_REASON_SOCKET_FILTER, /* dropped by socket filter */ SKB_DROP_REASON_UDP_CSUM, /* UDP checksum error */ + SKB_DROP_REASON_NETFILTER_DROP, /* dropped by netfilter */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index a8a64b97504d..3d89f7b09a43 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -16,6 +16,7 @@ EM(SKB_DROP_REASON_TCP_CSUM, TCP_CSUM) \ EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER) \ EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM) \ + EM(SKB_DROP_REASON_NETFILTER_DROP, NETFILTER_DROP) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 354cb472f386..d1c9dfbb11fa 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -621,7 +621,8 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, case NF_ACCEPT: break; case NF_DROP: - kfree_skb(skb); + kfree_skb_reason(skb, + SKB_DROP_REASON_NETFILTER_DROP); ret = NF_DROP_GETERR(verdict); if (ret == 0) ret = -EPERM; -- cgit v1.2.3 From 33cba42985c8144eef78d618fc1e51aaa074b169 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:35 +0800 Subject: net: ipv4: use kfree_skb_reason() in ip_rcv_core() Replace kfree_skb() with kfree_skb_reason() in ip_rcv_core(). Three new drop reasons are introduced: SKB_DROP_REASON_OTHERHOST SKB_DROP_REASON_IP_CSUM SKB_DROP_REASON_IP_INHDR Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 9 +++++++++ include/trace/events/skb.h | 3 +++ net/ipv4/ip_input.c | 12 ++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9060159b4375..8e82130b3c52 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -321,6 +321,15 @@ enum skb_drop_reason { SKB_DROP_REASON_SOCKET_FILTER, /* dropped by socket filter */ SKB_DROP_REASON_UDP_CSUM, /* UDP checksum error */ SKB_DROP_REASON_NETFILTER_DROP, /* dropped by netfilter */ + SKB_DROP_REASON_OTHERHOST, /* packet don't belong to current + * host (interface is in promisc + * mode) + */ + SKB_DROP_REASON_IP_CSUM, /* IP checksum error */ + SKB_DROP_REASON_IP_INHDR, /* there is something wrong with + * IP header (see + * IPSTATS_MIB_INHDRERRORS) + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 3d89f7b09a43..f2b1778485f0 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -17,6 +17,9 @@ EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER) \ EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM) \ EM(SKB_DROP_REASON_NETFILTER_DROP, NETFILTER_DROP) \ + EM(SKB_DROP_REASON_OTHERHOST, OTHERHOST) \ + EM(SKB_DROP_REASON_IP_CSUM, IP_CSUM) \ + EM(SKB_DROP_REASON_IP_INHDR, IP_INHDR) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 3a025c011971..7be18de32e16 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -436,13 +436,16 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; + int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ - if (skb->pkt_type == PACKET_OTHERHOST) + if (skb->pkt_type == PACKET_OTHERHOST) { + drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; + } __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); @@ -452,6 +455,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) goto out; } + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; @@ -488,6 +492,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) len = ntohs(iph->tot_len); if (skb->len < len) { + drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) @@ -516,11 +521,14 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) return skb; csum_error: + drop_reason = SKB_DROP_REASON_IP_CSUM; __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: + if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) + drop_reason = SKB_DROP_REASON_IP_INHDR; __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); out: return NULL; } -- cgit v1.2.3 From c1f166d1f7eef212096a98b22f5acf92f9af353d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:36 +0800 Subject: net: ipv4: use kfree_skb_reason() in ip_rcv_finish_core() Replace kfree_skb() with kfree_skb_reason() in ip_rcv_finish_core(), following drop reasons are introduced: SKB_DROP_REASON_IP_RPFILTER SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 9 +++++++++ include/trace/events/skb.h | 3 +++ net/ipv4/ip_input.c | 14 ++++++++++---- 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8e82130b3c52..4baba45f223d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -330,6 +330,15 @@ enum skb_drop_reason { * IP header (see * IPSTATS_MIB_INHDRERRORS) */ + SKB_DROP_REASON_IP_RPFILTER, /* IP rpfilter validate failed. + * see the document for rp_filter + * in ip-sysctl.rst for more + * information + */ + SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, /* destination address of L2 + * is multicast, but L3 is + * unicast. + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index f2b1778485f0..485a1d3034a4 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -20,6 +20,9 @@ EM(SKB_DROP_REASON_OTHERHOST, OTHERHOST) \ EM(SKB_DROP_REASON_IP_CSUM, IP_CSUM) \ EM(SKB_DROP_REASON_IP_INHDR, IP_INHDR) \ + EM(SKB_DROP_REASON_IP_RPFILTER, IP_RPFILTER) \ + EM(SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, \ + UNICAST_IN_L2_MULTICAST) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7be18de32e16..d5222c0fa87c 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -318,8 +318,10 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); + int err, drop_reason; struct rtable *rt; - int err; + + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (ip_can_use_hint(skb, iph, hint)) { err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos, @@ -396,19 +398,23 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, * so-called "hole-196" attack) so do it for both. */ if (in_dev && - IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) + IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { + drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; goto drop; + } } return NET_RX_SUCCESS; drop: - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; drop_error: - if (err == -EXDEV) + if (err == -EXDEV) { + drop_reason = SKB_DROP_REASON_IP_RPFILTER; __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + } goto drop; } -- cgit v1.2.3 From 10580c4791902571777603cb980414892dd5f149 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:37 +0800 Subject: net: ipv4: use kfree_skb_reason() in ip_protocol_deliver_rcu() Replace kfree_skb() with kfree_skb_reason() in ip_protocol_deliver_rcu(). Following new drop reasons are introduced: SKB_DROP_REASON_XFRM_POLICY SKB_DROP_REASON_IP_NOPROTO Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ include/trace/events/skb.h | 2 ++ net/ipv4/ip_input.c | 5 +++-- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4baba45f223d..2a64afa97910 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -339,6 +339,8 @@ enum skb_drop_reason { * is multicast, but L3 is * unicast. 
*/ + SKB_DROP_REASON_XFRM_POLICY, /* xfrm policy check failed */ + SKB_DROP_REASON_IP_NOPROTO, /* no support for IP protocol */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 485a1d3034a4..985e481c092d 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -23,6 +23,8 @@ EM(SKB_DROP_REASON_IP_RPFILTER, IP_RPFILTER) \ EM(SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, \ UNICAST_IN_L2_MULTICAST) \ + EM(SKB_DROP_REASON_XFRM_POLICY, XFRM_POLICY) \ + EM(SKB_DROP_REASON_IP_NOPROTO, IP_NOPROTO) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d5222c0fa87c..d94f9f7e60c3 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -196,7 +196,8 @@ resubmit: if (ipprot) { if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - kfree_skb(skb); + kfree_skb_reason(skb, + SKB_DROP_REASON_XFRM_POLICY); return; } nf_reset_ct(skb); @@ -215,7 +216,7 @@ resubmit: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO); } else { __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); -- cgit v1.2.3 From 08d4c0370c400fa6ef2194f9ee2e8dccc4a7ab39 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:39 +0800 Subject: net: udp: use kfree_skb_reason() in __udp_queue_rcv_skb() Replace kfree_skb() with kfree_skb_reason() in __udp_queue_rcv_skb(). Following new drop reasons are introduced: SKB_DROP_REASON_SOCKET_RCVBUFF SKB_DROP_REASON_PROTO_MEM Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ include/trace/events/skb.h | 2 ++ net/ipv4/udp.c | 10 +++++++--- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2a64afa97910..a5adbf6b51e8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -341,6 +341,11 @@ enum skb_drop_reason { */ SKB_DROP_REASON_XFRM_POLICY, /* xfrm policy check failed */ SKB_DROP_REASON_IP_NOPROTO, /* no support for IP protocol */ + SKB_DROP_REASON_SOCKET_RCVBUFF, /* socket receive buff is full */ + SKB_DROP_REASON_PROTO_MEM, /* proto memory limition, such as + * udp packet drop out of + * udp_memory_allocated. 
+ */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 985e481c092d..cfcfd26399f7 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -25,6 +25,8 @@ UNICAST_IN_L2_MULTICAST) \ EM(SKB_DROP_REASON_XFRM_POLICY, XFRM_POLICY) \ EM(SKB_DROP_REASON_IP_NOPROTO, IP_NOPROTO) \ + EM(SKB_DROP_REASON_SOCKET_RCVBUFF, SOCKET_RCVBUFF) \ + EM(SKB_DROP_REASON_PROTO_MEM, PROTO_MEM) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 952f5bf108a5..6b4d8361560f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2093,16 +2093,20 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); + int drop_reason; /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) + if (rc == -ENOMEM) { UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); - else + drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; + } else { UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS, is_udplite); + drop_reason = SKB_DROP_REASON_PROTO_MEM; + } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } -- cgit v1.2.3 From 3486bedd99196ecdfe99c0ab5b67ad3c47e8a8fa Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:35 -0800 Subject: bpf: Use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem This enables sub-page memory charge and allocation. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-3-song@kernel.org --- include/linux/bpf.h | 4 ++-- kernel/bpf/core.c | 17 ++++++++--------- kernel/bpf/trampoline.c | 6 +++--- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6eb0b180d33b..366f88afd56b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -846,8 +846,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); -int bpf_jit_charge_modmem(u32 pages); -void bpf_jit_uncharge_modmem(u32 pages); +int bpf_jit_charge_modmem(u32 size); +void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 04a8d5bea552..6ca0550c4b24 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -833,12 +833,11 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(pages, &bpf_jit_current) > - (bpf_jit_limit >> PAGE_SHIFT)) { + if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { if (!bpf_capable()) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } @@ -846,9 +845,9 @@ int bpf_jit_charge_modmem(u32 pages) return 0; } -void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 size) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) @@ -879,11 +878,11 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, size = round_up(proglen + 
sizeof(*hdr) + 128, PAGE_SIZE); pages = size / PAGE_SIZE; - if (bpf_jit_charge_modmem(pages)) + if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); return NULL; } @@ -906,7 +905,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) u32 pages = hdr->pages; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(pages << PAGE_SHIFT); } /* This symbol is only overridden by archs that have different diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 4b6974a195c1..e76a488c09c3 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -213,7 +213,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work) im = container_of(work, struct bpf_tramp_image, work); bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -310,7 +310,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) if (!im) goto out; - err = bpf_jit_charge_modmem(1); + err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) goto out_free_im; @@ -332,7 +332,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) out_free_image: bpf_jit_free_exec(im->image); out_uncharge: - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: kfree(im); out: -- cgit v1.2.3 From ed2d9e1a26cca963ff5ed3b76326d70f7d8201a9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:36 -0800 Subject: bpf: Use size instead of pages in bpf_binary_header This is necessary to charge sub page memory for the BPF program. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-4-song@kernel.org --- include/linux/filter.h | 6 +++--- kernel/bpf/core.c | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index d23e999dc032..5855eb474c62 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -548,7 +548,7 @@ struct sock_fprog_kern { #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { - u32 pages; + u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; @@ -886,8 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); - set_memory_ro((unsigned long)hdr, hdr->pages); - set_memory_x((unsigned long)hdr, hdr->pages); + set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT); + set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } static inline struct bpf_binary_header * diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6ca0550c4b24..14199228a6f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -543,7 +543,7 @@ bpf_prog_ksym_set_addr(struct bpf_prog *prog) WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; - prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.end = addr + hdr->size; } static void @@ -866,7 +866,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - u32 size, hole, start, pages; + u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); @@ -876,7 +876,6 @@ 
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); - pages = size / PAGE_SIZE; if (bpf_jit_charge_modmem(size)) return NULL; @@ -889,7 +888,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = pages; + hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -902,10 +901,10 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - u32 pages = hdr->pages; + u32 size = hdr->size; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages << PAGE_SHIFT); + bpf_jit_uncharge_modmem(size); } /* This symbol is only overridden by archs that have different -- cgit v1.2.3 From ebc1415d9b4f043cef5a1fb002ec316e32167e7a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:39 -0800 Subject: bpf: Introduce bpf_arch_text_copy This will be used to copy JITed text to RO protected module memory. On x86, bpf_arch_text_copy is implemented with text_poke_copy. bpf_arch_text_copy returns pointer to dst on success, and ERR_PTR(errno) on errors. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-7-song@kernel.org --- arch/x86/net/bpf_jit_comp.c | 7 +++++++ include/linux/bpf.h | 2 ++ kernel/bpf/core.c | 5 +++++ 3 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 36f6fc3e6e69..c13d148f7396 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2412,3 +2412,10 @@ bool bpf_jit_supports_kfunc_call(void) { return true; } + +void *bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + if (text_poke_copy(dst, src, len) == NULL) + return ERR_PTR(-EINVAL); + return dst; +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 366f88afd56b..ea0d7fd4a410 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2362,6 +2362,8 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void *bpf_arch_text_copy(void *dst, void *src, size_t len); + struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e3fe53df0a71..a5ec480f9862 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2440,6 +2440,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return -ENOTSUPP; } +void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + return ERR_PTR(-ENOTSUPP); +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); -- cgit v1.2.3 From 33c9805860e584b194199cab1a1e81f4e6395408 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:41 -0800 Subject: bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free] This is the jit binary allocator built on top of bpf_prog_pack. bpf_prog_pack allocates RO memory, which cannot be used directly by the JIT engine. Therefore, a temporary rw buffer is allocated for the JIT engine. Once JIT is done, bpf_jit_binary_pack_finalize is used to copy the program to the RO memory. 
bpf_jit_binary_pack_alloc reserves 16 bytes of extra space for illegal instructions, which is smaller than the 128 bytes reserved by bpf_jit_binary_alloc. This change is necessary for bpf_jit_binary_hdr to find the correct header. Also, flag use_bpf_prog_pack is added to differentiate a program allocated by bpf_jit_binary_pack_alloc. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-9-song@kernel.org --- include/linux/bpf.h | 1 + include/linux/filter.h | 21 +++++----- kernel/bpf/core.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 120 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea0d7fd4a410..2fc7e5c5ef41 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -953,6 +953,7 @@ struct bpf_prog_aux { bool sleepable; bool tail_call_reachable; bool xdp_has_frags; + bool use_bpf_prog_pack; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5855eb474c62..1cb1af917617 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -890,15 +890,6 @@ static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } -static inline struct bpf_binary_header * -bpf_jit_binary_hdr(const struct bpf_prog *fp) -{ - unsigned long real_start = (unsigned long)fp->bpf_func; - unsigned long addr = real_start & PAGE_MASK; - - return (void *)addr; -} - int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { @@ -1068,6 +1059,18 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, + unsigned int alignment, + struct bpf_binary_header **rw_hdr, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header); +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header); + int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7ae590897b73..306aa63fa58e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1031,6 +1031,109 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr) bpf_jit_uncharge_modmem(size); } +/* Allocate jit binary from bpf_prog_pack allocator. + * Since the allocated memory is RO+X, the JIT engine cannot write directly + * to the memory. To solve this problem, a RW buffer is also allocated at + * the same time. The JIT engine should calculate offsets based on the + * RO memory address, but write JITed program to the RW buffer. Once the + * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies + * the JITed program to the RO memory.
+ */ +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + struct bpf_binary_header **rw_header, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *ro_header; + u32 size, hole, start; + + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + + /* add 16 bytes for a random section of illegal instructions */ + size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); + + if (bpf_jit_charge_modmem(size)) + return NULL; + ro_header = bpf_prog_pack_alloc(size); + if (!ro_header) { + bpf_jit_uncharge_modmem(size); + return NULL; + } + + *rw_header = kvmalloc(size, GFP_KERNEL); + if (!*rw_header) { + bpf_prog_pack_free(ro_header); + bpf_jit_uncharge_modmem(size); + return NULL; + } + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(*rw_header, size); + (*rw_header)->size = size; + + hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), + BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); + start = (get_random_int() % hole) & ~(alignment - 1); + + *image_ptr = &ro_header->image[start]; + *rw_image = &(*rw_header)->image[start]; + + return ro_header; +} + +/* Copy JITed text from rw_header to its final location, the ro_header. */ +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + void *ptr; + + ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); + + kvfree(rw_header); + + if (IS_ERR(ptr)) { + bpf_prog_pack_free(ro_header); + return PTR_ERR(ptr); + } + prog->aux->use_bpf_prog_pack = true; + return 0; +} + +/* bpf_jit_binary_pack_free is called in two different scenarios: + * 1) when the program is freed after; + * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). + * For case 2), we need to free both the RO memory and the RW buffer. + * Also, ro_header->size in 2) is not properly set yet, so rw_header->size + * is used for uncharge. + */ +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + u32 size = rw_header ? rw_header->size : ro_header->size; + + bpf_prog_pack_free(ro_header); + kvfree(rw_header); + bpf_jit_uncharge_modmem(size); +} + +static inline struct bpf_binary_header * +bpf_jit_binary_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + if (fp->aux->use_bpf_prog_pack) + addr = real_start & BPF_PROG_CHUNK_MASK; + else + addr = real_start & PAGE_MASK; + + return (void *)addr; +} + /* This symbol is only overridden by archs that have different * requirements than the usual eBPF JITs, f.e. when they only * implement cBPF JIT, do not set images read-only, etc. @@ -1040,7 +1143,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_free(hdr); + if (fp->aux->use_bpf_prog_pack) + bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); + else + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } -- cgit v1.2.3 From b2309a71c1f2fc841feb184195b2e46b2e139bf4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 10:41:07 -0800 Subject: net: add dev->dev_registered_tracker Convert one dev_hold()/dev_put() pair in register_netdevice() and unregister_netdevice_many() to dev_hold_track() and dev_put_track(). This would allow to detect a rogue dev_put() a bit earlier. 
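For reference, the tracked pattern pairs each hold with a netdevice_tracker so an unmatched or double put can be flagged; a minimal sketch of the idiom follows (the driver structure and functions are hypothetical; only dev_hold_track()/dev_put_track() are real):

	struct my_port {			/* hypothetical driver state */
		struct net_device *dev;
		netdevice_tracker dev_tracker;
	};

	static void my_port_attach(struct my_port *p, struct net_device *dev)
	{
		p->dev = dev;
		dev_hold_track(dev, &p->dev_tracker, GFP_KERNEL);
	}

	static void my_port_detach(struct my_port *p)
	{
		/* must pass the same tracker used for dev_hold_track() */
		dev_put_track(p->dev, &p->dev_tracker);
		p->dev = NULL;
	}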
Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20220207184107.1401096-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 +++ net/core/dev.c | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3fb6fb67ed77..5f6e2c0b0c90 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1948,6 +1948,8 @@ enum netdev_ml_priv_type { * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * @watchdog_dev_tracker: refcount tracker used by watchdog. + * @dev_registered_tracker: tracker for reference held while + * registered * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2282,6 +2284,7 @@ struct net_device { u8 dev_addr_shadow[MAX_ADDR_LEN]; netdevice_tracker linkwatch_dev_tracker; netdevice_tracker watchdog_dev_tracker; + netdevice_tracker dev_registered_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/net/core/dev.c b/net/core/dev.c index f662c6a7d7b4..66556a21800a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9683,8 +9683,10 @@ int register_netdevice(struct net_device *dev) linkwatch_init_dev(dev); dev_init_scheduler(dev); - dev_hold(dev); + + dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL); list_netdevice(dev); + add_device_randomness(dev->dev_addr, dev->addr_len); /* If the device has permanent device address, driver should @@ -10449,7 +10451,7 @@ void unregister_netdevice_many(struct list_head *head) synchronize_net(); list_for_each_entry(dev, head, unreg_list) { - dev_put(dev); + dev_put_track(dev, &dev->dev_registered_tracker); net_set_todo(dev); } -- cgit v1.2.3 From 48b6190a00425a1bebac9f7ae4b338a1e20f50f3 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 10 Feb 2022 17:11:36 +0800 Subject: net/smc: Limit SMC visits when handshake workqueue congested This patch provides a mechanism to limit incoming SMC connection attempts according to the pressure on the SMC handshake process. At present, a flood of connection attempts causes incoming connections to be backlogged in the SMC handshake queue, raising the connection establishment time, which is unacceptable for applications that rely on short-lived connections. There are two ways to implement this mechanism: 1. Apply the limit after the TCP connection is established. 2. Apply the limit before the TCP connection is established. With the first approach, we would need to wait for and receive the CLC messages that the client may send, and then actively reply with a decline message; in a sense this is itself a sort of SMC handshake and thus affects the connection establishment time along the way. With the second approach, the only problem is that we need to inject SMC logic into TCP when it is about to reply to the incoming SYN; since we already do that, this is no longer a problem. The advantage is obvious: few additional steps are required to enforce the constraint. This patch uses the second approach. After this patch, connections that exceed the constraint are not given any SMC indication, and SMC is not involved in any of their subsequent processing. Link: https://lore.kernel.org/all/1641301961-59331-1-git-send-email-alibuda@linux.alibaba.com/ Signed-off-by: D. Wythe Signed-off-by: David S.
Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 3 ++- net/smc/af_smc.c | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 78b91bb92f0d..1168302b7927 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -394,6 +394,7 @@ struct tcp_sock { bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) + bool (*smc_hs_congested)(const struct sock *sk); bool syn_smc; /* SYN includes SMC */ #endif diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index af94a6d22a9d..92e65d56dc2c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6703,7 +6703,8 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); #if IS_ENABLED(CONFIG_SMC) - ireq->smc_ok = rx_opt->smc_ok; + ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested && + tcp_sk(sk)->smc_hs_congested(sk)); #endif } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 858724256078..a05ffb268c3e 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -103,6 +103,21 @@ drop: return NULL; } +static bool smc_hs_congested(const struct sock *sk) +{ + const struct smc_sock *smc; + + smc = smc_clcsock_user_data(sk); + + if (!smc) + return true; + + if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) + return true; + + return false; +} + static struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; @@ -2311,6 +2326,8 @@ static int smc_listen(struct socket *sock, int backlog) inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; + tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + rc = kernel_listen(smc->clcsock, backlog); if (rc) { smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready; -- cgit v1.2.3 From a6a6fe27bab48f0d09a64b051e7bde432fcae081 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 10 Feb 2022 17:11:37 +0800 Subject: net/smc: Dynamic control handshake limitation by socket options This patch adds dynamic, per-socket control of the SMC handshake limit; in production environments, the same application may handle different service types and may have different requirements regarding the SMC handshake limit. This patch uses socket options to accomplish this; since we don't have a socket option level for SMC yet, we implement one at the same time. This patch does the following: - add new socket option level: SOL_SMC. - add new SMC socket option: SMC_LIMIT_HS. - provide getter/setter for SMC socket options. Link: https://lore.kernel.org/all/20f504f961e1a803f85d64229ad84260434203bd.1644323503.git.alibuda@linux.alibaba.com/ Signed-off-by: D. Wythe Signed-off-by: David S.
Miller --- include/linux/socket.h | 1 + include/uapi/linux/smc.h | 4 +++ net/smc/af_smc.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc.h | 1 + 4 files changed, 74 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 8ef26d89ef49..6f85f5d957ef 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -366,6 +366,7 @@ struct ucred { #define SOL_XDP 283 #define SOL_MPTCP 284 #define SOL_MCTP 285 +#define SOL_SMC 286 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 6c2874fd2c00..343e7450c3a3 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -284,4 +284,8 @@ enum { __SMC_NLA_SEID_TABLE_MAX, SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1 }; + +/* SMC socket options */ +#define SMC_LIMIT_HS 1 /* constraint on smc handshake */ + #endif /* _UAPI_LINUX_SMC_H */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a05ffb268c3e..97dcdc0a2107 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2326,7 +2326,8 @@ static int smc_listen(struct socket *sock, int backlog) inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; - tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; + if (smc->limit_smc_hs) + tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; rc = kernel_listen(smc->clcsock, backlog); if (rc) { @@ -2621,6 +2622,67 @@ out: return rc ? rc : rc1; } +static int __smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct smc_sock *smc; + int val, len; + + smc = smc_sk(sock->sk); + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case SMC_LIMIT_HS: + val = smc->limit_smc_hs; + break; + default: + return -EOPNOTSUPP; + } + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +static int __smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct smc_sock *smc; + int val, rc; + + smc = smc_sk(sk); + + lock_sock(sk); + switch (optname) { + case SMC_LIMIT_HS: + if (optlen < sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + + smc->limit_smc_hs = !!val; + rc = 0; + break; + default: + rc = -EOPNOTSUPP; + break; + } + release_sock(sk); + + return rc; +} + static int smc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -2630,6 +2692,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (level == SOL_TCP && optname == TCP_ULP) return -EOPNOTSUPP; + else if (level == SOL_SMC) + return __smc_setsockopt(sock, level, optname, optval, optlen); smc = smc_sk(sk); @@ -2712,6 +2776,9 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, struct smc_sock *smc; int rc; + if (level == SOL_SMC) + return __smc_getsockopt(sock, level, optname, optval, optlen); + smc = smc_sk(sock->sk); mutex_lock(&smc->clcsock_release_lock); if (!smc->clcsock) { diff --git a/net/smc/smc.h b/net/smc/smc.h index e91e40040d07..7e2693832a1b 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -249,6 +249,7 @@ struct smc_sock { /* smc sock container */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ 
spinlock_t accept_q_lock; /* protects accept_q */ + bool limit_smc_hs; /* put constraint on handshake */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ -- cgit v1.2.3 From 2618a0dae09ef37728dab89ff60418cbe25ae6bd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 12 Feb 2022 09:14:49 -0800 Subject: etherdevice: Adjust ether_addr* prototypes to silence -Wstringop-overead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With GCC 12, -Wstringop-overread was warning about an implicit cast from char[6] to char[8]. However, the extra 2 bytes are always thrown away, alignment doesn't matter, and the risk of hitting the edge of unallocated memory has been accepted, so this prototype can just be converted to a regular char *. Silences: net/core/dev.c: In function ‘bpf_prog_run_generic_xdp’: net/core/dev.c:4618:21: warning: ‘ether_addr_equal_64bits’ reading 8 bytes from a region of size 6 [-Wstringop-overread] 4618 | orig_host = ether_addr_equal_64bits(eth->h_dest, > skb->dev->dev_addr); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/dev.c:4618:21: note: referencing argument 1 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} net/core/dev.c:4618:21: note: referencing argument 2 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} In file included from net/core/dev.c:91: include/linux/etherdevice.h:375:20: note: in a call to function ‘ether_addr_equal_64bits’ 375 | static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], | ^~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Marc Kleine-Budde Tested-by: Marc Kleine-Budde Link: https://lore.kernel.org/netdev/20220212090811.uuzk6d76agw2vv73@pengutronix.de Cc: Jakub Kicinski Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2ad71cc90b37..92b10e67d5f8 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -134,7 +134,7 @@ static inline bool is_multicast_ether_addr(const u8 *addr) #endif } -static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN @@ -372,8 +372,7 @@ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ -static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], - const u8 addr2[6+2]) +static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); -- cgit v1.2.3 From baebdf48c360080710f80699eea3affbb13d6c65 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 12 Feb 2022 00:38:38 +0100 Subject: net: dev: Makes sure netif_rx() can be invoked in any context. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dave suggested a while ago (eleven years by now) "Let's make netif_rx() work in all contexts and get rid of netif_rx_ni()". Eric agreed and pointed out that modern devices should use netif_receive_skb() to avoid the overhead. 
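(For orientation, the end state of this rework is sketched below: the reworked netif_rx() may be called from any context, while __netif_rx() remains a shortcut for callers that already run with bottom halves disabled. The driver callbacks are hypothetical; only netif_rx() and __netif_rx() are from this patch.)

	/* Hypothetical RX interrupt handler: hard-IRQ context, so the
	 * BH-disable/enable pair can be skipped via __netif_rx().
	 */
	static irqreturn_t my_rx_irq(int irq, void *data)
	{
		struct sk_buff *skb = my_build_skb(data);	/* hypothetical */

		__netif_rx(skb);
		return IRQ_HANDLED;
	}

	/* Hypothetical injection path in process context: plain netif_rx()
	 * now disables bottom halves itself, so it is safe here too.
	 */
	static void my_inject_skb(struct sk_buff *skb)
	{
		netif_rx(skb);
	}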
In the meantime someone added another variant, netif_rx_any_context(), which behaves as suggested. netif_rx() must be invoked with disabled bottom halves to ensure that pending softirqs, which were raised within the function, are handled. netif_rx_ni() can be invoked only from process context (bottom halves must be enabled) because the function handles pending softirqs without checking if bottom halves were disabled or not. netif_rx_any_context() invokes one of the former functions depending on the result of in_interrupt(). netif_rx() could be taught to handle both cases (disabled and enabled bottom halves) by simply disabling bottom halves while invoking netif_rx_internal(). The local_bh_enable() invocation will then invoke pending softirqs only if the BH-disable counter drops to zero. Eric is concerned about the overhead of BH-disable+enable especially in regard to the loopback driver. As critical as this driver is, it receives a shortcut to avoid the additional overhead, which is not needed there. Add a local_bh_disable() section in netif_rx() to ensure softirqs are handled if needed. Provide __netif_rx() which does not disable BH and has a lockdep assert to ensure that interrupts are disabled. Use this shortcut in the loopback driver and in drivers/net/*.c. Make netif_rx_ni() and netif_rx_any_context() invoke netif_rx() so they can be removed once there are no users left. Link: https://lkml.kernel.org/r/20100415.020246.218622820.davem@davemloft.net Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Eric Dumazet Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- drivers/net/amt.c | 4 +-- drivers/net/geneve.c | 4 +-- drivers/net/gtp.c | 2 +- drivers/net/loopback.c | 4 +-- drivers/net/macsec.c | 6 ++--- drivers/net/macvlan.c | 4 +-- drivers/net/mhi_net.c | 2 +- drivers/net/ntb_netdev.c | 2 +- drivers/net/rionet.c | 2 +- drivers/net/sb1000.c | 2 +- drivers/net/veth.c | 2 +- drivers/net/vrf.c | 2 +- drivers/net/vxlan.c | 2 +- include/linux/netdevice.h | 14 ++++++++-- include/trace/events/net.h | 14 ---------- net/core/dev.c | 67 ++++++++++++++++++++-------------------- 16 files changed, 60 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/amt.c b/drivers/net/amt.c index f1a36d7e2151..10455c9b9da0 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -2373,7 +2373,7 @@ static bool amt_membership_query_handler(struct amt_dev *amt, skb->pkt_type = PACKET_MULTICAST; skb->ip_summed = CHECKSUM_NONE; len = skb->len; - if (netif_rx(skb) == NET_RX_SUCCESS) { + if (__netif_rx(skb) == NET_RX_SUCCESS) { amt_update_gw_status(amt, AMT_STATUS_RECEIVED_QUERY, true); dev_sw_netstats_rx_add(amt->dev, len); } else { @@ -2470,7 +2470,7 @@ report: skb->pkt_type = PACKET_MULTICAST; skb->ip_summed = CHECKSUM_NONE; len = skb->len; - if (netif_rx(skb) == NET_RX_SUCCESS) { + if (__netif_rx(skb) == NET_RX_SUCCESS) { amt_update_relay_status(tunnel, AMT_STATUS_RECEIVED_UPDATE, true); dev_sw_netstats_rx_add(amt->dev, len); diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index c1fdd721a730..a895ff756093 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -925,7 +925,7 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, } skb->protocol = eth_type_trans(skb, geneve->dev); - netif_rx(skb); + __netif_rx(skb); dst_release(&rt->dst); return -EMSGSIZE; } @@ -1021,7 +1021,7 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, } skb->protocol = eth_type_trans(skb, geneve->dev); - netif_rx(skb); + __netif_rx(skb);
dst_release(dst); return -EMSGSIZE; } diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 24e5c54d06c1..bf087171bcf0 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -207,7 +207,7 @@ static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb, dev_sw_netstats_rx_add(pctx->dev, skb->len); - netif_rx(skb); + __netif_rx(skb); return 0; err: diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index ed0edf5884ef..d05f86fe78c9 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -78,7 +78,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb, skb_orphan(skb); - /* Before queueing this packet to netif_rx(), + /* Before queueing this packet to __netif_rx(), * make sure dst is refcounted. */ skb_dst_force(skb); @@ -86,7 +86,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb, skb->protocol = eth_type_trans(skb, dev); len = skb->len; - if (likely(netif_rx(skb) == NET_RX_SUCCESS)) + if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) dev_lstats_add(dev, len); return NETDEV_TX_OK; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 3d0874331763..832f09ac075e 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1033,7 +1033,7 @@ static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) else nskb->pkt_type = PACKET_MULTICAST; - netif_rx(nskb); + __netif_rx(nskb); } continue; } @@ -1056,7 +1056,7 @@ static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) nskb->dev = ndev; - if (netif_rx(nskb) == NET_RX_SUCCESS) { + if (__netif_rx(nskb) == NET_RX_SUCCESS) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsUntagged++; u64_stats_update_end(&secy_stats->syncp); @@ -1288,7 +1288,7 @@ nosci: macsec_reset_skb(nskb, macsec->secy.netdev); - ret = netif_rx(nskb); + ret = __netif_rx(nskb); if (ret == NET_RX_SUCCESS) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsUnknownSCI++; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 6ef5f77be4d0..d87c06c317ed 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -410,7 +410,7 @@ static void macvlan_forward_source_one(struct sk_buff *skb, if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, dev->dev_addr)) nskb->pkt_type = PACKET_HOST; - ret = netif_rx(nskb); + ret = __netif_rx(nskb); macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } @@ -468,7 +468,7 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) /* forward to original port. 
*/ vlan = src; ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?: - netif_rx(skb); + __netif_rx(skb); handle_res = RX_HANDLER_CONSUMED; goto out; } diff --git a/drivers/net/mhi_net.c b/drivers/net/mhi_net.c index aaa628f859fd..0b1b6f650104 100644 --- a/drivers/net/mhi_net.c +++ b/drivers/net/mhi_net.c @@ -225,7 +225,7 @@ static void mhi_net_dl_callback(struct mhi_device *mhi_dev, u64_stats_inc(&mhi_netdev->stats.rx_packets); u64_stats_add(&mhi_netdev->stats.rx_bytes, skb->len); u64_stats_update_end(&mhi_netdev->stats.rx_syncp); - netif_rx(skb); + __netif_rx(skb); } /* Refill if RX buffers queue becomes low */ diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c index 98ca6b18415e..80bdc07f2cd3 100644 --- a/drivers/net/ntb_netdev.c +++ b/drivers/net/ntb_netdev.c @@ -119,7 +119,7 @@ static void ntb_netdev_rx_handler(struct ntb_transport_qp *qp, void *qp_data, skb->protocol = eth_type_trans(skb, ndev); skb->ip_summed = CHECKSUM_NONE; - if (netif_rx(skb) == NET_RX_DROP) { + if (__netif_rx(skb) == NET_RX_DROP) { ndev->stats.rx_errors++; ndev->stats.rx_dropped++; } else { diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 1a95f3beb784..39e61e07e489 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -109,7 +109,7 @@ static int rionet_rx_clean(struct net_device *ndev) skb_put(rnet->rx_skb[i], RIO_MAX_MSG_SIZE); rnet->rx_skb[i]->protocol = eth_type_trans(rnet->rx_skb[i], ndev); - error = netif_rx(rnet->rx_skb[i]); + error = __netif_rx(rnet->rx_skb[i]); if (error == NET_RX_DROP) { ndev->stats.rx_dropped++; diff --git a/drivers/net/sb1000.c b/drivers/net/sb1000.c index 57a6d598467b..c3f8020571ad 100644 --- a/drivers/net/sb1000.c +++ b/drivers/net/sb1000.c @@ -872,7 +872,7 @@ printk("cm0: IP identification: %02x%02x fragment offset: %02x%02x\n", buffer[3 /* datagram completed: send to upper level */ skb_trim(skb, dlen); - netif_rx(skb); + __netif_rx(skb); stats->rx_bytes+=dlen; stats->rx_packets++; lp->rx_skb[ns] = NULL; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index d29fb9759cc9..58b20ea171dd 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -287,7 +287,7 @@ static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, { return __dev_forward_skb(dev, skb) ?: xdp ? 
veth_xdp_rx(rq, skb) : - netif_rx(skb); + __netif_rx(skb); } /* return true if the specified skb has chances of GRO aggregation diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index e0b1ab99a359..714cafcf6c6c 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -418,7 +418,7 @@ static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev, skb->protocol = eth_type_trans(skb, dev); - if (likely(netif_rx(skb) == NET_RX_SUCCESS)) + if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) vrf_rx_stats(dev, len); else this_cpu_inc(dev->dstats->rx_drps); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 359d16780dbb..d0dc90d3dac2 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2541,7 +2541,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, tx_stats->tx_bytes += len; u64_stats_update_end(&tx_stats->syncp); - if (netif_rx(skb) == NET_RX_SUCCESS) { + if (__netif_rx(skb) == NET_RX_SUCCESS) { u64_stats_update_begin(&rx_stats->syncp); rx_stats->rx_packets++; rx_stats->rx_bytes += len; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5f6e2c0b0c90..e99db8341da5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3672,8 +3672,18 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); -int netif_rx_ni(struct sk_buff *skb); -int netif_rx_any_context(struct sk_buff *skb); +int __netif_rx(struct sk_buff *skb); + +static inline int netif_rx_ni(struct sk_buff *skb) +{ + return netif_rx(skb); +} + +static inline int netif_rx_any_context(struct sk_buff *skb) +{ + return netif_rx(skb); +} + int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list_internal(struct list_head *head); diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 78c448c6ab4c..032b431b987b 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -260,13 +260,6 @@ DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_ARGS(skb) ); -DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_ni_entry, - - TP_PROTO(const struct sk_buff *skb), - - TP_ARGS(skb) -); - DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), @@ -312,13 +305,6 @@ DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_ARGS(ret) ); -DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_ni_exit, - - TP_PROTO(int ret), - - TP_ARGS(ret) -); - DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), diff --git a/net/core/dev.c b/net/core/dev.c index 441bacf79199..ce5047e8db1f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4815,66 +4815,57 @@ static int netif_rx_internal(struct sk_buff *skb) return ret; } +/** + * __netif_rx - Slightly optimized version of netif_rx + * @skb: buffer to post + * + * This behaves as netif_rx except that it does not disable bottom halves. + * As a result this function may only be invoked from the interrupt context + * (either hard or soft interrupt). 
+ */ +int __netif_rx(struct sk_buff *skb) +{ + int ret; + + lockdep_assert_once(hardirq_count() | softirq_count()); + + trace_netif_rx_entry(skb); + ret = netif_rx_internal(skb); + trace_netif_rx_exit(ret); + return ret; +} +EXPORT_SYMBOL(__netif_rx); + /** * netif_rx - post buffer to the network code * @skb: buffer to post * * This function receives a packet from a device driver and queues it for - * the upper (protocol) levels to process. It always succeeds. The buffer - * may be dropped during processing for congestion control or by the - * protocol layers. + * the upper (protocol) levels to process via the backlog NAPI device. It + * always succeeds. The buffer may be dropped during processing for + * congestion control or by the protocol layers. + * The network buffer is passed via the backlog NAPI device. Modern NIC + * drivers should use NAPI and GRO. + * This function can be used from any context. * * return values: * NET_RX_SUCCESS (no congestion) * NET_RX_DROP (packet was dropped) * */ - int netif_rx(struct sk_buff *skb) { int ret; + local_bh_disable(); trace_netif_rx_entry(skb); - ret = netif_rx_internal(skb); trace_netif_rx_exit(ret); - + local_bh_enable(); return ret; } EXPORT_SYMBOL(netif_rx); -int netif_rx_ni(struct sk_buff *skb) -{ - int err; - - trace_netif_rx_ni_entry(skb); - - preempt_disable(); - err = netif_rx_internal(skb); - if (local_softirq_pending()) - do_softirq(); - preempt_enable(); - trace_netif_rx_ni_exit(err); - - return err; -} -EXPORT_SYMBOL(netif_rx_ni); - -int netif_rx_any_context(struct sk_buff *skb) -{ - /* - * If invoked from contexts which do not invoke bottom half - * processing either at return from interrupt or when softrqs are - * reenabled, use netif_rx_ni() which invokes bottomhalf processing - * directly. - */ - if (in_interrupt()) - return netif_rx(skb); - else - return netif_rx_ni(skb); -} -EXPORT_SYMBOL(netif_rx_any_context); - static __latent_entropy void net_tx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); -- cgit v1.2.3 From 76f05d88623e559eb9fc41db9fb911e67fab0e7a Mon Sep 17 00:00:00 2001 From: M Chetan Kumar Date: Mon, 14 Feb 2022 12:46:52 +0530 Subject: net: wwan: debugfs obtained dev reference not dropped A WWAN driver calls wwan_get_debugfs_dir() to obtain the WWAN debugfs dir entry. As part of this procedure it returns a reference to a found device. Since there is no debugfs interface available in the WWAN subsystem, it is not possible to drop the dev reference after debugfs use. This leads to side effects: for example, after a wwan driver is unloaded and reloaded, the wwan instance gets incremented from wwanX to wwanX+1. A new debugfs interface is added to the wwan subsystem so that a wwan driver can drop the obtained dev reference after debugfs use. void wwan_put_debugfs_dir(struct dentry *dir) Signed-off-by: M Chetan Kumar Signed-off-by: David S.
Miller --- drivers/net/wwan/wwan_core.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/wwan.h | 2 ++ 2 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index 1508dc2a497b..b8c7843730ed 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -160,6 +160,42 @@ struct dentry *wwan_get_debugfs_dir(struct device *parent) return wwandev->debugfs_dir; } EXPORT_SYMBOL_GPL(wwan_get_debugfs_dir); + +static int wwan_dev_debugfs_match(struct device *dev, const void *dir) +{ + struct wwan_device *wwandev; + + if (dev->type != &wwan_dev_type) + return 0; + + wwandev = to_wwan_dev(dev); + + return wwandev->debugfs_dir == dir; +} + +static struct wwan_device *wwan_dev_get_by_debugfs(struct dentry *dir) +{ + struct device *dev; + + dev = class_find_device(wwan_class, NULL, dir, wwan_dev_debugfs_match); + if (!dev) + return ERR_PTR(-ENODEV); + + return to_wwan_dev(dev); +} + +void wwan_put_debugfs_dir(struct dentry *dir) +{ + struct wwan_device *wwandev = wwan_dev_get_by_debugfs(dir); + + if (WARN_ON(IS_ERR(wwandev))) + return; + + /* wwan_dev_get_by_debugfs() also got a reference */ + put_device(&wwandev->dev); + put_device(&wwandev->dev); +} +EXPORT_SYMBOL_GPL(wwan_put_debugfs_dir); #endif /* This function allocates and registers a new WWAN device OR if a WWAN device diff --git a/include/linux/wwan.h b/include/linux/wwan.h index afb3334ec8c5..5ce2acf444fb 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -174,11 +174,13 @@ void wwan_unregister_ops(struct device *parent); #ifdef CONFIG_WWAN_DEBUGFS struct dentry *wwan_get_debugfs_dir(struct device *parent); +void wwan_put_debugfs_dir(struct dentry *dir); #else static inline struct dentry *wwan_get_debugfs_dir(struct device *parent) { return ERR_PTR(-ENODEV); } +static inline void wwan_put_debugfs_dir(struct dentry *dir) {} #endif #endif /* __WWAN_H */ -- cgit v1.2.3 From 08bc13d8efe30258850ecdc5d4f38c0bc07689c0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Feb 2022 20:12:43 +0100 Subject: ieee80211: use tab to indent struct ieee80211_neighbor_ap_info Somehow spaces were used here, use tab instead. Link: https://lore.kernel.org/r/20220210201242.da8fa2e5ae8d.Ia452db01876e52e815f6337fef437049df0d8bd9@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 60ee7b3f58e7..a2940a853783 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4005,10 +4005,10 @@ static inline bool for_each_element_completed(const struct element *element, #define IEEE80211_RNR_TBTT_PARAMS_COLOC_AP 0x40 struct ieee80211_neighbor_ap_info { - u8 tbtt_info_hdr; - u8 tbtt_info_len; - u8 op_class; - u8 channel; + u8 tbtt_info_hdr; + u8 tbtt_info_len; + u8 op_class; + u8 channel; } __packed; enum ieee80211_range_params_max_total_ltf { -- cgit v1.2.3 From d61f4274daa41565b5f9ce589b4ba1239781c139 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 14 Feb 2022 17:29:21 +0100 Subject: ieee80211: add helper to check HE capability element size This element has a very dynamic structure, create a small helper function to validate its size. We're currently checking it in mac80211 in a conversion function, but that's actually slightly buggy. 
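As a usage sketch (the caller is hypothetical; only ieee80211_he_capa_size_ok() is from this patch), the helper is meant to run on the element body before any of the variable-length parts are dereferenced:

	static const struct ieee80211_he_cap_elem *
	my_parse_he_capa(const struct element *elem)
	{
		const u8 *data = elem->data + 1;	/* skip element ID extension */
		u8 len;

		if (elem->datalen < 1)
			return NULL;
		len = elem->datalen - 1;

		if (!ieee80211_he_capa_size_ok(data, len))
			return NULL;

		return (const void *)data;
	}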
Link: https://lore.kernel.org/r/20220214172920.750bee9eaf37.Ie18359bd38143b7dc949078f10752413e6d36854@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a2940a853783..dfc84680af82 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2021 Intel Corporation + * Copyright (c) 2018 - 2022 Intel Corporation */ #ifndef LINUX_IEEE80211_H @@ -2338,6 +2338,29 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) return n; } +static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_he_cap_elem *he_cap_ie_elem = (const void *)data; + u8 needed = sizeof(*he_cap_ie_elem); + + if (len < needed) + return false; + + needed += ieee80211_he_mcs_nss_size(he_cap_ie_elem); + if (len < needed) + return false; + + if (he_cap_ie_elem->phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { + if (len < needed + 1) + return false; + needed += ieee80211_he_ppe_size(data[needed], + he_cap_ie_elem->phy_cap_info); + } + + return len >= needed; +} + /* HE Operation defines */ #define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x00000007 #define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000008 -- cgit v1.2.3 From cbc1ca0a9d0a54cb8aa7c54c16e71599551e3f74 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:51 +0100 Subject: ieee80211: Add EHT (802.11be) definitions Based on Draft P802.11be_D1.4. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.928e23cacb2b.Id30a3ef2844b296efbd5486fe1da9ca36a95c5cf@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 299 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 299 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index dfc84680af82..7204bc28ca31 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -1925,6 +1926,111 @@ struct ieee80211_mu_edca_param_set { struct ieee80211_he_mu_edca_param_ac_rec ac_vo; } __packed; +#define IEEE80211_EHT_MCS_NSS_RX 0x0f +#define IEEE80211_EHT_MCS_NSS_TX 0xf0 + +/** + * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max + * supported NSS for per MCS. + * + * For each field below, bits 0 - 3 indicate the maximal number of spatial + * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams + * for Tx. + * + * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 0 - 7. + * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 8 - 9. + * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 10 - 11. 
+ * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 12 - 13. + */ +struct ieee80211_eht_mcs_nss_supp_20mhz_only { + u8 rx_tx_mcs7_max_nss; + u8 rx_tx_mcs9_max_nss; + u8 rx_tx_mcs11_max_nss; + u8 rx_tx_mcs13_max_nss; +}; + +/** + * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except + * 20MHz only stations). + * + * For each field below, bits 0 - 3 indicate the maximal number of spatial + * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams + * for Tx. + * + * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 0 - 9. + * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 10 - 11. + * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 12 - 13. + */ +struct ieee80211_eht_mcs_nss_supp_bw { + u8 rx_tx_mcs9_max_nss; + u8 rx_tx_mcs11_max_nss; + u8 rx_tx_mcs13_max_nss; +}; + +/** + * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data + * + * This structure is the "EHT Capabilities element" fixed fields as + * described in P802.11be_D1.4 section 9.4.2.313. + * + * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP* + * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP* + */ +struct ieee80211_eht_cap_elem_fixed { + u8 mac_cap_info[2]; + u8 phy_cap_info[9]; +} __packed; + +/** + * struct ieee80211_eht_cap_elem - EHT capabilities element + * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed + * @optional: optional parts + */ +struct ieee80211_eht_cap_elem { + struct ieee80211_eht_cap_elem_fixed fixed; + + /* + * Followed by: + * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets. + * EHT PPE Thresholds field: variable length. + */ + u8 optional[]; +} __packed; + +/** + * struct ieee80211_eht_operation - eht operation element + * + * This structure is the "EHT Operation Element" fields as + * described in P802.11be_D1.4 section 9.4.2.311 + * + * FIXME: The spec is unclear how big the fields are, and doesn't + * indicate the "Disabled Subchannel Bitmap Present" in the + * structure (Figure 9-1002a) at all ... 
+ */ +struct ieee80211_eht_operation { + u8 chan_width; + u8 ccfs; + u8 present_bm; + + u8 disable_subchannel_bitmap[]; +} __packed; + +#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT 0x1 + /* 802.11ac VHT Capabilities */ #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 0x00000001 @@ -2129,6 +2235,8 @@ enum ieee80211_client_reg_power { #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL 0x1e + #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe @@ -2622,6 +2730,194 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) #define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) +/* EHT MAC capabilities as defined in P802.11be_D1.4 section 9.4.2.313.2 */ +#define IEEE80211_EHT_MAC_CAP0_NSEP_PRIO_ACCESS 0x01 +#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL 0x02 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 0x04 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2 0x08 +#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT 0x10 +#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC 0x20 +#define IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_MASK 0xc0 +#define IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_3895 0 +#define IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_7991 1 +#define IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_11454 2 + +/* EHT PHY capabilities as defined in P802.11be_D1.4 section 9.4.2.313.3 */ +#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ 0x02 +#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI 0x08 +#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO 0x10 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER 0x20 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE 0x40 + +/* EHT beamformee number of spatial streams <= 80MHz is split */ +#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK 0x80 +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK 0x03 + +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK 0x1c +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK 0xe0 + +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK 0x07 +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK 0x38 + +/* EHT number of sounding dimensions for 320MHz is split */ +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK 0x01 +#define IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK 0x02 +#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK 0x04 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK 0x08 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK 0x10 +#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK 0x20 +#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK 0x40 +#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK 0x80 + +#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO 0x01 +#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP 0x04 +#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI 0x08 +#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK 0xf0 + +#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK 0x01 +#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP 0x04 
+#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT 0x08 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK 0x30 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US 0 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US 1 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US 2 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US 3 + +/* Maximum number of supported EHT LTF is split */ +#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 + +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 +#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 + +#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW 0x01 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ 0x02 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ 0x08 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ 0x10 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ 0x20 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ 0x40 +#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT 0x80 + +#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA 0x01 +#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA 0x02 + +/* + * EHT operation channel width as defined in P802.11be_D1.4 section 9.4.2.311 + */ +#define IEEE80211_EHT_OPER_CHAN_WIDTH 0x7 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ 0 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ 1 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ 2 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ 3 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ 4 + +/* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ +static inline u8 +ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, + const struct ieee80211_eht_cap_elem_fixed *eht_cap) +{ + u8 count = 0; + + /* on 2.4 GHz, if it supports 40 MHz, the result is 3 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) + return 3; + + /* on 2.4 GHz, these three bits are reserved, so should be 0 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) + count += 3; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 3; + + if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) + count += 3; + + return count ? count : 4; +} + +/* 802.11be EHT PPE Thresholds */ +#define IEEE80211_EHT_PPE_THRES_NSS_POS 0 +#define IEEE80211_EHT_PPE_THRES_NSS_MASK 0xf +#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK 0x1f0 +#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE 3 +#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE 9 + +/* + * Calculate 802.11be EHT capabilities IE EHT field size + */ +static inline u8 +ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u32 n; + + if (!(phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT)) + return 0; + + n = hweight16(ppe_thres_hdr & + IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= 1 + u16_get_bits(ppe_thres_hdr, IEEE80211_EHT_PPE_THRES_NSS_MASK); + + /* + * Each pair is 6 bits, and we need to add the 9 "header" bits to the + * total size. 
+ */ + n = n * IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 + + IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE; + return DIV_ROUND_UP(n, 8); +} + +static inline bool +ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) +{ + const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; + u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); + + if (len < needed || !he_capa) + return false; + + needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, + (const void *)data); + if (len < needed) + return false; + + if (elem->phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) { + u16 ppe_thres_hdr; + + if (len < needed + sizeof(ppe_thres_hdr)) + return false; + + ppe_thres_hdr = get_unaligned_le16(data + needed); + needed += ieee80211_eht_ppe_size(ppe_thres_hdr, + elem->phy_cap_info); + } + + return len >= needed; +} + +static inline bool +ieee80211_eht_oper_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_eht_operation *elem = (const void *)data; + u8 needed = sizeof(*elem); + + if (len < needed) + return false; + + if (elem->present_bm & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT) + needed += 2; + + return len >= needed; +} #define LISTEN_INT_USF GENMASK(15, 14) #define LISTEN_INT_UI GENMASK(13, 0) @@ -3077,6 +3373,9 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_SHORT_SSID_LIST = 58, WLAN_EID_EXT_HE_6GHZ_CAPA = 59, WLAN_EID_EXT_UL_MU_POWER_CAPA = 60, + WLAN_EID_EXT_EHT_OPERATION = 106, + WLAN_EID_EXT_EHT_MULTI_LINK = 107, + WLAN_EID_EXT_EHT_CAPABILITY = 108, }; /* Action category code */ -- cgit v1.2.3 From 2a2c86f15e17c5013b9897b67d895e64a25ae3cb Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Mon, 14 Feb 2022 17:29:52 +0100 Subject: ieee80211: add EHT 1K aggregation definitions We add the fields for parsing extended ADDBA request/respond, and new max 1K aggregation for limit ADDBA request/respond. Adjust drivers to use the proper macro, IEEE80211_MAX_AMPDU_BUF -> IEEE80211_MAX_AMPDU_BUF_HE. 
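To illustrate how the new extension bits compose with the existing 10-bit buffer-size field, a hypothetical decode helper follows (the masks, shift, and the 1K limit are from this patch; u8_get_bits()/u16_get_bits() come from linux/bitfield.h):

	static u16 my_addba_buf_size(u16 capab,
				     const struct ieee80211_addba_ext_ie *ext)
	{
		u16 buf_size = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK);

		/* bits 5-7 of the extension element extend the field to bits 10-12 */
		if (ext)
			buf_size |= u8_get_bits(ext->data,
						IEEE80211_ADDBA_EXT_BUF_SIZE_MASK)
				    << IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT;

		return min_t(u16, buf_size, IEEE80211_MAX_AMPDU_BUF_EHT);
	}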
Signed-off-by: Mordechay Goodstein Link: https://lore.kernel.org/r/20220214173004.b8b447ce95b7.I0ee2554c94e89abc7a752b0f7cc7fd79c273efea@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 4 ++-- drivers/net/wireless/mediatek/mt76/mt7915/init.c | 4 ++-- include/linux/ieee80211.h | 6 +++++- net/mac80211/agg-rx.c | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 90fcd6adf2d5..bd40f4c1183a 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -8448,7 +8448,7 @@ static int __ath11k_mac_register(struct ath11k *ar) ar->hw->queues = ATH11K_HW_MAX_QUEUES; ar->hw->wiphy->tx_queue_len = ATH11K_QUEUE_LEN; ar->hw->offchannel_tx_hw_queue = ATH11K_HW_MAX_QUEUES - 1; - ar->hw->max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; + ar->hw->max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HE; ar->hw->vif_data_size = sizeof(struct ath11k_vif); ar->hw->sta_data_size = sizeof(struct ath11k_sta); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 87630d38dc52..5da9d98043fd 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -1077,12 +1077,12 @@ iwl_op_mode_mvm_start(struct iwl_trans *trans, const struct iwl_cfg *cfg, if (!hw) return NULL; - hw->max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; + hw->max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HE; if (cfg->max_tx_agg_size) hw->max_tx_aggregation_subframes = cfg->max_tx_agg_size; else - hw->max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; + hw->max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HE; op_mode = hw->priv; diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c index 705f362b8f7b..4cf299ea11f5 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c @@ -311,8 +311,8 @@ mt7915_init_wiphy(struct ieee80211_hw *hw) struct mt7915_dev *dev = phy->dev; hw->queues = 4; - hw->max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; - hw->max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF; + hw->max_rx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HE; + hw->max_tx_aggregation_subframes = IEEE80211_MAX_AMPDU_BUF_HE; hw->netdev_features = NETIF_F_RXCSUM; hw->radiotap_timestamp.units_pos = diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 7204bc28ca31..a4143466cb32 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1024,6 +1024,8 @@ struct ieee80211_tpc_report_ie { #define IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK GENMASK(2, 1) #define IEEE80211_ADDBA_EXT_FRAG_LEVEL_SHIFT 1 #define IEEE80211_ADDBA_EXT_NO_FRAG BIT(0) +#define IEEE80211_ADDBA_EXT_BUF_SIZE_MASK GENMASK(7, 5) +#define IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT 10 struct ieee80211_addba_ext_ie { u8 data; @@ -1698,10 +1700,12 @@ struct ieee80211_ht_operation { * A-MPDU buffer sizes * According to HT size varies from 8 to 64 frames * HE adds the ability to have up to 256 frames. + * EHT adds the ability to have up to 1K frames. 
*/ #define IEEE80211_MIN_AMPDU_BUF 0x8 #define IEEE80211_MAX_AMPDU_BUF_HT 0x40 -#define IEEE80211_MAX_AMPDU_BUF 0x100 +#define IEEE80211_MAX_AMPDU_BUF_HE 0x100 +#define IEEE80211_MAX_AMPDU_BUF_EHT 0x400 /* Spatial Multiplexing Power Save Modes (for capability) */ diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 7d2925bb966e..0d2bab9d351c 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -310,7 +310,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, } if (sta->sta.he_cap.has_he) - max_buf_size = IEEE80211_MAX_AMPDU_BUF; + max_buf_size = IEEE80211_MAX_AMPDU_BUF_HE; else max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT; -- cgit v1.2.3 From 091296d30917f6e63a9402974f546412dfb2a8e6 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Sat, 5 Feb 2022 11:21:36 +0200 Subject: iwlwifi: mvm: refactor setting PPE thresholds in STA_HE_CTXT_CMD We are setting the PPE Thresholds in STA_HE_CTXT_CMD according to HE PHY Capabilities IE. As EHT is introduced, we will have to set these thresholds according to EHT PHY Capabilities IE if we're in an EHT connection. Some parts of the code can be used for both HE and EHT. Put these parts in functions which will be used in the patch which adds support for EHT PPE. Signed-off-by: Miri Korenblit Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220205112029.48a508dfffef.If392e44d88f96ebed7fadf827e327194d4bd97b1@changeid Signed-off-by: Luca Coelho --- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 204 ++++++++++++---------- include/linux/ieee80211.h | 1 + 2 files changed, 114 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 1d74e8b3576a..8b0124a40ee9 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2012-2014, 2018-2021 Intel Corporation + * Copyright (C) 2012-2014, 2018-2022 Intel Corporation * Copyright (C) 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2016-2017 Intel Deutschland GmbH */ @@ -2086,6 +2086,103 @@ static u8 iwl_mvm_he_get_ppe_val(u8 *ppe, u8 ppe_pos_bit) return res; } +static void iwl_mvm_parse_ppe(struct iwl_mvm *mvm, + struct iwl_he_pkt_ext_v2 *pkt_ext, u8 nss, + u8 ru_index_bitmap, u8 *ppe, u8 ppe_pos_bit) +{ + int i; + + /* + * FW currently supports only nss == MAX_HE_SUPP_NSS + * + * If nss > MAX: we can ignore values we don't support + * If nss < MAX: we can set zeros in other streams + */ + if (nss > MAX_HE_SUPP_NSS) { + IWL_INFO(mvm, "Got NSS = %d - trimming to %d\n", nss, + MAX_HE_SUPP_NSS); + nss = MAX_HE_SUPP_NSS; + } + + for (i = 0; i < nss; i++) { + u8 ru_index_tmp = ru_index_bitmap << 1; + u8 low_th = IWL_HE_PKT_EXT_NONE, high_th = IWL_HE_PKT_EXT_NONE; + u8 bw; + + for (bw = 0; + bw < ARRAY_SIZE(pkt_ext->pkt_ext_qam_th[i]); + bw++) { + ru_index_tmp >>= 1; + + if (!(ru_index_tmp & 1)) + continue; + + high_th = iwl_mvm_he_get_ppe_val(ppe, ppe_pos_bit); + ppe_pos_bit += IEEE80211_PPE_THRES_INFO_PPET_SIZE; + low_th = iwl_mvm_he_get_ppe_val(ppe, ppe_pos_bit); + ppe_pos_bit += IEEE80211_PPE_THRES_INFO_PPET_SIZE; + + pkt_ext->pkt_ext_qam_th[i][bw][0] = low_th; + pkt_ext->pkt_ext_qam_th[i][bw][1] = high_th; + } + } +} + +static void iwl_mvm_set_pkt_ext_from_he_ppe(struct iwl_mvm *mvm, + struct ieee80211_sta *sta, + struct iwl_he_pkt_ext_v2 *pkt_ext) +{ + u8 nss = (sta->he_cap.ppe_thres[0]
& IEEE80211_PPE_THRES_NSS_MASK) + 1; + u8 *ppe = &sta->he_cap.ppe_thres[0]; + u8 ru_index_bitmap = + u8_get_bits(*ppe, + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + /* Starting after PPE header */ + u8 ppe_pos_bit = IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE; + + iwl_mvm_parse_ppe(mvm, pkt_ext, nss, ru_index_bitmap, ppe, ppe_pos_bit); +} + +static void iwl_mvm_set_pkt_ext_from_nominal_padding(struct iwl_he_pkt_ext_v2 *pkt_ext, + u8 nominal_padding, + u32 *flags) +{ + int low_th = -1; + int high_th = -1; + int i; + + switch (nominal_padding) { + case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US: + low_th = IWL_HE_PKT_EXT_NONE; + high_th = IWL_HE_PKT_EXT_NONE; + break; + case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US: + low_th = IWL_HE_PKT_EXT_BPSK; + high_th = IWL_HE_PKT_EXT_NONE; + break; + case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US: + low_th = IWL_HE_PKT_EXT_NONE; + high_th = IWL_HE_PKT_EXT_BPSK; + break; + } + + /* Set the PPE thresholds accordingly */ + if (low_th >= 0 && high_th >= 0) { + for (i = 0; i < MAX_HE_SUPP_NSS; i++) { + u8 bw; + + for (bw = 0; + bw < ARRAY_SIZE(pkt_ext->pkt_ext_qam_th[i]); + bw++) { + pkt_ext->pkt_ext_qam_th[i][bw][0] = low_th; + pkt_ext->pkt_ext_qam_th[i][bw][1] = high_th; + } + } + + *flags |= STA_CTXT_HE_PACKET_EXT; + } +} + static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm, struct ieee80211_vif *vif, u8 sta_id) { @@ -2195,100 +2292,25 @@ static void iwl_mvm_cfg_he_sta(struct iwl_mvm *mvm, * Initialize the PPE thresholds to "None" (7), as described in Table * 9-262ac of 80211.ax/D3.0. */ - memset(&sta_ctxt_cmd.pkt_ext, 7, sizeof(sta_ctxt_cmd.pkt_ext)); + memset(&sta_ctxt_cmd.pkt_ext, IWL_HE_PKT_EXT_NONE, + sizeof(sta_ctxt_cmd.pkt_ext)); /* If PPE Thresholds exist, parse them into a FW-familiar format. 
*/ if (sta->he_cap.he_cap_elem.phy_cap_info[6] & - IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { - u8 nss = (sta->he_cap.ppe_thres[0] & - IEEE80211_PPE_THRES_NSS_MASK) + 1; - u8 ru_index_bitmap = - (sta->he_cap.ppe_thres[0] & - IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK) >> - IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS; - u8 *ppe = &sta->he_cap.ppe_thres[0]; - u8 ppe_pos_bit = 7; /* Starting after PPE header */ - - /* - * FW currently supports only nss == MAX_HE_SUPP_NSS - * - * If nss > MAX: we can ignore values we don't support - * If nss < MAX: we can set zeros in other streams - */ - if (nss > MAX_HE_SUPP_NSS) { - IWL_INFO(mvm, "Got NSS = %d - trimming to %d\n", nss, - MAX_HE_SUPP_NSS); - nss = MAX_HE_SUPP_NSS; - } - - for (i = 0; i < nss; i++) { - u8 ru_index_tmp = ru_index_bitmap << 1; - u8 bw; - - for (bw = 0; - bw < ARRAY_SIZE(sta_ctxt_cmd.pkt_ext.pkt_ext_qam_th[i]); - bw++) { - ru_index_tmp >>= 1; - if (!(ru_index_tmp & 1)) - continue; - - sta_ctxt_cmd.pkt_ext.pkt_ext_qam_th[i][bw][1] = - iwl_mvm_he_get_ppe_val(ppe, - ppe_pos_bit); - ppe_pos_bit += - IEEE80211_PPE_THRES_INFO_PPET_SIZE; - sta_ctxt_cmd.pkt_ext.pkt_ext_qam_th[i][bw][0] = - iwl_mvm_he_get_ppe_val(ppe, - ppe_pos_bit); - ppe_pos_bit += - IEEE80211_PPE_THRES_INFO_PPET_SIZE; - } - } - + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { + iwl_mvm_set_pkt_ext_from_he_ppe(mvm, sta, + &sta_ctxt_cmd.pkt_ext); flags |= STA_CTXT_HE_PACKET_EXT; - } else if (u8_get_bits(sta->he_cap.he_cap_elem.phy_cap_info[9], - IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK) - != IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED) { - int low_th = -1; - int high_th = -1; - - /* Take the PPE thresholds from the nominal padding info */ - switch (u8_get_bits(sta->he_cap.he_cap_elem.phy_cap_info[9], - IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK)) { - case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US: - low_th = IWL_HE_PKT_EXT_NONE; - high_th = IWL_HE_PKT_EXT_NONE; - break; - case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US: - low_th = IWL_HE_PKT_EXT_BPSK; - high_th = IWL_HE_PKT_EXT_NONE; - break; - case IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US: - low_th = IWL_HE_PKT_EXT_NONE; - high_th = IWL_HE_PKT_EXT_BPSK; - break; - } - - /* Set the PPE thresholds accordingly */ - if (low_th >= 0 && high_th >= 0) { - struct iwl_he_pkt_ext_v2 *pkt_ext = - &sta_ctxt_cmd.pkt_ext; - - for (i = 0; i < MAX_HE_SUPP_NSS; i++) { - u8 bw; - - for (bw = 0; - bw < ARRAY_SIZE(pkt_ext->pkt_ext_qam_th[i]); - bw++) { - pkt_ext->pkt_ext_qam_th[i][bw][0] = - low_th; - pkt_ext->pkt_ext_qam_th[i][bw][1] = - high_th; - } - } - - flags |= STA_CTXT_HE_PACKET_EXT; - } + /* PPE Thresholds doesn't exist - set the API PPE values + * according to Common Nominal Packet Padding fiels. 
*/ + } else { + u8 nominal_padding = + u8_get_bits(sta->he_cap.he_cap_elem.phy_cap_info[9], + IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK); + if (nominal_padding != IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED) + iwl_mvm_set_pkt_ext_from_nominal_padding(&sta_ctxt_cmd.pkt_ext, + nominal_padding, + &flags); } if (sta->he_cap.he_cap_elem.mac_cap_info[2] & diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a4143466cb32..75d40acb60c1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2421,6 +2421,7 @@ ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap) #define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK 0x78 #define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS (3) #define IEEE80211_PPE_THRES_INFO_PPET_SIZE (3) +#define IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE (7) /* * Calculate 802.11ax HE capabilities IE PPE field size -- cgit v1.2.3 From 47f0bd5032106469827cf56c8b45bb9101112105 Mon Sep 17 00:00:00 2001 From: Jacques de Laval Date: Thu, 17 Feb 2022 16:02:02 +0100 Subject: net: Add new protocol attribute to IP addresses This patch adds a new protocol attribute to IPv4 and IPv6 addresses. Inspiration was taken from the protocol attribute of routes. User space applications like iproute2 can set/get the protocol with the Netlink API. The attribute is stored as an 8-bit unsigned integer. The protocol attribute is set by kernel for these categories: - IPv4 and IPv6 loopback addresses - IPv6 addresses generated from router announcements - IPv6 link local addresses User space may pass custom protocols, not defined by the kernel. Grouping addresses on their origin is useful in scenarios where you want to distinguish between addresses based on who added them, e.g. kernel vs. user space. Tagging addresses with a string label is an existing feature that could be used as a solution. Unfortunately the max length of a label is 15 characters, and for compatibility reasons the label must be prefixed with the name of the device followed by a colon. Since device names also have a max length of 15 characters, only -1 characters is guaranteed to be available for any origin tag, which is not that much. 
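To make the new attribute concrete, below is a minimal user-space sketch (not the reference implementation) that adds an IPv4 address with a custom IFA_PROTO value over raw rtnetlink. The interface index, prefix, address and the protocol value 42 are placeholder assumptions, and uapi headers that already contain this patch's IFA_PROTO definition are assumed:

#include <arpa/inet.h>
#include <linux/if_addr.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Append one attribute to the request and bump nlmsg_len. */
static void add_rta(struct nlmsghdr *nh, unsigned short type,
		    const void *data, unsigned short len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nh +
					       NLMSG_ALIGN(nh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nh;
		struct ifaddrmsg ifa;
		char attrs[64];
	} req = {
		.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
		.nh.nlmsg_type = RTM_NEWADDR,
		.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL,
		.ifa.ifa_family = AF_INET,
		.ifa.ifa_prefixlen = 24,
		.ifa.ifa_index = 2,		/* placeholder ifindex */
	};
	unsigned char proto = 42;	/* hypothetical user-defined protocol */
	struct in_addr addr;
	int fd, ret;

	inet_pton(AF_INET, "192.0.2.1", &addr);	/* documentation address */
	add_rta(&req.nh, IFA_LOCAL, &addr, sizeof(addr));
	/* The new attribute: a bare u8, stored in ifa_proto by the kernel. */
	add_rta(&req.nh, IFA_PROTO, &proto, sizeof(proto));

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	ret = sendto(fd, &req, req.nh.nlmsg_len, 0,
		     (struct sockaddr *)&sa, sizeof(sa));
	close(fd);
	return ret < 0;
}

This needs CAP_NET_ADMIN to run; the IFAPROT_KERNEL_* values listed below are what the kernel itself sets, so a custom user-space protocol would presumably pick a different value.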
A reference implementation of user space setting and getting protocols is available for iproute2: https://github.com/westermo/iproute2/commit/9a6ea18bd79f47f293e5edc7780f315ea42ff540 Signed-off-by: Jacques de Laval Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220217150202.80802-1-Jacques.De.Laval@westermo.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 1 + include/net/addrconf.h | 2 ++ include/net/if_inet6.h | 2 ++ include/uapi/linux/if_addr.h | 9 ++++++++- net/ipv4/devinet.c | 7 +++++++ net/ipv6/addrconf.c | 27 +++++++++++++++++++++------ 6 files changed, 41 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 674aeead6260..ead323243e7b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -150,6 +150,7 @@ struct in_ifaddr { __be32 ifa_broadcast; unsigned char ifa_scope; unsigned char ifa_prefixlen; + unsigned char ifa_proto; __u32 ifa_flags; char ifa_label[IFNAMSIZ]; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 59940e230b78..f7506f08e505 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -64,6 +64,8 @@ struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; + u8 ifa_proto; + const struct in6_addr *peer_pfx; u32 rt_priority; diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index f026cf08a8e8..4cfdef6ca4f6 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -71,6 +71,8 @@ struct inet6_ifaddr { bool tokenized; + u8 ifa_proto; + struct rcu_head rcu; struct in6_addr peer_addr; }; diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index dfcf3ce0097f..1c392dd95a5e 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -33,8 +33,9 @@ enum { IFA_CACHEINFO, IFA_MULTICAST, IFA_FLAGS, - IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ + IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ IFA_TARGET_NETNSID, + IFA_PROTO, /* u8, address protocol */ __IFA_MAX, }; @@ -69,4 +70,10 @@ struct ifa_cacheinfo { #define IFA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifaddrmsg)) #endif +/* ifa_proto */ +#define IFAPROT_UNSPEC 0 +#define IFAPROT_KERNEL_LO 1 /* loopback */ +#define IFAPROT_KERNEL_RA 2 /* set by kernel from router announcement */ +#define IFAPROT_KERNEL_LL 3 /* link-local set by kernel */ + #endif diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index fba2bffd65f7..53a6b14dc50a 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -104,6 +104,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, + [IFA_PROTO] = { .type = NLA_U8 }, }; struct inet_fill_args { @@ -889,6 +890,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); + if (tb[IFA_PROTO]) + ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); + if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; @@ -1625,6 +1629,7 @@ static size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } @@ -1699,6 +1704,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, 
nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || + (ifa->ifa_proto && + nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 2f307da17f21..85bab3a35709 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1115,6 +1115,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, ifa->prefix_len = cfg->plen; ifa->rt_priority = cfg->rt_priority; ifa->flags = cfg->ifa_flags; + ifa->ifa_proto = cfg->ifa_proto; /* No need to add the TENTATIVE flag for addresses with NODAD */ if (!(cfg->ifa_flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; @@ -2593,6 +2594,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, .valid_lft = valid_lft, .preferred_lft = prefered_lft, .scope = addr_type & IPV6_ADDR_SCOPE_MASK, + .ifa_proto = IFAPROT_KERNEL_RA }; #ifdef CONFIG_IPV6_OPTIMISTIC_DAD @@ -3077,7 +3079,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) } static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, - int plen, int scope) + int plen, int scope, u8 proto) { struct inet6_ifaddr *ifp; struct ifa6_config cfg = { @@ -3086,7 +3088,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, .ifa_flags = IFA_F_PERMANENT, .valid_lft = INFINITY_LIFE_TIME, .preferred_lft = INFINITY_LIFE_TIME, - .scope = scope + .scope = scope, + .ifa_proto = proto }; ifp = ipv6_add_addr(idev, &cfg, true, NULL); @@ -3131,7 +3134,7 @@ static void add_v4_addrs(struct inet6_dev *idev) } if (addr.s6_addr32[3]) { - add_addr(idev, &addr, plen, scope); + add_addr(idev, &addr, plen, scope, IFAPROT_UNSPEC); addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags, GFP_KERNEL); return; @@ -3154,7 +3157,8 @@ static void add_v4_addrs(struct inet6_dev *idev) flag |= IFA_HOST; } - add_addr(idev, &addr, plen, flag); + add_addr(idev, &addr, plen, flag, + IFAPROT_UNSPEC); addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags, GFP_KERNEL); } @@ -3177,7 +3181,7 @@ static void init_loopback(struct net_device *dev) return; } - add_addr(idev, &in6addr_loopback, 128, IFA_HOST); + add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFAPROT_KERNEL_LO); } void addrconf_add_linklocal(struct inet6_dev *idev, @@ -3189,7 +3193,8 @@ void addrconf_add_linklocal(struct inet6_dev *idev, .ifa_flags = flags | IFA_F_PERMANENT, .valid_lft = INFINITY_LIFE_TIME, .preferred_lft = INFINITY_LIFE_TIME, - .scope = IFA_LINK + .scope = IFA_LINK, + .ifa_proto = IFAPROT_KERNEL_LL }; struct inet6_ifaddr *ifp; @@ -4627,6 +4632,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { [IFA_FLAGS] = { .len = sizeof(u32) }, [IFA_RT_PRIORITY] = { .len = sizeof(u32) }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, + [IFA_PROTO] = { .type = NLA_U8 }, }; static int @@ -4752,6 +4758,7 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, ifp->tstamp = jiffies; ifp->valid_lft = cfg->valid_lft; ifp->prefered_lft = cfg->preferred_lft; + ifp->ifa_proto = cfg->ifa_proto; if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority) ifp->rt_priority = cfg->rt_priority; @@ -4845,6 +4852,9 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFA_RT_PRIORITY]) cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); + if (tb[IFA_PROTO]) + cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]); + 
cfg.valid_lft = INFINITY_LIFE_TIME; cfg.preferred_lft = INFINITY_LIFE_TIME; @@ -4948,6 +4958,7 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(16) /* IFA_ADDRESS */ + nla_total_size(sizeof(struct ifa_cacheinfo)) + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */; } @@ -5025,6 +5036,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0) goto error; + if (ifa->ifa_proto && + nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) + goto error; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 64b4a0f8b51b20e0c9dbff7748365994364d5f01 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 19 Feb 2022 11:47:22 +0000 Subject: net: phylink: remove phylink_config's pcs_poll phylink_config's pcs_poll is no longer used, let's get rid of it. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 3 +-- include/linux/phylink.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 5b53a3e23c89..26f1219a005f 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1300,7 +1300,7 @@ void phylink_set_pcs(struct phylink *pl, struct phylink_pcs *pcs) if (!pl->phylink_disable_state && pl->cfg_link_an_mode == MLO_AN_INBAND) { - if (pl->config->pcs_poll || pcs->poll) + if (pcs->poll) mod_timer(&pl->link_poll, jiffies + HZ); else del_timer(&pl->link_poll); @@ -1673,7 +1673,6 @@ void phylink_start(struct phylink *pl) poll |= pl->config->poll_fixed_state; break; case MLO_AN_INBAND: - poll |= pl->config->pcs_poll; if (pl->pcs) poll |= pl->pcs->poll; break; diff --git a/include/linux/phylink.h b/include/linux/phylink.h index cca149f78d35..9ef9b7047f19 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -86,7 +86,6 @@ enum phylink_op_type { * @type: operation type of PHYLINK instance * @legacy_pre_march2020: driver has not been updated for March 2020 updates * (See commit 7cceb599d15d ("net: phylink: avoid mac_config calls") - * @pcs_poll: MAC PCS cannot provide link change interrupt * @poll_fixed_state: if true, starts link_poll, * if MAC link is at %MLO_AN_FIXED mode. * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND @@ -100,7 +99,6 @@ struct phylink_config { struct device *dev; enum phylink_op_type type; bool legacy_pre_march2020; - bool pcs_poll; bool poll_fixed_state; bool ovr_an_inband; void (*get_fixed_state)(struct phylink_config *config, -- cgit v1.2.3 From 643b622b51f1f0015e0a80f90b4ef9032e6ddb1b Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:32 +0800 Subject: net: tcp: add skb drop reasons to tcp_v{4,6}_inbound_md5_hash() Pass the address of drop reason to tcp_v4_inbound_md5_hash() and tcp_v6_inbound_md5_hash() to store the reasons for skb drops when this function fails. Therefore, the drop reason can be passed to kfree_skb_reason() when the skb needs to be freed. Following drop reasons are added: SKB_DROP_REASON_TCP_MD5NOTFOUND SKB_DROP_REASON_TCP_MD5UNEXPECTED SKB_DROP_REASON_TCP_MD5FAILURE SKB_DROP_REASON_TCP_MD5* above correspond to LINUX_MIB_TCPMD5* Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 12 ++++++++++++ include/trace/events/skb.h | 4 ++++ net/ipv4/tcp_ipv4.c | 13 +++++++++---- net/ipv6/tcp_ipv6.c | 11 ++++++++--- 4 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a5adbf6b51e8..46678eb587ff 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -346,6 +346,18 @@ enum skb_drop_reason { * udp packet drop out of * udp_memory_allocated. */ + SKB_DROP_REASON_TCP_MD5NOTFOUND, /* no MD5 hash and one + * expected, corresponding + * to LINUX_MIB_TCPMD5NOTFOUND + */ + SKB_DROP_REASON_TCP_MD5UNEXPECTED, /* MD5 hash and we're not + * expecting one, corresponding + * to LINUX_MIB_TCPMD5UNEXPECTED + */ + SKB_DROP_REASON_TCP_MD5FAILURE, /* MD5 hash and its wrong, + * corresponding to + * LINUX_MIB_TCPMD5FAILURE + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index cfcfd26399f7..46c06b0be850 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -27,6 +27,10 @@ EM(SKB_DROP_REASON_IP_NOPROTO, IP_NOPROTO) \ EM(SKB_DROP_REASON_SOCKET_RCVBUFF, SOCKET_RCVBUFF) \ EM(SKB_DROP_REASON_PROTO_MEM, PROTO_MEM) \ + EM(SKB_DROP_REASON_TCP_MD5NOTFOUND, TCP_MD5NOTFOUND) \ + EM(SKB_DROP_REASON_TCP_MD5UNEXPECTED, \ + TCP_MD5UNEXPECTED) \ + EM(SKB_DROP_REASON_TCP_MD5FAILURE, TCP_MD5FAILURE) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3beab01e9a7..d3c417119057 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1412,7 +1412,8 @@ EXPORT_SYMBOL(tcp_v4_md5_hash_skb); /* Called with rcu_read_lock() */ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - int dif, int sdif) + int dif, int sdif, + enum skb_drop_reason *reason) { #ifdef CONFIG_TCP_MD5SIG /* @@ -1445,11 +1446,13 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, return false; if (hash_expected && !hash_location) { + *reason = SKB_DROP_REASON_TCP_MD5NOTFOUND; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return true; } if (!hash_expected && hash_location) { + *reason = SKB_DROP_REASON_TCP_MD5UNEXPECTED; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return true; } @@ -1462,6 +1465,7 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { + *reason = SKB_DROP_REASON_TCP_MD5FAILURE; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", &iph->saddr, ntohs(th->source), @@ -1971,13 +1975,13 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); + enum skb_drop_reason drop_reason; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; struct sock *sk; - int drop_reason; int ret; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -2025,7 +2029,8 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif, + &drop_reason))) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -2099,7 +2104,7 @@ process: goto discard_and_relse; } - if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) + if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif, &drop_reason)) goto discard_and_relse; nf_reset_ct(skb); 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0aa17073df1a..1262b790b146 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -775,7 +775,8 @@ clear_hash_noput: static bool tcp_v6_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - int dif, int sdif) + int dif, int sdif, + enum skb_drop_reason *reason) { #ifdef CONFIG_TCP_MD5SIG const __u8 *hash_location = NULL; @@ -798,11 +799,13 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, return false; if (hash_expected && !hash_location) { + *reason = SKB_DROP_REASON_TCP_MD5NOTFOUND; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return true; } if (!hash_expected && hash_location) { + *reason = SKB_DROP_REASON_TCP_MD5UNEXPECTED; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return true; } @@ -813,6 +816,7 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { + *reason = SKB_DROP_REASON_TCP_MD5FAILURE; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u L3 index %d\n", genhash ? "failed" : "mismatch", @@ -1681,7 +1685,8 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) { + if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif, + &drop_reason)) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1752,7 +1757,7 @@ process: goto discard_and_relse; } - if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif)) + if (tcp_v6_inbound_md5_hash(sk, skb, dif, sdif, &drop_reason)) goto discard_and_relse; if (tcp_filter(sk, skb)) { -- cgit v1.2.3 From 7a26dc9e7b43f5a24c4b843713e728582adf1c38 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:33 +0800 Subject: net: tcp: add skb drop reasons to tcp_add_backlog() Pass the address of drop_reason to tcp_add_backlog() to store the reason for the skb drop when it fails. Following drop reasons are introduced: SKB_DROP_REASON_SOCKET_BACKLOG Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 4 ++++ include/net/tcp.h | 3 ++- include/trace/events/skb.h | 1 + net/ipv4/tcp_ipv4.c | 7 +++++-- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 46678eb587ff..f7f33c79945b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -358,6 +358,10 @@ enum skb_drop_reason { * corresponding to * LINUX_MIB_TCPMD5FAILURE */ + SKB_DROP_REASON_SOCKET_BACKLOG, /* failed to add skb to socket + * backlog (see + * LINUX_MIB_TCPBACKLOGDROP) + */ SKB_DROP_REASON_MAX, }; diff --git a/include/net/tcp.h b/include/net/tcp.h index eff2487d972d..04f4650e0ff0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1367,7 +1367,8 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) __skb_checksum_complete(skb); } -bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason); #ifdef CONFIG_INET void __sk_defer_free_flush(struct sock *sk); diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 46c06b0be850..bfccd77e9071 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -31,6 +31,7 @@ EM(SKB_DROP_REASON_TCP_MD5UNEXPECTED, \ TCP_MD5UNEXPECTED) \ EM(SKB_DROP_REASON_TCP_MD5FAILURE, TCP_MD5FAILURE) \ + EM(SKB_DROP_REASON_SOCKET_BACKLOG, SOCKET_BACKLOG) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d3c417119057..cbca8637ba2f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1811,7 +1811,8 @@ int tcp_v4_early_demux(struct sk_buff *skb) return 0; } -bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason) { u32 limit, tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; @@ -1837,6 +1838,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); trace_tcp_bad_csum(skb); + *reason = SKB_DROP_REASON_TCP_CSUM; __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); return true; @@ -1925,6 +1927,7 @@ no_coalesce: if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); + *reason = SKB_DROP_REASON_SOCKET_BACKLOG; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); return true; } @@ -2133,7 +2136,7 @@ process: if (!sock_owned_by_user(sk)) { ret = tcp_v4_do_rcv(sk, skb); } else { - if (tcp_add_backlog(sk, skb)) + if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1262b790b146..abf0ad547858 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1784,7 +1784,7 @@ process: if (!sock_owned_by_user(sk)) { ret = tcp_v6_do_rcv(sk, skb); } else { - if (tcp_add_backlog(sk, skb)) + if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); -- cgit v1.2.3 From 2a968ef60e1fac4e694d9f60ce19a3b66b40e8c3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:35 +0800 Subject: net: tcp: use tcp_drop_reason() for tcp_rcv_established() Replace tcp_drop() used in tcp_rcv_established() with tcp_drop_reason(). 
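All of these conversions follow the same shape: the receive path initializes a function-scoped reason to SKB_DROP_REASON_NOT_SPECIFIED, refines it at each failure site, and consumes it at a single discard label. Below is a self-contained mock of that pattern; every type and helper here is a simplified stand-in, not the kernel's (the real tcp_drop_reason() ends up in kfree_skb_reason(), which attaches the reason to the kfree_skb tracepoint):

#include <stdbool.h>
#include <stdio.h>

enum skb_drop_reason {
	SKB_DROP_REASON_NOT_SPECIFIED,
	SKB_DROP_REASON_PKT_TOO_SMALL,
	SKB_DROP_REASON_TCP_CSUM,
	SKB_DROP_REASON_TCP_FLAGS,
};

struct sk_buff { bool hdr_ok, csum_ok, flags_ok; };

/* Stand-in for tcp_drop_reason(). */
static void tcp_drop_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
	printf("dropped skb, reason=%d\n", reason);
}

static void rcv_established_sketch(struct sk_buff *skb)
{
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;

	if (!skb->hdr_ok) {		/* header too small */
		reason = SKB_DROP_REASON_PKT_TOO_SMALL;
		goto discard;
	}
	if (!skb->csum_ok) {		/* checksum failure */
		reason = SKB_DROP_REASON_TCP_CSUM;
		goto discard;
	}
	if (!skb->flags_ok) {		/* !ack && !rst && !syn */
		reason = SKB_DROP_REASON_TCP_FLAGS;
		goto discard;
	}
	printf("delivered\n");
	return;

discard:
	tcp_drop_reason(skb, reason);	/* one exit point, precise reason */
}

int main(void)
{
	struct sk_buff skb = { .hdr_ok = true, .csum_ok = true };

	rcv_established_sketch(&skb);	/* prints reason=3 (TCP_FLAGS) */
	return 0;
}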
Following drop reasons are added: SKB_DROP_REASON_TCP_FLAGS Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + net/ipv4/tcp_input.c | 9 +++++++-- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f7f33c79945b..671db9f49efe 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -362,6 +362,7 @@ enum skb_drop_reason { * backlog (see * LINUX_MIB_TCPBACKLOGDROP) */ + SKB_DROP_REASON_TCP_FLAGS, /* TCP flags invalid */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index bfccd77e9071..d332e7313a61 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -32,6 +32,7 @@ TCP_MD5UNEXPECTED) \ EM(SKB_DROP_REASON_TCP_MD5FAILURE, TCP_MD5FAILURE) \ EM(SKB_DROP_REASON_SOCKET_BACKLOG, SOCKET_BACKLOG) \ + EM(SKB_DROP_REASON_TCP_FLAGS, TCP_FLAGS) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0f41fdfd8c27..2d8495a62bc4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5787,6 +5787,7 @@ discard: */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; const struct tcphdr *th = (const struct tcphdr *)skb->data; struct tcp_sock *tp = tcp_sk(sk); unsigned int len = skb->len; @@ -5875,6 +5876,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; return; } else { /* Header too small */ + reason = SKB_DROP_REASON_PKT_TOO_SMALL; TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -5930,8 +5932,10 @@ slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; - if (!th->ack && !th->rst && !th->syn) + if (!th->ack && !th->rst && !th->syn) { + reason = SKB_DROP_REASON_TCP_FLAGS; goto discard; + } /* * Standard slow path. @@ -5957,12 +5961,13 @@ step5: return; csum_error: + reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); discard: - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, reason); } EXPORT_SYMBOL(tcp_rcv_established); -- cgit v1.2.3 From a7ec381049c0d1f03e342063d75f5c3b314d0ec2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:36 +0800 Subject: net: tcp: use tcp_drop_reason() for tcp_data_queue() Replace tcp_drop() used in tcp_data_queue() with tcp_drop_reason(). Following drop reasons are introduced: SKB_DROP_REASON_TCP_ZEROWINDOW SKB_DROP_REASON_TCP_OLD_DATA SKB_DROP_REASON_TCP_OVERWINDOW SKB_DROP_REASON_TCP_OLD_DATA is used for the case where the end_seq of the skb is less than the left edge of the receive window. (Maybe there is a better name?) Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 13 +++++++++++++ include/trace/events/skb.h | 3 +++ net/ipv4/tcp_input.c | 13 +++++++++++-- 3 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 671db9f49efe..554ef2c848ee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -363,6 +363,19 @@ enum skb_drop_reason { * LINUX_MIB_TCPBACKLOGDROP) */ SKB_DROP_REASON_TCP_FLAGS, /* TCP flags invalid */ + SKB_DROP_REASON_TCP_ZEROWINDOW, /* TCP receive window size is zero, + * see LINUX_MIB_TCPZEROWINDOWDROP + */ + SKB_DROP_REASON_TCP_OLD_DATA, /* the TCP data reveived is already + * received before (spurious retrans + * may happened), see + * LINUX_MIB_DELAYEDACKLOST + */ + SKB_DROP_REASON_TCP_OVERWINDOW, /* the TCP data is out of window, + * the seq of the first byte exceed + * the right edges of receive + * window + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index d332e7313a61..cc1c8f7eaf72 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -33,6 +33,9 @@ EM(SKB_DROP_REASON_TCP_MD5FAILURE, TCP_MD5FAILURE) \ EM(SKB_DROP_REASON_SOCKET_BACKLOG, SOCKET_BACKLOG) \ EM(SKB_DROP_REASON_TCP_FLAGS, TCP_FLAGS) \ + EM(SKB_DROP_REASON_TCP_ZEROWINDOW, TCP_ZEROWINDOW) \ + EM(SKB_DROP_REASON_TCP_OLD_DATA, TCP_OLD_DATA) \ + EM(SKB_DROP_REASON_TCP_OVERWINDOW, TCP_OVERWINDOW) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2d8495a62bc4..041c778fd0b5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4988,6 +4988,7 @@ void tcp_data_ready(struct sock *sk) static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + enum skb_drop_reason reason; bool fragstolen; int eaten; @@ -5006,6 +5007,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) skb_dst_drop(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); + reason = SKB_DROP_REASON_NOT_SPECIFIED; tp->rx_opt.dsack = 0; /* Queue data for delivery to the user. @@ -5014,6 +5016,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { if (tcp_receive_window(tp) == 0) { + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } @@ -5023,6 +5026,7 @@ queue_and_out: if (skb_queue_len(&sk->sk_receive_queue) == 0) sk_forced_mem_schedule(sk, skb->truesize); else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { + reason = SKB_DROP_REASON_PROTO_MEM; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); sk->sk_data_ready(sk); goto drop; @@ -5059,6 +5063,7 @@ queue_and_out: if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { tcp_rcv_spurious_retrans(sk, skb); /* A retransmit, 2nd most common case. Force an immediate ack. */ + reason = SKB_DROP_REASON_TCP_OLD_DATA; NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -5066,13 +5071,16 @@ out_of_window: tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); inet_csk_schedule_ack(sk); drop: - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, reason); return; } /* Out of window. F.e. zero window probe. 
*/ - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) + if (!before(TCP_SKB_CB(skb)->seq, + tp->rcv_nxt + tcp_receive_window(tp))) { + reason = SKB_DROP_REASON_TCP_OVERWINDOW; goto out_of_window; + } if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ @@ -5082,6 +5090,7 @@ drop: * remembering D-SACK for its head made in previous line. */ if (!tcp_receive_window(tp)) { + reason = SKB_DROP_REASON_TCP_ZEROWINDOW; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); goto out_of_window; } -- cgit v1.2.3 From d25e481be0c519d1a458b14191dc8c2a8bb3e24a Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:37 +0800 Subject: net: tcp: use tcp_drop_reason() for tcp_data_queue_ofo() Replace tcp_drop() used in tcp_data_queue_ofo with tcp_drop_reason(). Following drop reasons are introduced: SKB_DROP_REASON_TCP_OFOMERGE Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ include/trace/events/skb.h | 1 + net/ipv4/tcp_input.c | 10 ++++++---- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 554ef2c848ee..a3e90efe6586 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -376,6 +376,10 @@ enum skb_drop_reason { * the right edges of receive * window */ + SKB_DROP_REASON_TCP_OFOMERGE, /* the data of skb is already in + * the ofo queue, corresponding to + * LINUX_MIB_TCPOFOMERGE + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index cc1c8f7eaf72..2ab7193313aa 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -36,6 +36,7 @@ EM(SKB_DROP_REASON_TCP_ZEROWINDOW, TCP_ZEROWINDOW) \ EM(SKB_DROP_REASON_TCP_OLD_DATA, TCP_OLD_DATA) \ EM(SKB_DROP_REASON_TCP_OVERWINDOW, TCP_OVERWINDOW) \ + EM(SKB_DROP_REASON_TCP_OFOMERGE, TCP_OFOMERGE) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 041c778fd0b5..2088f93fa37b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4779,7 +4779,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); sk->sk_data_ready(sk); - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM); return; } @@ -4842,7 +4842,8 @@ coalesce_done: /* All the bits are present. Drop. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb); + tcp_drop_reason(sk, skb, + SKB_DROP_REASON_TCP_OFOMERGE); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4861,7 +4862,8 @@ coalesce_done: TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb1); + tcp_drop_reason(sk, skb1, + SKB_DROP_REASON_TCP_OFOMERGE); goto merge_right; } } else if (tcp_ooo_try_coalesce(sk, skb1, @@ -4889,7 +4891,7 @@ merge_right: tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - tcp_drop(sk, skb1); + tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE); } /* If there is no skb after us, we are the last_skb ! 
*/ if (!skb1) -- cgit v1.2.3 From 763087dab97547230a6807c865a6a5ae53a59247 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Feb 2022 19:21:12 -0800 Subject: net: add skb_set_end_offset() helper We have multiple places where this helper is convenient, and we plan to use it in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 10 ++++++++++ net/core/skbuff.c | 19 +++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a3e90efe6586..115be7f73487 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1536,6 +1536,11 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } + +static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) +{ + skb->end = offset; +} #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { @@ -1546,6 +1551,11 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } + +static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) +{ + skb->end = skb->head + offset; +} #endif /* Internal */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9d0388bed0c1..27a2296241c9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -201,7 +201,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; + skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; @@ -1736,11 +1736,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->head = data; skb->head_frag = 0; skb->data += off; + + skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; off = nhead; -#else - skb->end = skb->head + size; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); @@ -6044,11 +6043,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb->head = data; skb->data = data; skb->head_frag = 0; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_set_tail_pointer(skb, skb_headlen(skb)); skb_headers_offset_update(skb, 0); skb->cloned = 0; @@ -6186,11 +6181,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb->head = data; skb->head_frag = 0; skb->data = data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_reset_tail_pointer(skb); skb_headers_offset_update(skb, 0); skb->cloned = 0; -- cgit v1.2.3 From 2b88cba55883eaafbc9b7cbff0b2c7cdba71ed01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Feb 2022 19:21:13 -0800 Subject: net: preserve skb_end_offset() in skb_unclone_keeptruesize() syzbot found another way to trigger the infamous WARN_ON_ONCE(delta < len) in skb_try_coalesce() [1] I was able to root cause the issue to kfence.
When kfence is in action, the following assertion is no longer true: int size = xxxx; void *ptr1 = kmalloc(size, gfp); void *ptr2 = kmalloc(size, gfp); if (ptr1 && ptr2) ASSERT(ksize(ptr1) == ksize(ptr2)); We attempted to fix these issues in the blamed commits, but forgot that TCP was possibly shifting data after skb_unclone_keeptruesize() has been used, notably from tcp_retrans_try_collapse(). So not only do we need to keep the same skb->truesize value, we also need to make sure TCP won't fill new tailroom that pskb_expand_head() was able to get from an addr = kmalloc(...) followed by ksize(addr). Split skb_unclone_keeptruesize() into two parts: 1) Inline skb_unclone_keeptruesize() for the common case, when skb is not cloned. 2) Out of line __skb_unclone_keeptruesize() for the 'slow path'. WARNING: CPU: 1 PID: 6490 at net/core/skbuff.c:5295 skb_try_coalesce+0x1235/0x1560 net/core/skbuff.c:5295 Modules linked in: CPU: 1 PID: 6490 Comm: syz-executor161 Not tainted 5.17.0-rc4-syzkaller-00229-g4f12b742eb2b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_try_coalesce+0x1235/0x1560 net/core/skbuff.c:5295 Code: bf 01 00 00 00 0f b7 c0 89 c6 89 44 24 20 e8 62 24 4e fa 8b 44 24 20 83 e8 01 0f 85 e5 f0 ff ff e9 87 f4 ff ff e8 cb 20 4e fa <0f> 0b e9 06 f9 ff ff e8 af b2 95 fa e9 69 f0 ff ff e8 95 b2 95 fa RSP: 0018:ffffc900063af268 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 00000000ffffffd5 RCX: 0000000000000000 RDX: ffff88806fc05700 RSI: ffffffff872abd55 RDI: 0000000000000003 RBP: ffff88806e675500 R08: 00000000ffffffd5 R09: 0000000000000000 R10: ffffffff872ab659 R11: 0000000000000000 R12: ffff88806dd554e8 R13: ffff88806dd9bac0 R14: ffff88806dd9a2c0 R15: 0000000000000155 FS: 00007f18014f9700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020002000 CR3: 000000006be7a000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_try_coalesce net/ipv4/tcp_input.c:4651 [inline] tcp_try_coalesce+0x393/0x920 net/ipv4/tcp_input.c:4630 tcp_queue_rcv+0x8a/0x6e0 net/ipv4/tcp_input.c:4914 tcp_data_queue+0x11fd/0x4bb0 net/ipv4/tcp_input.c:5025 tcp_rcv_established+0x81e/0x1ff0 net/ipv4/tcp_input.c:5947 tcp_v4_do_rcv+0x65e/0x980 net/ipv4/tcp_ipv4.c:1719 sk_backlog_rcv include/net/sock.h:1037 [inline] __release_sock+0x134/0x3b0 net/core/sock.c:2779 release_sock+0x54/0x1b0 net/core/sock.c:3311 sk_wait_data+0x177/0x450 net/core/sock.c:2821 tcp_recvmsg_locked+0xe28/0x1fd0 net/ipv4/tcp.c:2457 tcp_recvmsg+0x137/0x610 net/ipv4/tcp.c:2572 inet_recvmsg+0x11b/0x5e0 net/ipv4/af_inet.c:850 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_recvmsg net/socket.c:962 [inline] ____sys_recvmsg+0x2c4/0x600 net/socket.c:2632 ___sys_recvmsg+0x127/0x200 net/socket.c:2674 __sys_recvmsg+0xe2/0x1a0 net/socket.c:2704 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: c4777efa751d ("net: add and use skb_unclone_keeptruesize() helper") Fixes: 097b9146c0e2 ("net: fix up truesize of cloned skb in skb_prepare_for_shift()") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Marco Elver Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 18 +++++++++--------- net/core/skbuff.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 9 deletions(-) (limited to
'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 115be7f73487..31be38078918 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1795,19 +1795,19 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) return 0; } -/* This variant of skb_unclone() makes sure skb->truesize is not changed */ +/* This variant of skb_unclone() makes sure skb->truesize + * and skb_end_offset() are not changed, whenever a new skb->head is needed. + * + * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) + * when various debugging features are in place. + */ +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); - if (skb_cloned(skb)) { - unsigned int save = skb->truesize; - int res; - - res = pskb_expand_head(skb, 0, 0, pri); - skb->truesize = save; - return res; - } + if (skb_cloned(skb)) + return __skb_unclone_keeptruesize(skb, pri); return 0; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 27a2296241c9..725f2b356769 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1787,6 +1787,38 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } EXPORT_SYMBOL(skb_realloc_headroom); +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + unsigned int saved_end_offset, saved_truesize; + struct skb_shared_info *shinfo; + int res; + + saved_end_offset = skb_end_offset(skb); + saved_truesize = skb->truesize; + + res = pskb_expand_head(skb, 0, 0, pri); + if (res) + return res; + + skb->truesize = saved_truesize; + + if (likely(skb_end_offset(skb) == saved_end_offset)) + return 0; + + shinfo = skb_shinfo(skb); + + /* We are about to change back skb->end, + * we need to move skb_shinfo() to its new location. + */ + memmove(skb->head + saved_end_offset, + shinfo, + offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); + + skb_set_end_offset(skb, saved_end_offset); + + return 0; +} + /** * skb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate -- cgit v1.2.3 From b26ef81c46ed15d11ddddba9ba1cd52c749385ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Feb 2022 14:04:50 -0800 Subject: drop_monitor: remove quadratic behavior drop_monitor is using a unique list on which all netdevices in the host have an element, regardless of their netns. This scales poorly, not only at device unregister time (which is what I caught during my netns dismantle stress tests), but also at packet processing time whenever trace_napi_poll_hit() is called. If the intent was to avoid adding one pointer in 'struct net_device' then surely we prefer O(1) behavior. Signed-off-by: Eric Dumazet Cc: Neil Horman Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 4 ++- net/core/drop_monitor.c | 79 ++++++++++++++--------------------------------- 2 files changed, 27 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93fc680b658f..c79ee2296296 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2236,7 +2236,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MRP) struct mrp_port __rcu *mrp_port; #endif - +#if IS_ENABLED(CONFIG_NET_DROP_MONITOR) + struct dm_hw_stat_delta __rcu *dm_private; +#endif struct device dev; const struct attribute_group *sysfs_groups[4]; const struct attribute_group *sysfs_rx_queue_group; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index d7d177f75d43..b89e3e95bffc 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -64,7 +64,6 @@ static const char * const drop_reasons[] = { /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. - * It also guards the global 'hw_stats_list' list. */ static DEFINE_MUTEX(net_dm_mutex); @@ -100,11 +99,9 @@ struct per_cpu_dm_data { }; struct dm_hw_stat_delta { - struct net_device *dev; unsigned long last_rx; - struct list_head list; - struct rcu_head rcu; unsigned long last_drop_val; + struct rcu_head rcu; }; static struct genl_family net_drop_monitor_family; @@ -115,7 +112,6 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; -static LIST_HEAD(hw_stats_list); static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; @@ -287,33 +283,27 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { - struct dm_hw_stat_delta *new_stat; - + struct net_device *dev = napi->dev; + struct dm_hw_stat_delta *stat; /* * Don't check napi structures with no associated device */ - if (!napi->dev) + if (!dev) return; rcu_read_lock(); - list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { - struct net_device *dev; - + stat = rcu_dereference(dev->dm_private); + if (stat) { /* * only add a note to our monitor buffer if: - * 1) this is the dev we received on - * 2) its after the last_rx delta - * 3) our rx_dropped count has gone up + * 1) its after the last_rx delta + * 2) our rx_dropped count has gone up */ - /* Paired with WRITE_ONCE() in dropmon_net_event() */ - dev = READ_ONCE(new_stat->dev); - if ((dev == napi->dev) && - (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) && - (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && + (dev->stats.rx_dropped != stat->last_drop_val)) { trace_drop_common(NULL, NULL); - new_stat->last_drop_val = napi->dev->stats.rx_dropped; - new_stat->last_rx = jiffies; - break; + stat->last_drop_val = dev->stats.rx_dropped; + stat->last_rx = jiffies; } } rcu_read_unlock(); @@ -1198,7 +1188,6 @@ err_module_put: static void net_dm_trace_off_set(void) { - struct dm_hw_stat_delta *new_stat, *temp; const struct net_dm_alert_ops *ops; int cpu; @@ -1222,13 +1211,6 @@ static void net_dm_trace_off_set(void) consume_skb(skb); } - list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { - if (new_stat->dev == NULL) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - } - } - module_put(THIS_MODULE); } @@ -1589,41 +1571,28 @@ 
static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dm_hw_stat_delta *new_stat = NULL; - struct dm_hw_stat_delta *tmp; + struct dm_hw_stat_delta *stat; switch (event) { case NETDEV_REGISTER: - new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) + break; + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (!stat) + break; - if (!new_stat) - goto out; + stat->last_rx = jiffies; + rcu_assign_pointer(dev->dm_private, stat); - new_stat->dev = dev; - new_stat->last_rx = jiffies; - mutex_lock(&net_dm_mutex); - list_add_rcu(&new_stat->list, &hw_stats_list); - mutex_unlock(&net_dm_mutex); break; case NETDEV_UNREGISTER: - mutex_lock(&net_dm_mutex); - list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { - if (new_stat->dev == dev) { - - /* Paired with READ_ONCE() in trace_napi_poll_hit() */ - WRITE_ONCE(new_stat->dev, NULL); - - if (trace_state == TRACE_OFF) { - list_del_rcu(&new_stat->list); - kfree_rcu(new_stat, rcu); - break; - } - } + stat = rtnl_dereference(dev->dm_private); + if (stat) { + rcu_assign_pointer(dev->dm_private, NULL); + kfree_rcu(stat, rcu); } - mutex_unlock(&net_dm_mutex); break; } -out: return NOTIFY_DONE; } -- cgit v1.2.3 From a21d9a670d81103db7f788de1a4a4a6e4b891a0b Mon Sep 17 00:00:00 2001 From: Hans Schultz Date: Wed, 23 Feb 2022 11:16:46 +0100 Subject: net: bridge: Add support for bridge port in locked mode In an 802.1X scenario, clients connected to a bridge port shall not be allowed to have traffic forwarded until fully authenticated. A static fdb entry with the client's MAC address for the bridge port unlocks the client and allows bidirectional communication. This scenario is facilitated by setting the bridge port in locked mode, which is also supported by various switchcore chipsets. Signed-off-by: Hans Schultz Acked-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Signed-off-by: David S.
Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/br_input.c | 11 ++++++++++- net/bridge/br_netlink.c | 6 +++++- 4 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 509e18c7e740..3aae023a9353 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -58,6 +58,7 @@ struct br_ip_list { #define BR_MRP_LOST_CONT BIT(18) #define BR_MRP_LOST_IN_CONT BIT(19) #define BR_TX_FWD_OFFLOAD BIT(20) +#define BR_PORT_LOCKED BIT(21) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e1ba2d51b717..be09d2ad4b5d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -537,6 +537,7 @@ enum { IFLA_BRPORT_MRP_IN_OPEN, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + IFLA_BRPORT_LOCKED, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index b50382f957c1..e0c13fcc50ed 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -81,6 +81,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!p || p->state == BR_STATE_DISABLED) goto drop; + br = p->br; brmctx = &p->br->multicast_ctx; pmctx = &p->multicast_ctx; state = p->state; @@ -88,10 +89,18 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb &state, &vlan)) goto out; + if (p->flags & BR_PORT_LOCKED) { + struct net_bridge_fdb_entry *fdb_src = + br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); + + if (!fdb_src || READ_ONCE(fdb_src->dst) != p || + test_bit(BR_FDB_LOCAL, &fdb_src->flags)) + goto drop; + } + nbp_switchdev_frame_mark(p, skb); /* insert into forwarding database after filtering to avoid spoofing */ - br = p->br; if (p->flags & BR_LEARNING) br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 2ff83d84230d..7d4432ca9a20 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -184,6 +184,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -269,7 +270,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || - nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) + nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || + nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -827,6 +829,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 }, [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, + [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, }; @@ -893,6 +896,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], 
br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL); br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); + br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); changed_mask = old_flags ^ p->flags; -- cgit v1.2.3 From 43c075959de3c45608636d9d80ff9e61d166fb21 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 10:57:07 -0800 Subject: mlx5: remove unused static inlines mlx5 has some unused static inline helpers in include/; while at it, also clean up static inlines in the driver itself. Signed-off-by: Jakub Kicinski Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h | 9 --------- drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h | 7 ------- include/linux/mlx5/driver.h | 10 ---------- 3 files changed, 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index d964665eaa63..62cde3e87c2e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -139,15 +139,6 @@ static inline bool mlx5e_accel_tx_begin(struct net_device *dev, return true; } -static inline bool mlx5e_accel_tx_is_ipsec_flow(struct mlx5e_accel_tx_state *state) -{ -#ifdef CONFIG_MLX5_EN_IPSEC - return mlx5e_ipsec_is_tx_flow(&state->ipsec); -#else - return false; -#endif -} - static inline unsigned int mlx5e_accel_tx_ids_len(struct mlx5e_txqsq *sq, struct mlx5e_accel_tx_state *state) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h index 4bad6a5fde56..f240ffe5116c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/hv_vhca.h @@ -92,13 +92,6 @@ mlx5_hv_vhca_agent_create(struct mlx5_hv_vhca *hv_vhca, static inline void mlx5_hv_vhca_agent_destroy(struct mlx5_hv_vhca_agent *agent) { } - -static inline int -mlx5_hv_vhca_write_agent(struct mlx5_hv_vhca_agent *agent, - void *buf, int len) -{ - return 0; -} #endif #endif /* __LIB_HV_VHCA_H__ */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 78655d8d13a7..1b398c9e17b9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -863,20 +863,10 @@ struct mlx5_hca_vport_context { bool grh_required; }; -static inline void *mlx5_buf_offset(struct mlx5_frag_buf *buf, int offset) -{ - return buf->frags->buf + offset; -} - #define STRUCT_FIELD(header, field) \ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ .struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field -static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev) -{ - return pci_get_drvdata(pdev); -} - extern struct dentry *mlx5_debugfs_root; static inline u16 fw_rev_maj(struct mlx5_core_dev *dev) -- cgit v1.2.3 From c2c922dae77f36e24d246c6e310cee0c61afc6fb Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 29 Nov 2021 16:24:28 +0200 Subject: net/mlx5: Add ability to insert to specific flow group If the flow table isn't an autogroup the upper driver has to create the flow groups explicitly. Until now, this information could not be used when creating rules in order to insert them into a specific flow group. Allow such a use case.
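As a rough sketch of the enabled use case (kernel context assumed; the wrapper name and the omitted error handling are illustrative, not part of the patch), a caller that created its flow groups explicitly can now pin a rule into one of them through the new flow_act.fg field:

static struct mlx5_flow_handle *
add_rule_in_group(struct mlx5_flow_table *ft, struct mlx5_flow_group *fg,
		  struct mlx5_flow_spec *spec,
		  struct mlx5_flow_destination *dest)
{
	/* ft must have been created without autogrouping, and fg must be
	 * a group of ft previously returned by mlx5_create_flow_group().
	 */
	struct mlx5_flow_act flow_act = {
		.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
		.fg = fg,	/* new field: insert only into this group */
	};

	return mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
}

Per the hunks below, _mlx5_add_flow_rules() rejects flow_act->fg with -EINVAL on autogroup tables, and build_match_list() skips every flow group other than the requested one.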
Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 9 ++++++++- include/linux/mlx5/fs.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index b628917e38e4..ebb7960ec62b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1696,6 +1696,7 @@ static void free_match_list(struct match_list *head, bool ft_locked) static int build_match_list(struct match_list *match_head, struct mlx5_flow_table *ft, const struct mlx5_flow_spec *spec, + struct mlx5_flow_group *fg, bool ft_locked) { struct rhlist_head *tmp, *list; @@ -1710,6 +1711,9 @@ static int build_match_list(struct match_list *match_head, rhl_for_each_entry_rcu(g, tmp, list, hash) { struct match_list *curr_match; + if (fg && fg != g) + continue; + if (unlikely(!tree_get_node(&g->node))) continue; @@ -1889,6 +1893,9 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft, if (!check_valid_spec(spec)) return ERR_PTR(-EINVAL); + if (flow_act->fg && ft->autogroup.active) + return ERR_PTR(-EINVAL); + for (i = 0; i < dest_num; i++) { if (!dest_is_valid(&dest[i], flow_act, ft)) return ERR_PTR(-EINVAL); @@ -1898,7 +1905,7 @@ search_again_locked: version = atomic_read(&ft->node.version); /* Collect all fgs which has a matching match_criteria */ - err = build_match_list(&match_head, ft, spec, take_write); + err = build_match_list(&match_head, ft, spec, flow_act->fg, take_write); if (err) { if (take_write) up_write_ref_node(&ft->node, false); diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index b1aad14689e3..e3bfed68b08a 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -224,6 +224,7 @@ struct mlx5_flow_act { u32 flags; struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; struct ib_counters *counters; + struct mlx5_flow_group *fg; }; #define MLX5_DECLARE_FLOW_ACT(name) \ -- cgit v1.2.3 From 605bef0015b163867127202b821dce79804d603d Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 5 Apr 2020 17:50:25 -0700 Subject: net/mlx5: cmdif, cmd_check refactoring Do not mangle the command outbox in the internal low level cmd_exec and cmd_invoke functions. Instead return a proper unique error code and move the driver error checking to be at a higher level in mlx5_cmd_exec(). 
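The resulting call pattern is roughly the following sketch (the wrapper name is ours; cmd_exec() is assumed to be the driver's existing internal helper, and mlx5_cmd_check() is the reworked checker shown in the diff below):

static int mlx5_cmd_exec_sketch(struct mlx5_core_dev *dev, void *in,
				int in_size, void *out, int out_size)
{
	/* Low level path: returns a plain errno (e.g. -ENXIO when the
	 * command was aborted) and no longer rewrites the outbox itself.
	 */
	int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false);

	/* Single top-level check: emulate an outbox status for -ENXIO,
	 * pass through other delivery errors, otherwise convert the FW
	 * status in the outbox to an errno (logging command, status and
	 * syndrome on failure).
	 */
	return mlx5_cmd_check(dev, err, in, out);
}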
Signed-off-by: Saeed Mahameed Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 173 +++++++++++++------------ drivers/net/ethernet/mellanox/mlx5/core/main.c | 5 +- include/linux/mlx5/driver.h | 1 - 3 files changed, 95 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 3c6a533ee0c9..7ff01b901f53 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -760,43 +760,61 @@ struct mlx5_ifc_mbox_in_bits { u8 reserved_at_40[0x40]; }; -void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome) -{ - *status = MLX5_GET(mbox_out, out, status); - *syndrome = MLX5_GET(mbox_out, out, syndrome); -} - -static int mlx5_cmd_check(struct mlx5_core_dev *dev, void *in, void *out) +static void cmd_status_print(struct mlx5_core_dev *dev, void *in, void *out) { + u16 opcode, op_mod; u32 syndrome; u8 status; - u16 opcode; - u16 op_mod; u16 uid; + int err; - mlx5_cmd_mbox_status(out, &status, &syndrome); - if (!status) - return 0; + syndrome = MLX5_GET(mbox_out, out, syndrome); + status = MLX5_GET(mbox_out, out, status); opcode = MLX5_GET(mbox_in, in, opcode); op_mod = MLX5_GET(mbox_in, in, op_mod); uid = MLX5_GET(mbox_in, in, uid); + err = cmd_status_to_err(status); + if (!uid && opcode != MLX5_CMD_OP_DESTROY_MKEY) mlx5_core_err_rl(dev, - "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n", + "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x), err(%d)\n", mlx5_command_str(opcode), opcode, op_mod, - cmd_status_str(status), status, syndrome); + cmd_status_str(status), status, syndrome, err); else mlx5_core_dbg(dev, - "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x)\n", - mlx5_command_str(opcode), - opcode, op_mod, - cmd_status_str(status), - status, - syndrome); + "%s(0x%x) op_mod(0x%x) uid(%d) failed, status %s(0x%x), syndrome (0x%x), err(%d)\n", + mlx5_command_str(opcode), opcode, op_mod, uid, + cmd_status_str(status), status, syndrome, err); +} + +static int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out) +{ + /* aborted due to PCI error or via reset flow mlx5_cmd_trigger_completions() */ + if (err == -ENXIO) { + u16 opcode = MLX5_GET(mbox_in, in, opcode); + u32 syndrome; + u8 status; + + /* PCI Error, emulate command return status, for smooth reset */ + err = mlx5_internal_err_ret_value(dev, opcode, &syndrome, &status); + MLX5_SET(mbox_out, out, status, status); + MLX5_SET(mbox_out, out, syndrome, syndrome); + if (!err) + return 0; + } + + /* driver or FW delivery error */ + if (err) + return err; - return cmd_status_to_err(status); + /* check outbox status */ + err = cmd_status_to_err(MLX5_GET(mbox_out, out, status)); + if (err) + cmd_status_print(dev, in, out); + + return err; } static void dump_command(struct mlx5_core_dev *dev, @@ -980,13 +998,7 @@ static void cmd_work_handler(struct work_struct *work) /* Skip sending command to fw if internal error */ if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, ent->op)) { - u8 status = 0; - u32 drv_synd; - - ent->ret = mlx5_internal_err_ret_value(dev, msg_to_opcode(ent->in), &drv_synd, &status); - MLX5_SET(mbox_out, ent->out, status, status); - MLX5_SET(mbox_out, ent->out, syndrome, drv_synd); - + ent->ret = -ENXIO; mlx5_cmd_comp_handler(dev, 1ULL << ent->idx, true); return; } @@ -1005,6 +1017,31 @@ static void cmd_work_handler(struct work_struct *work) } } 
+static int deliv_status_to_err(u8 status) +{ + switch (status) { + case MLX5_CMD_DELIVERY_STAT_OK: + case MLX5_DRIVER_STATUS_ABORTED: + return 0; + case MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR: + case MLX5_CMD_DELIVERY_STAT_TOK_ERR: + return -EBADR; + case MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR: + case MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR: + case MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR: + return -EFAULT; /* Bad address */ + case MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR: + case MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR: + case MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR: + case MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR: + return -ENOMSG; + case MLX5_CMD_DELIVERY_STAT_FW_ERR: + return -EIO; + default: + return -EINVAL; + } +} + static const char *deliv_status_to_str(u8 status) { switch (status) { @@ -1622,15 +1659,15 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force ent->ts2 = ktime_get_ns(); memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out)); dump_command(dev, ent, 0); - if (!ent->ret) { + + if (vec & MLX5_TRIGGERED_CMD_COMP) + ent->ret = -ENXIO; + + if (!ent->ret) { /* Command completed by FW */ if (!cmd->checksum_disabled) ent->ret = verify_signature(ent); - else - ent->ret = 0; - if (vec & MLX5_TRIGGERED_CMD_COMP) - ent->status = MLX5_DRIVER_STATUS_ABORTED; - else - ent->status = ent->lay->status_own >> 1; + + ent->status = ent->lay->status_own >> 1; mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n", ent->ret, deliv_status_to_str(ent->status), ent->status); @@ -1649,14 +1686,13 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force callback = ent->callback; context = ent->context; err = ent->ret; - if (!err) { + if (!err && !ent->status) { err = mlx5_copy_from_msg(ent->uout, ent->out, ent->uout_size); - err = err ? err : mlx5_cmd_check(dev, - ent->in->first.data, - ent->uout); + err = mlx5_cmd_check(dev, err, ent->in->first.data, + ent->uout); } mlx5_free_cmd_msg(dev, ent->out); @@ -1729,31 +1765,6 @@ void mlx5_cmd_flush(struct mlx5_core_dev *dev) up(&cmd->sem); } -static int deliv_status_to_err(u8 status) -{ - switch (status) { - case MLX5_CMD_DELIVERY_STAT_OK: - case MLX5_DRIVER_STATUS_ABORTED: - return 0; - case MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR: - case MLX5_CMD_DELIVERY_STAT_TOK_ERR: - return -EBADR; - case MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR: - case MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR: - case MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR: - return -EFAULT; /* Bad address */ - case MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR: - case MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR: - case MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR: - case MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR: - return -ENOMSG; - case MLX5_CMD_DELIVERY_STAT_FW_ERR: - return -EIO; - default: - return -EINVAL; - } -} - static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size, gfp_t gfp) { @@ -1812,15 +1823,8 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, u8 token; int err; - if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, opcode)) { - u32 drv_synd; - u8 status; - - err = mlx5_internal_err_ret_value(dev, opcode, &drv_synd, &status); - MLX5_SET(mbox_out, out, status, status); - MLX5_SET(mbox_out, out, syndrome, drv_synd); - return err; - } + if (mlx5_cmd_is_down(dev) || !opcode_allowed(&dev->cmd, opcode)) + return -ENXIO; pages_queue = is_manage_pages(in); gfp = callback ? 
GFP_ATOMIC : GFP_KERNEL; @@ -1865,13 +1869,24 @@ out_in: return err; } +/** + * mlx5_cmd_exec - Executes a fw command, wait for completion + * + * @dev: mlx5 core device + * @in: inbox mlx5_ifc command buffer + * @in_size: inbox buffer size + * @out: outbox mlx5_ifc buffer + * @out_size: outbox size + * + * @return: 0 if no error, FW command execution was successful, + * and outbox status is ok. + */ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size) { - int err; + int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false); - err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false); - return err ? : mlx5_cmd_check(dev, in, out); + return mlx5_cmd_check(dev, err, in, out); } EXPORT_SYMBOL(mlx5_cmd_exec); @@ -1932,11 +1947,9 @@ EXPORT_SYMBOL(mlx5_cmd_exec_cb); int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size) { - int err; - - err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true); + int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true); - return err ? : mlx5_cmd_check(dev, in, out); + return mlx5_cmd_check(dev, err, in, out); } EXPORT_SYMBOL(mlx5_cmd_exec_polling); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 2c774f367199..cea1a8ac196e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -736,10 +736,9 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev) MLX5_SET(query_issi_in, query_in, opcode, MLX5_CMD_OP_QUERY_ISSI); err = mlx5_cmd_exec_inout(dev, query_issi, query_in, query_out); if (err) { - u32 syndrome; - u8 status; + u32 syndrome = MLX5_GET(query_issi_out, query_out, syndrome); + u8 status = MLX5_GET(query_issi_out, query_out, status); - mlx5_cmd_mbox_status(query_out, &status, &syndrome); if (!status || syndrome == MLX5_DRIVER_SYND) { mlx5_core_err(dev, "Failed to query ISSI err(%d) status(%d) synd(%d)\n", err, status, syndrome); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1b398c9e17b9..8a8408708e6c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -981,7 +981,6 @@ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); -void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); -- cgit v1.2.3 From f23519e542e51c19ab3081deb089bb3f8fec7bb9 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 17 Aug 2019 03:05:10 -0700 Subject: net/mlx5: cmdif, Add new api for command execution Add mlx5_cmd_do. Unlike mlx5_cmd_exec, this function will not modify or translate outbox.status. The function will return: return = 0: Command was executed, outbox.status == MLX5_CMD_STAT_OK. return = -EREMOTEIO: Executed, outbox.status != MLX5_CMD_STAT_OK. return < 0: Command execution couldn't be performed by FW or driver. And document other mlx5_cmd_exec functions. 
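An illustrative consumer of the new contract (the function name is hypothetical; the mbox_out parsing uses the existing mlx5_ifc layout):

static int foo_exec_raw(struct mlx5_core_dev *dev, void *in, int in_size,
			void *out, int out_size)
{
	int err = mlx5_cmd_do(dev, in, in_size, out, out_size);

	if (err == -EREMOTEIO) {
		/* FW executed the command and failed it: the outbox is
		 * valid, so status/syndrome can be parsed or forwarded.
		 */
		u8 status = MLX5_GET(mbox_out, out, status);
		u32 syndrome = MLX5_GET(mbox_out, out, syndrome);

		pr_debug("FW status 0x%x syndrome 0x%x\n", status, syndrome);
		return err;
	}
	if (err)	/* driver/FW delivery error, outbox not valid */
		return err;

	return 0;	/* outbox.status == MLX5_CMD_STAT_OK */
}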
Signed-off-by: Saeed Mahameed Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 79 ++++++++++++++++++++++----- include/linux/mlx5/driver.h | 2 + 2 files changed, 68 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 7ff01b901f53..a2f87a686a18 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -789,7 +789,7 @@ static void cmd_status_print(struct mlx5_core_dev *dev, void *in, void *out) cmd_status_str(status), status, syndrome, err); } -static int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out) +int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out) { /* aborted due to PCI error or via reset flow mlx5_cmd_trigger_completions() */ if (err == -ENXIO) { @@ -806,7 +806,7 @@ static int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *ou } /* driver or FW delivery error */ - if (err) + if (err != -EREMOTEIO && err) return err; /* check outbox status */ @@ -816,6 +816,7 @@ static int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *ou return err; } +EXPORT_SYMBOL(mlx5_cmd_check); static void dump_command(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent, int input) @@ -1869,6 +1870,38 @@ out_in: return err; } +/** + * mlx5_cmd_do - Executes a fw command, wait for completion. + * Unlike mlx5_cmd_exec, this function will not translate or intercept + * outbox.status and will return -EREMOTEIO when + * outbox.status != MLX5_CMD_STAT_OK + * + * @dev: mlx5 core device + * @in: inbox mlx5_ifc command buffer + * @in_size: inbox buffer size + * @out: outbox mlx5_ifc buffer + * @out_size: outbox size + * + * @return: + * -EREMOTEIO : Command executed by FW, outbox.status != MLX5_CMD_STAT_OK. + * Caller must check FW outbox status. + * 0 : Command execution successful, outbox.status == MLX5_CMD_STAT_OK. + * < 0 : Command execution couldn't be performed by firmware or driver + */ +int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size) +{ + int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false); + + if (err) /* -EREMOTEIO is preserved */ + return err == -EREMOTEIO ? -EIO : err; + + if (MLX5_GET(mbox_out, out, status) != MLX5_CMD_STAT_OK) + return -EREMOTEIO; + + return 0; +} +EXPORT_SYMBOL(mlx5_cmd_do); + /** * mlx5_cmd_exec - Executes a fw command, wait for completion * @@ -1878,18 +1911,47 @@ out_in: * @out: outbox mlx5_ifc buffer * @out_size: outbox size * - * @return: 0 if no error, FW command execution was successful, + * @return: 0 if no error, FW command execution was successful * and outbox status is ok. 
*/ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size) { - int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false); + int err = mlx5_cmd_do(dev, in, in_size, out, out_size); return mlx5_cmd_check(dev, err, in, out); } EXPORT_SYMBOL(mlx5_cmd_exec); +/** + * mlx5_cmd_exec_polling - Executes a fw command, poll for completion + * Needed for driver force teardown, when command completion EQ + * will not be available to complete the command + * + * @dev: mlx5 core device + * @in: inbox mlx5_ifc command buffer + * @in_size: inbox buffer size + * @out: outbox mlx5_ifc buffer + * @out_size: outbox size + * + * @return: 0 if no error, FW command execution was successful + * and outbox status is ok. + */ +int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size) +{ + int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true); + + if (err) /* -EREMOTEIO is preserved */ + return err == -EREMOTEIO ? -EIO : err; + + if (MLX5_GET(mbox_out, out, status) != MLX5_CMD_STAT_OK) + err = -EREMOTEIO; + + return mlx5_cmd_check(dev, err, in, out); +} +EXPORT_SYMBOL(mlx5_cmd_exec_polling); + void mlx5_cmd_init_async_ctx(struct mlx5_core_dev *dev, struct mlx5_async_ctx *ctx) { @@ -1944,15 +2006,6 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, } EXPORT_SYMBOL(mlx5_cmd_exec_cb); -int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, - void *out, int out_size) -{ - int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true); - - return mlx5_cmd_check(dev, err, in, out); -} -EXPORT_SYMBOL(mlx5_cmd_exec_polling); - static void destroy_msg_cache(struct mlx5_core_dev *dev) { struct cmd_msg_cache *ch; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8a8408708e6c..1b9bec8fa870 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -964,6 +964,8 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, void *out, int out_size, mlx5_async_cbk_t callback, struct mlx5_async_work *work); +int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); +int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); -- cgit v1.2.3 From 31803e59233efc838b9dcb26edea28a4b2389e97 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 30 Mar 2020 21:12:58 -0700 Subject: net/mlx5: Use mlx5_cmd_do() in core create_{cq,dct} The mlx5_core_create_{cq/dct} functions are non-trivial mlx5 command functions. They check command execution status themselves and hide valuable FW failure information. For mlx5_core/eth kernel users this is what we actually want, but for a devx/rdma user the hidden information is essential and should be propagated up to the caller, thus we convert these commands to use mlx5_cmd_do to return the FW/driver and command outbox status as is, and let the caller decide what to do with it. Kernel callers of mlx5_core_create_{cq/dct}, or those who only care about the binary status (FAIL/SUCCESS), must check the status themselves via mlx5_cmd_check() to restore the current behavior:

err = mlx5_create_cq(in, out)
err = mlx5_cmd_check(err, in, out)
if (err)
	// handle err

DEVX users and those who care about full visibility will just propagate the error to user space; the app can then check that if err == -EREMOTEIO, outbox.{status,syndrome} are valid.
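A sketch of that full-visibility pattern (the wrapper name is hypothetical; mlx5_create_cq() is the raw variant introduced by this patch):

static int devx_create_cq_raw(struct mlx5_core_dev *mdev,
			      struct mlx5_core_cq *core_cq,
			      u32 *cmd_in, int in_len,
			      u32 *cmd_out, int out_len)
{
	int err = mlx5_create_cq(mdev, core_cq, cmd_in, in_len,
				 cmd_out, out_len);

	/* On 0 or -EREMOTEIO, cmd_out (including outbox.{status,syndrome})
	 * is valid and can be copied to user space as-is; any other error
	 * means the command never executed and cmd_out must be ignored.
	 */
	return err;
}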
API Note: mlx5_cmd_check() must be used by kernel users since it allows the driver to intercept the command execution status and return a driver-simulated status in case of driver-induced error handling or reset/recovery flows. Signed-off-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/devx.c | 6 +++--- drivers/infiniband/hw/mlx5/qp.c | 1 + drivers/infiniband/hw/mlx5/qpc.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 17 ++++++++++++++--- include/linux/mlx5/cq.h | 2 ++ 5 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 08b7f6bc56c3..1f62c0ede048 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1497,9 +1497,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( !is_apu_cq(dev, cmd_in)) { obj->flags |= DEVX_OBJ_FLAGS_CQ; obj->core_cq.comp = devx_cq_comp; - err = mlx5_core_create_cq(dev->mdev, &obj->core_cq, - cmd_in, cmd_in_len, cmd_out, - cmd_out_len); + err = mlx5_create_cq(dev->mdev, &obj->core_cq, + cmd_in, cmd_in_len, cmd_out, + cmd_out_len); } else { err = mlx5_cmd_exec(dev->mdev, cmd_in, cmd_in_len, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 29475cf8c7c3..b7fe47107d76 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4465,6 +4465,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in, MLX5_ST_SZ_BYTES(create_dct_in), out, sizeof(out)); + err = mlx5_cmd_check(dev->mdev, err, qp->dct.in, out); if (err) return err; resp.dctn = qp->dct.mdct.mqp.qpn; diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index 8844eacf2380..542e4c63a8de 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -220,7 +220,7 @@ int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct, init_completion(&dct->drained); MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); - err = mlx5_cmd_exec(dev->mdev, in, inlen, out, outlen); + err = mlx5_cmd_do(dev->mdev, in, inlen, out, outlen); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 5371ad0a12eb..15a74966be7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -86,8 +86,9 @@ static void mlx5_add_cq_to_tasklet(struct mlx5_core_cq *cq, spin_unlock_irqrestore(&tasklet_ctx->lock, flags); } -int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - u32 *in, int inlen, u32 *out, int outlen) +/* Callers must verify outbox status in case of err */ +int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen, u32 *out, int outlen) { int eqn = MLX5_GET(cqc, MLX5_ADDR_OF(create_cq_in, in, cq_context), c_eqn_or_apu_element); @@ -101,7 +102,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, memset(out, 0, outlen); MLX5_SET(create_cq_in, in, opcode, MLX5_CMD_OP_CREATE_CQ); - err = mlx5_cmd_exec(dev, in, inlen, out, outlen); + err = mlx5_cmd_do(dev, in, inlen, out, outlen); if (err) return err; @@ -148,6 +149,16 @@ err_cmd: mlx5_cmd_exec_in(dev, destroy_cq, din); return err; } +EXPORT_SYMBOL(mlx5_create_cq); + +/* outbox is checked and err val is normalized */ +int mlx5_core_create_cq(struct mlx5_core_dev *dev,
struct mlx5_core_cq *cq, + u32 *in, int inlen, u32 *out, int outlen) +{ + int err = mlx5_create_cq(dev, cq, in, inlen, out, outlen); + + return mlx5_cmd_check(dev, err, in, out); +} EXPORT_SYMBOL(mlx5_core_create_cq); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 7bfb67363434..cb15308b5cb0 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -183,6 +183,8 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq) complete(&cq->free); } +int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); -- cgit v1.2.3 From 0a41527608e7f3da61e76564f5a8749a1fddc7f1 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 31 Mar 2020 22:03:00 -0700 Subject: net/mlx5: cmdif, Refactor error handling and reporting of async commands Same as the new mlx5_cmd_do API, report all information to callers and let them handle the error values and outbox parsing. The user callback status "work->user_callback(status)" is now similar to the error rc code returned from the blocking mlx5_cmd_do() version, and now is defined as follows: -EREMOTEIO : Command executed by FW, outbox.status != MLX5_CMD_STAT_OK. Caller must check FW outbox status. 0 : Command execution successful, outbox.status == MLX5_CMD_STAT_OK. < 0 : Command couldn't execute, FW or driver induced error. Signed-off-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/mr.c | 15 ++++++- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 61 ++++++++++++++++----------- include/linux/mlx5/driver.h | 3 +- 3 files changed, 52 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 157d862fb864..06e4b8cea6bd 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -140,6 +140,19 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); } +static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) +{ + if (status == -ENXIO) /* core driver is not available */ + return; + + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + if (status != -EREMOTEIO) /* driver specific failure */ + return; + + /* Failed in FW, print cmd out failure details */ + mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); +} + static void create_mkey_callback(int status, struct mlx5_async_work *context) { struct mlx5_ib_mr *mr = @@ -149,7 +162,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) unsigned long flags; if (status) { - mlx5_ib_warn(dev, "async reg mr failed. 
status %d\n", status); + create_mkey_warn(dev, status, mr->out); kfree(mr); spin_lock_irqsave(&ent->lock, flags); ent->pending--; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index a2f87a686a18..823d5808d5a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -760,6 +760,18 @@ struct mlx5_ifc_mbox_in_bits { u8 reserved_at_40[0x40]; }; +void mlx5_cmd_out_err(struct mlx5_core_dev *dev, u16 opcode, u16 op_mod, void *out) +{ + u32 syndrome = MLX5_GET(mbox_out, out, syndrome); + u8 status = MLX5_GET(mbox_out, out, status); + + mlx5_core_err_rl(dev, + "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x), err(%d)\n", + mlx5_command_str(opcode), opcode, op_mod, + cmd_status_str(status), status, syndrome, cmd_status_to_err(status)); +} +EXPORT_SYMBOL(mlx5_cmd_out_err); + static void cmd_status_print(struct mlx5_core_dev *dev, void *in, void *out) { u16 opcode, op_mod; @@ -778,10 +790,7 @@ static void cmd_status_print(struct mlx5_core_dev *dev, void *in, void *out) err = cmd_status_to_err(status); if (!uid && opcode != MLX5_CMD_OP_DESTROY_MKEY) - mlx5_core_err_rl(dev, - "%s(0x%x) op_mod(0x%x) failed, status %s(0x%x), syndrome (0x%x), err(%d)\n", - mlx5_command_str(opcode), opcode, op_mod, - cmd_status_str(status), status, syndrome, err); + mlx5_cmd_out_err(dev, opcode, op_mod, out); else mlx5_core_dbg(dev, "%s(0x%x) op_mod(0x%x) uid(%d) failed, status %s(0x%x), syndrome (0x%x), err(%d)\n", @@ -1686,20 +1695,18 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force callback = ent->callback; context = ent->context; - err = ent->ret; - if (!err && !ent->status) { + err = ent->ret ? : ent->status; + if (err > 0) /* Failed in FW, command didn't execute */ + err = deliv_status_to_err(err); + + if (!err) err = mlx5_copy_from_msg(ent->uout, ent->out, ent->uout_size); - err = mlx5_cmd_check(dev, err, ent->in->first.data, - ent->uout); - } - mlx5_free_cmd_msg(dev, ent->out); free_msg(dev, ent->in); - err = err ? err : ent->status; /* final consumer is done, release ent */ cmd_ent_put(ent); callback(err, context); @@ -1870,6 +1877,18 @@ out_in: return err; } +/* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */ +static int cmd_status_err(int err, void *out) +{ + if (err) /* -EREMOTEIO is preserved */ + return err == -EREMOTEIO ? -EIO : err; + + if (MLX5_GET(mbox_out, out, status) != MLX5_CMD_STAT_OK) + return -EREMOTEIO; + + return 0; +} + /** * mlx5_cmd_do - Executes a fw command, wait for completion. * Unlike mlx5_cmd_exec, this function will not translate or intercept @@ -1892,13 +1911,7 @@ int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int { int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false); - if (err) /* -EREMOTEIO is preserved */ - return err == -EREMOTEIO ? -EIO : err; - - if (MLX5_GET(mbox_out, out, status) != MLX5_CMD_STAT_OK) - return -EREMOTEIO; - - return 0; + return cmd_status_err(err, out); } EXPORT_SYMBOL(mlx5_cmd_do); @@ -1942,12 +1955,7 @@ int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, { int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true); - if (err) /* -EREMOTEIO is preserved */ - return err == -EREMOTEIO ? 
-EIO : err; - - if (MLX5_GET(mbox_out, out, status) != MLX5_CMD_STAT_OK) - err = -EREMOTEIO; - + err = cmd_status_err(err, out); return mlx5_cmd_check(dev, err, in, out); } EXPORT_SYMBOL(mlx5_cmd_exec_polling); @@ -1980,8 +1988,10 @@ EXPORT_SYMBOL(mlx5_cmd_cleanup_async_ctx); static void mlx5_cmd_exec_cb_handler(int status, void *_work) { struct mlx5_async_work *work = _work; - struct mlx5_async_ctx *ctx = work->ctx; + struct mlx5_async_ctx *ctx; + ctx = work->ctx; + status = cmd_status_err(status, work->out); work->user_callback(status, work); if (atomic_dec_and_test(&ctx->num_inflight)) wake_up(&ctx->wait); @@ -1995,6 +2005,7 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, work->ctx = ctx; work->user_callback = callback; + work->out = out; if (WARN_ON(!atomic_inc_not_zero(&ctx->num_inflight))) return -EIO; ret = cmd_exec(ctx->dev, in, in_size, out, out_size, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1b9bec8fa870..432151d7d0bf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -955,6 +955,7 @@ typedef void (*mlx5_async_cbk_t)(int status, struct mlx5_async_work *context); struct mlx5_async_work { struct mlx5_async_ctx *ctx; mlx5_async_cbk_t user_callback; + void *out; /* pointer to the cmd output buffer */ }; void mlx5_cmd_init_async_ctx(struct mlx5_core_dev *dev, @@ -963,7 +964,7 @@ void mlx5_cmd_cleanup_async_ctx(struct mlx5_async_ctx *ctx); int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, void *out, int out_size, mlx5_async_cbk_t callback, struct mlx5_async_work *work); - +void mlx5_cmd_out_err(struct mlx5_core_dev *dev, u16 opcode, u16 op_mod, void *out); int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, -- cgit v1.2.3 From 72fb3b60a3114a1154a8ae5629ea3b43a88a7a4d Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 14 Aug 2021 17:57:51 +0300 Subject: net/mlx5: Add reset_state field to MFRL register Add new field reset_state to MFRL register. This field expose current state of sync reset for fw update. This field enables sharing with the user more details on why fw activate failed in case it failed the sync reset stage. 
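For illustration, reading the new field once the register is queried (the helper name is hypothetical; mlx5_core_access_reg() and MLX5_REG_MFRL already exist):

static int foo_query_mfrl_reset_state(struct mlx5_core_dev *dev, u8 *state)
{
	u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {};
	u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {};
	int err;

	err = mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out),
				   MLX5_REG_MFRL, 0, 0);
	if (err)
		return err;

	*state = MLX5_GET(mfrl_reg, out, reset_state);
	return 0;	/* e.g. MLX5_MFRL_REG_RESET_STATE_IDLE */
}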
Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 598ac3bcc901..8ca2d65ff789 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9694,7 +9694,8 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x6b]; + u8 reserved_at_0[0x6a]; + u8 reset_state[0x1]; u8 ptpcyc2realtime_modify[0x1]; u8 reserved_at_6c[0x2]; u8 pci_status_and_power[0x1]; @@ -10375,6 +10376,14 @@ struct mlx5_ifc_mcda_reg_bits { u8 data[][0x20]; }; +enum { + MLX5_MFRL_REG_RESET_STATE_IDLE = 0, + MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION = 1, + MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS = 2, + MLX5_MFRL_REG_RESET_STATE_TIMEOUT = 3, + MLX5_MFRL_REG_RESET_STATE_NACK = 4, +}; + enum { MLX5_MFRL_REG_RESET_TYPE_FULL_CHIP = BIT(0), MLX5_MFRL_REG_RESET_TYPE_NET_PORT_ALIVE = BIT(1), @@ -10393,7 +10402,8 @@ struct mlx5_ifc_mfrl_reg_bits { u8 pci_sync_for_fw_update_start[0x1]; u8 pci_sync_for_fw_update_resp[0x2]; u8 rst_type_sel[0x3]; - u8 reserved_at_28[0x8]; + u8 reserved_at_28[0x4]; + u8 reset_state[0x4]; u8 reset_type[0x8]; u8 reset_level[0x8]; }; -- cgit v1.2.3 From 45fee8edb4b333af79efad7a99de51718ebda94b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 6 Sep 2021 11:02:44 +0300 Subject: net/mlx5: Add clarification on sync reset failure In case devlink reload action fw_activate failed in sync reset stage, use the new MFRL field reset_state to find why it failed and share this clarification with the user. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 10 ++-- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c | 57 +++++++++++++++++++--- drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h | 3 +- drivers/net/ethernet/mellanox/mlx5/core/port.c | 20 ++++++-- include/linux/mlx5/driver.h | 3 ++ 5 files changed, 74 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index d1093bb2d436..057dde6f4417 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -100,15 +100,11 @@ static int mlx5_devlink_reload_fw_activate(struct devlink *devlink, struct netli } net_port_alive = !!(reset_type & MLX5_MFRL_REG_RESET_TYPE_NET_PORT_ALIVE); - err = mlx5_fw_reset_set_reset_sync(dev, net_port_alive); + err = mlx5_fw_reset_set_reset_sync(dev, net_port_alive, extack); if (err) - goto out; + return err; - err = mlx5_fw_reset_wait_reset_done(dev); -out: - if (err) - NL_SET_ERR_MSG_MOD(extack, "FW activate command failed"); - return err; + return mlx5_fw_reset_wait_reset_done(dev); } static int mlx5_devlink_trigger_fw_live_patch(struct devlink *devlink, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c index 0b0234f9d694..d438d7a61500 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c @@ -57,7 +57,8 @@ static int mlx5_reg_mfrl_set(struct mlx5_core_dev *dev, u8 reset_level, return mlx5_core_access_reg(dev, in, sizeof(in), out, sizeof(out), MLX5_REG_MFRL, 0, 1); } -static int mlx5_reg_mfrl_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type) +static int 
mlx5_reg_mfrl_query(struct mlx5_core_dev *dev, u8 *reset_level, + u8 *reset_type, u8 *reset_state) { u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; @@ -71,25 +72,67 @@ static int mlx5_reg_mfrl_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *r *reset_level = MLX5_GET(mfrl_reg, out, reset_level); if (reset_type) *reset_type = MLX5_GET(mfrl_reg, out, reset_type); + if (reset_state) + *reset_state = MLX5_GET(mfrl_reg, out, reset_state); return 0; } int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type) { - return mlx5_reg_mfrl_query(dev, reset_level, reset_type); + return mlx5_reg_mfrl_query(dev, reset_level, reset_type, NULL); } -int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel) +static int mlx5_fw_reset_get_reset_state_err(struct mlx5_core_dev *dev, + struct netlink_ext_ack *extack) +{ + u8 reset_state; + + if (mlx5_reg_mfrl_query(dev, NULL, NULL, &reset_state)) + goto out; + + switch (reset_state) { + case MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION: + case MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS: + NL_SET_ERR_MSG_MOD(extack, "Sync reset was already triggered"); + return -EBUSY; + case MLX5_MFRL_REG_RESET_STATE_TIMEOUT: + NL_SET_ERR_MSG_MOD(extack, "Sync reset got timeout"); + return -ETIMEDOUT; + case MLX5_MFRL_REG_RESET_STATE_NACK: + NL_SET_ERR_MSG_MOD(extack, "One of the hosts disabled reset"); + return -EPERM; + } + +out: + NL_SET_ERR_MSG_MOD(extack, "Sync reset failed"); + return -EIO; +} + +int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel, + struct netlink_ext_ack *extack) { struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset; + u32 out[MLX5_ST_SZ_DW(mfrl_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(mfrl_reg)] = {}; int err; set_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); - err = mlx5_reg_mfrl_set(dev, MLX5_MFRL_REG_RESET_LEVEL3, reset_type_sel, 0, true); - if (err) - clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); - return err; + + MLX5_SET(mfrl_reg, in, reset_level, MLX5_MFRL_REG_RESET_LEVEL3); + MLX5_SET(mfrl_reg, in, rst_type_sel, reset_type_sel); + MLX5_SET(mfrl_reg, in, pci_sync_for_fw_update_start, 1); + err = mlx5_access_reg(dev, in, sizeof(in), out, sizeof(out), + MLX5_REG_MFRL, 0, 1, false); + if (!err) + return 0; + + clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags); + if (err == -EREMOTEIO && MLX5_CAP_MCAM_FEATURE(dev, reset_state)) + return mlx5_fw_reset_get_reset_state_err(dev, extack); + + NL_SET_ERR_MSG_MOD(extack, "Sync reset command failed"); + return mlx5_cmd_check(dev, err, in, out); } int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h index 7761ee5fc7d0..694fc7cb2684 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.h @@ -9,7 +9,8 @@ void mlx5_fw_reset_enable_remote_dev_reset_set(struct mlx5_core_dev *dev, bool enable); bool mlx5_fw_reset_enable_remote_dev_reset_get(struct mlx5_core_dev *dev); int mlx5_fw_reset_query(struct mlx5_core_dev *dev, u8 *reset_level, u8 *reset_type); -int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel); +int mlx5_fw_reset_set_reset_sync(struct mlx5_core_dev *dev, u8 reset_type_sel, + struct netlink_ext_ack *extack); int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev); int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev); diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 1ef2b6a848c1..d15b417d3e07 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -33,9 +33,10 @@ #include #include "mlx5_core.h" -int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, - int size_in, void *data_out, int size_out, - u16 reg_id, int arg, int write) +/* calling with verbose false will not print error to log */ +int mlx5_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, + void *data_out, int size_out, u16 reg_id, int arg, + int write, bool verbose) { int outlen = MLX5_ST_SZ_BYTES(access_register_out) + size_out; int inlen = MLX5_ST_SZ_BYTES(access_register_in) + size_in; @@ -57,7 +58,9 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, MLX5_SET(access_register_in, in, argument, arg); MLX5_SET(access_register_in, in, register_id, reg_id); - err = mlx5_cmd_exec(dev, in, inlen, out, outlen); + err = mlx5_cmd_do(dev, in, inlen, out, outlen); + if (verbose) + err = mlx5_cmd_check(dev, err, in, out); if (err) goto out; @@ -69,6 +72,15 @@ out: kvfree(in); return err; } +EXPORT_SYMBOL_GPL(mlx5_access_reg); + +int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, + int size_in, void *data_out, int size_out, + u16 reg_id, int arg, int write) +{ + return mlx5_access_reg(dev, data_in, size_in, data_out, size_out, + reg_id, arg, write, true); +} EXPORT_SYMBOL_GPL(mlx5_core_access_reg); int mlx5_query_pcam_reg(struct mlx5_core_dev *dev, u32 *pcam, u8 feature_group, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 432151d7d0bf..d3b1a6a1f8d2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1031,6 +1031,9 @@ int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) void mlx5_qp_debugfs_init(struct mlx5_core_dev *dev); void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, + void *data_out, int size_out, u16 reg_id, int arg, + int write, bool verbose); int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, void *data_out, int size_out, u16 reg_num, int arg, int write); -- cgit v1.2.3 From 1241e329ce2e1f5b1039fd356b75867b29721ad2 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Wed, 23 Feb 2022 00:09:12 +0530 Subject: ethtool: add support to set/get completion queue event size Add support to set completion queue event size via ethtool -G parameter and get it via ethtool -g parameter. 
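On the driver side (an illustrative sketch; all foo_* names are hypothetical, only the ethtool plumbing is from this patch), support boils down to advertising ETHTOOL_RING_USE_CQE_SIZE and consuming kernel_ethtool_ringparam->cqe_size; the ethtool command-line example follows below:

#include <linux/ethtool.h>
#include <linux/netdevice.h>

struct foo_priv {
	u32 cqe_size;
};

static int foo_reconfigure_queues(struct foo_priv *priv)
{
	return 0;	/* stub: reallocate CQs using priv->cqe_size */
}

static int foo_set_ringparam(struct net_device *ndev,
			     struct ethtool_ringparam *ring,
			     struct kernel_ethtool_ringparam *kring,
			     struct netlink_ext_ack *extack)
{
	struct foo_priv *priv = netdev_priv(ndev);

	if (kring->cqe_size != 128 && kring->cqe_size != 512) {
		NL_SET_ERR_MSG_MOD(extack, "CQE size must be 128 or 512");
		return -EINVAL;
	}

	priv->cqe_size = kring->cqe_size;
	return foo_reconfigure_queues(priv);
}

static const struct ethtool_ops foo_ethtool_ops = {
	.supported_ring_params	= ETHTOOL_RING_USE_CQE_SIZE,
	.set_ringparam		= foo_set_ringparam,
};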
~ # ./ethtool -G eth0 cqe-size 512
~ # ./ethtool -g eth0
Ring parameters for eth0:
Pre-set maximums:
RX:		1048576
RX Mini:	n/a
RX Jumbo:	n/a
TX:		1048576
Current hardware settings:
RX:		256
RX Mini:	n/a
RX Jumbo:	n/a
TX:		4096
RX Buf Len:	2048
CQE Size:	128

Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 11 +++++++++++ include/linux/ethtool.h | 4 ++++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/netlink.h | 2 +- net/ethtool/rings.c | 19 +++++++++++++++++-- 5 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index cae28af7a476..24d9be69065d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -861,6 +861,7 @@ Kernel response contents: ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` u8 TCP header / data split + ``ETHTOOL_A_RINGS_CQE_SIZE`` u32 Size of TX/RX CQE ==================================== ====== =========================== ``ETHTOOL_A_RINGS_TCP_DATA_SPLIT`` indicates whether the device is usable with @@ -885,6 +886,7 @@ Request contents: ``ETHTOOL_A_RINGS_RX_JUMBO`` u32 size of RX jumbo ring ``ETHTOOL_A_RINGS_TX`` u32 size of TX ring ``ETHTOOL_A_RINGS_RX_BUF_LEN`` u32 size of buffers on the ring + ``ETHTOOL_A_RINGS_CQE_SIZE`` u32 Size of TX/RX CQE ==================================== ====== =========================== Kernel checks that requested ring sizes do not exceed limits reported by @@ -892,6 +894,15 @@ driver. Driver may impose additional constraints and may not support all attributes. +``ETHTOOL_A_RINGS_CQE_SIZE`` specifies the completion queue event size. +Completion queue events (CQE) are the events posted by the NIC to indicate +the completion status of a packet when it is sent (like send success or +error) or received (like pointers to packet fragments). The CQE size +parameter allows modifying the CQE size from the default, if the NIC +supports it. A bigger CQE can carry more receive buffer pointers; in turn, +the NIC can transfer a bigger frame from the wire. Depending on the NIC +hardware, the overall completion queue size may be adjusted in the driver +when the CQE size is modified. + CHANNELS_GET ============ diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e0853f48b75e..4af58459a1e7 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -71,18 +71,22 @@ enum { * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring.
* @tcp_data_split: Scatter packet headers and data to separate buffers + * @cqe_size: Size of TX/RX completion queue event */ struct kernel_ethtool_ringparam { u32 rx_buf_len; u8 tcp_data_split; + u32 cqe_size; }; /** * enum ethtool_supported_ring_param - indicator caps for setting ring params * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len + * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), + ETHTOOL_RING_USE_CQE_SIZE = BIT(1), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 417d4280d7b5..979850221b8d 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -337,6 +337,7 @@ enum { ETHTOOL_A_RINGS_TX, /* u32 */ ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ + ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ /* add new constants above here */ __ETHTOOL_A_RINGS_CNT, diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 75856db299e9..29d01662a48b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -363,7 +363,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; -extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_RX_BUF_LEN + 1]; +extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_CQE_SIZE + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 18a5035d3bee..9f33c9689b56 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -54,7 +54,8 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RINGS_RX_JUMBO */ nla_total_size(sizeof(u32)) + /* _RINGS_TX */ nla_total_size(sizeof(u32)) + /* _RINGS_RX_BUF_LEN */ - nla_total_size(sizeof(u8)); /* _RINGS_TCP_DATA_SPLIT */ + nla_total_size(sizeof(u8)) + /* _RINGS_TCP_DATA_SPLIT */ + nla_total_size(sizeof(u32)); /* _RINGS_CQE_SIZE */ } static int rings_fill_reply(struct sk_buff *skb, @@ -91,7 +92,9 @@ static int rings_fill_reply(struct sk_buff *skb, (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) || (kr->tcp_data_split && (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT, - kr->tcp_data_split)))) + kr->tcp_data_split))) || + (kr->cqe_size && + (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size)))) return -EMSGSIZE; return 0; @@ -119,6 +122,7 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 }, [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1), + [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1), }; int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) @@ -159,6 +163,8 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod); ethnl_update_u32(&kernel_ringparam.rx_buf_len, 
tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod); + ethnl_update_u32(&kernel_ringparam.cqe_size, + tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod); ret = 0; if (!mod) goto out_ops; @@ -190,6 +196,15 @@ int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info) goto out_ops; } + if (kernel_ringparam.cqe_size && + !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { + ret = -EOPNOTSUPP; + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_CQE_SIZE], + "setting cqe size not supported"); + goto out_ops; + } + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); if (ret < 0) -- cgit v1.2.3 From 5597f082fcaf17e2d9adee7a1f6fa02f5872f9d5 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 13 Jan 2022 15:34:07 +0100 Subject: can: bittiming: mark function arguments and local variables as const This patch marks the arguments of some functions as well as some local variables as constant. Link: https://lore.kernel.org/all/20220124215642.3474154-7-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/bittiming.c | 12 ++++++------ include/linux/can/bittiming.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/dev/bittiming.c b/drivers/net/can/dev/bittiming.c index 1b1d1499e2f1..2103bcca9012 100644 --- a/drivers/net/can/dev/bittiming.c +++ b/drivers/net/can/dev/bittiming.c @@ -24,7 +24,7 @@ */ static int can_update_sample_point(const struct can_bittiming_const *btc, - unsigned int sample_point_nominal, unsigned int tseg, + const unsigned int sample_point_nominal, const unsigned int tseg, unsigned int *tseg1_ptr, unsigned int *tseg2_ptr, unsigned int *sample_point_error_ptr) { @@ -63,7 +63,7 @@ can_update_sample_point(const struct can_bittiming_const *btc, return best_sample_point; } -int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, +int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc) { struct can_priv *priv = netdev_priv(dev); @@ -208,10 +208,10 @@ void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, * prescaler value brp. You can find more information in the header * file linux/can/netlink.h. 
*/ -static int can_fixup_bittiming(struct net_device *dev, struct can_bittiming *bt, +static int can_fixup_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc) { - struct can_priv *priv = netdev_priv(dev); + const struct can_priv *priv = netdev_priv(dev); unsigned int tseg1, alltseg; u64 brp64; @@ -244,7 +244,7 @@ static int can_fixup_bittiming(struct net_device *dev, struct can_bittiming *bt, /* Checks the validity of predefined bitrate settings */ static int -can_validate_bitrate(struct net_device *dev, struct can_bittiming *bt, +can_validate_bitrate(const struct net_device *dev, const struct can_bittiming *bt, const u32 *bitrate_const, const unsigned int bitrate_const_cnt) { @@ -258,7 +258,7 @@ can_validate_bitrate(struct net_device *dev, struct can_bittiming *bt, return -EINVAL; } -int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, +int can_get_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc, const u32 *bitrate_const, const unsigned int bitrate_const_cnt) diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index a81652d1c6f3..7ae21c0f7f23 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -113,7 +113,7 @@ struct can_tdc_const { }; #ifdef CONFIG_CAN_CALC_BITTIMING -int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, +int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc); void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, @@ -121,7 +121,7 @@ void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, u32 *ctrlmode, u32 ctrlmode_supported); #else /* !CONFIG_CAN_CALC_BITTIMING */ static inline int -can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, +can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc) { netdev_err(dev, "bit-timing calculation not available\n"); @@ -136,7 +136,7 @@ can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, } #endif /* CONFIG_CAN_CALC_BITTIMING */ -int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, +int can_get_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc, const u32 *bitrate_const, const unsigned int bitrate_const_cnt); -- cgit v1.2.3 From 5e187189ec324f78035d33a4bc123a9c4ca6f3e3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 26 Feb 2022 12:18:29 +0800 Subject: net: ip: add skb drop reasons for ip egress path Replace kfree_skb() which is used in the packet egress path of IP layer with kfree_skb_reason(). Functions that are involved include: __ip_queue_xmit() ip_finish_output() ip_mc_finish_output() ip6_output() ip6_finish_output() ip6_finish_output2() Following new drop reasons are introduced: SKB_DROP_REASON_IP_OUTNOROUTES SKB_DROP_REASON_BPF_CGROUP_EGRESS SKB_DROP_REASON_IPV6DISABLED SKB_DROP_REASON_NEIGH_CREATEFAIL Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 9 +++++++++ include/trace/events/skb.h | 5 +++++ net/ipv4/ip_output.c | 8 ++++---- net/ipv6/ip6_output.c | 6 +++--- 4 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 31be38078918..62b4bed1b7bc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -380,6 +380,15 @@ enum skb_drop_reason { * the ofo queue, corresponding to * LINUX_MIB_TCPOFOMERGE */ + SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ + SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by + * BPF_PROG_TYPE_CGROUP_SKB + * eBPF program + */ + SKB_DROP_REASON_IPV6DISABLED, /* IPv6 is disabled on the device */ + SKB_DROP_REASON_NEIGH_CREATEFAIL, /* failed to create neigh + * entry + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 2ab7193313aa..97f8097651da 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -37,6 +37,11 @@ EM(SKB_DROP_REASON_TCP_OLD_DATA, TCP_OLD_DATA) \ EM(SKB_DROP_REASON_TCP_OVERWINDOW, TCP_OVERWINDOW) \ EM(SKB_DROP_REASON_TCP_OFOMERGE, TCP_OFOMERGE) \ + EM(SKB_DROP_REASON_IP_OUTNOROUTES, IP_OUTNOROUTES) \ + EM(SKB_DROP_REASON_BPF_CGROUP_EGRESS, \ + BPF_CGROUP_EGRESS) \ + EM(SKB_DROP_REASON_IPV6DISABLED, IPV6DISABLED) \ + EM(SKB_DROP_REASON_NEIGH_CREATEFAIL, NEIGH_CREATEFAIL) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2941de5da871..6df3545c891d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -233,7 +233,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s net_dbg_ratelimited("%s: No header cache and no neighbour!\n", __func__); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; } @@ -317,7 +317,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk case NET_XMIT_CN: return __ip_finish_output(net, sk, skb) ? : ret; default: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } @@ -337,7 +337,7 @@ static int ip_mc_finish_output(struct net *net, struct sock *sk, case NET_XMIT_SUCCESS: break; default: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } @@ -536,7 +536,7 @@ packet_routed: no_route: rcu_read_unlock(); IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES); return -EHOSTUNREACH; } EXPORT_SYMBOL(__ip_queue_xmit); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c5c10f798025..c5edc86b18bd 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -130,7 +130,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * rcu_read_unlock_bh(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; } @@ -202,7 +202,7 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s case NET_XMIT_CN: return __ip6_finish_output(net, sk, skb) ? 
: ret; default: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); return ret; } } @@ -217,7 +217,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (unlikely(idev->cnf.disable_ipv6)) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } -- cgit v1.2.3 From a5736edda10ca2ba075606baad1dda3f2426766d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 26 Feb 2022 12:18:30 +0800 Subject: net: neigh: use kfree_skb_reason() for __neigh_event_send() Replace kfree_skb() used in __neigh_event_send() with kfree_skb_reason(). Following drop reasons are added: SKB_DROP_REASON_NEIGH_FAILED SKB_DROP_REASON_NEIGH_QUEUEFULL SKB_DROP_REASON_NEIGH_DEAD The first two reasons above should be the hot path that skb drops in neighbour subsystem. Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ include/trace/events/skb.h | 3 +++ net/core/neighbour.c | 6 +++--- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 62b4bed1b7bc..d67941f78b92 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -389,6 +389,11 @@ enum skb_drop_reason { SKB_DROP_REASON_NEIGH_CREATEFAIL, /* failed to create neigh * entry */ + SKB_DROP_REASON_NEIGH_FAILED, /* neigh entry in failed state */ + SKB_DROP_REASON_NEIGH_QUEUEFULL, /* arp_queue for neigh + * entry is full + */ + SKB_DROP_REASON_NEIGH_DEAD, /* neigh entry is dead */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 97f8097651da..1977f301260d 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -42,6 +42,9 @@ BPF_CGROUP_EGRESS) \ EM(SKB_DROP_REASON_IPV6DISABLED, IPV6DISABLED) \ EM(SKB_DROP_REASON_NEIGH_CREATEFAIL, NEIGH_CREATEFAIL) \ + EM(SKB_DROP_REASON_NEIGH_FAILED, NEIGH_FAILED) \ + EM(SKB_DROP_REASON_NEIGH_QUEUEFULL, NEIGH_QUEUEFULL) \ + EM(SKB_DROP_REASON_NEIGH_DEAD, NEIGH_DEAD) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ec0bf737b076..f64ebd050f6c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1171,7 +1171,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, neigh->updated = jiffies; write_unlock_bh(&neigh->lock); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); return 1; } } else if (neigh->nud_state & NUD_STALE) { @@ -1193,7 +1193,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb, if (!buff) break; neigh->arp_queue_len_bytes -= buff->truesize; - kfree_skb(buff); + kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL); NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); } skb_dst_force(skb); @@ -1215,7 +1215,7 @@ out_dead: if (neigh->nud_state & NUD_STALE) goto out_unlock_bh; write_unlock_bh(&neigh->lock); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD); trace_neigh_event_send_dead(neigh, 1); return 1; } -- cgit v1.2.3 From e8eb9e32999dc5995b19d5141f8ebb38f69696fc Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Thu, 24 Feb 2022 18:58:55 -0800 Subject: PCI: Add Fungible Vendor ID to pci_ids.h Cc: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Signed-off-by: Dimitris Michailidis Acked-by: Bjorn Helgaas Signed-off-by: David S. 
Miller --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index aad54c666407..c7e6f2043c7d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2561,6 +2561,8 @@ #define PCI_VENDOR_ID_HYGON 0x1d94 +#define PCI_VENDOR_ID_FUNGIBLE 0x1dad + #define PCI_VENDOR_ID_HXT 0x1dbf #define PCI_VENDOR_ID_TEKRAM 0x1de1 -- cgit v1.2.3 From 91495f21fcec3a889d6a9c21e6df16c4c15f2184 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:16 +0200 Subject: net: dsa: tag_8021q: replace the SVL bridging with VLAN-unaware IVL bridging For VLAN-unaware bridging, tag_8021q uses something perhaps a bit too tied with the sja1105 switch: each port uses the same pvid which is also used for standalone operation (a unique one from which the source port and device ID can be retrieved when packets from that port are forwarded to the CPU). Since each port has a unique pvid when performing autonomous forwarding, the switch must be configured for Shared VLAN Learning (SVL) such that the VLAN ID itself is ignored when performing FDB lookups. Without SVL, packets would always be flooded, since FDB lookup in the source port's VLAN would never find any entry. First of all, to make tag_8021q more palatable to switches which might not support Shared VLAN Learning, let's just use a common VLAN for all ports that are under the same bridge. Secondly, using Shared VLAN Learning means that FDB isolation can never be enforced. But if all ports under the same VLAN-unaware bridge share the same VLAN ID, it can. The disadvantage is that the CPU port can no longer perform precise source port identification for these packets. But at least we have a mechanism which has proven to be adequate for that situation: imprecise RX (dsa_find_designated_bridge_port_by_vid), which is what we use for termination on VLAN-aware bridges. The VLAN ID that VLAN-unaware bridges will use with tag_8021q is the same one as we were previously using for imprecise TX (bridge TX forwarding offload). It is already allocated, it is just a matter of using it. Note that because now all ports under the same bridge share the same VLAN, the complexity of performing a tag_8021q bridge join decreases dramatically. We no longer have to install the RX VLAN of a newly joining port into the port membership of the existing bridge ports. The newly joining port just becomes a member of the VLAN corresponding to that bridge, and the other ports are already members of it from when they joined the bridge themselves. So forwarding works properly. This means that we can unhook dsa_tag_8021q_bridge_{join,leave} from the cross-chip notifier level dsa_switch_bridge_{join,leave}. We can put these calls directly into the sja1105 driver. With this new mode of operation, a port controlled by tag_8021q can have two pvids whereas before it could only have one. The pvid for standalone operation is different from the pvid used for VLAN-unaware bridging. This is done, again, so that FDB isolation can be enforced. Let tag_8021q manage this by deleting the standalone pvid when a port joins a bridge, and restoring it when it leaves it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/dsa/sja1105/sja1105_main.c | 4 +- drivers/net/dsa/sja1105/sja1105_vl.c | 15 +++- include/linux/dsa/8021q.h | 12 +-- net/dsa/dsa_priv.h | 4 - net/dsa/switch.c | 4 +- net/dsa/tag_8021q.c | 132 ++++++++++++--------------------- 6 files changed, 70 insertions(+), 101 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 8fc309446e1e..ab196dc615da 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2067,7 +2067,7 @@ static int sja1105_bridge_join(struct dsa_switch *ds, int port, if (rc) return rc; - rc = dsa_tag_8021q_bridge_tx_fwd_offload(ds, port, bridge); + rc = dsa_tag_8021q_bridge_join(ds, port, bridge); if (rc) { sja1105_bridge_member(ds, port, bridge, false); return rc; @@ -2081,7 +2081,7 @@ static int sja1105_bridge_join(struct dsa_switch *ds, int port, static void sja1105_bridge_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge) { - dsa_tag_8021q_bridge_tx_fwd_unoffload(ds, port, bridge); + dsa_tag_8021q_bridge_leave(ds, port, bridge); sja1105_bridge_member(ds, port, bridge, false); } diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index f5dca6a9b0f9..14e6dd7fb103 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -296,6 +296,19 @@ static bool sja1105_vl_key_lower(struct sja1105_vl_lookup_entry *a, return false; } +/* FIXME: this should change when the bridge upper of the port changes. */ +static u16 sja1105_port_get_tag_8021q_vid(struct dsa_port *dp) +{ + unsigned long bridge_num; + + if (!dp->bridge) + return dsa_tag_8021q_rx_vid(dp); + + bridge_num = dsa_port_bridge_num_get(dp); + + return dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); +} + static int sja1105_init_virtual_links(struct sja1105_private *priv, struct netlink_ext_ack *extack) { @@ -395,7 +408,7 @@ static int sja1105_init_virtual_links(struct sja1105_private *priv, vl_lookup[k].vlanprior = rule->key.vl.pcp; } else { struct dsa_port *dp = dsa_to_port(priv->ds, port); - u16 vid = dsa_tag_8021q_rx_vid(dp); + u16 vid = sja1105_port_get_tag_8021q_vid(dp); vl_lookup[k].vlanid = vid; vl_lookup[k].vlanprior = 0; diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 939a1beaddf7..f47f227baa27 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -32,17 +32,17 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); void dsa_tag_8021q_unregister(struct dsa_switch *ds); +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, + struct dsa_bridge bridge); + +void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_bridge bridge); + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); -int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, - struct dsa_bridge bridge); - -void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, - struct dsa_bridge bridge); - u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 1ba93afdc874..7a1c98581f53 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -522,10 +522,6 @@ struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst, const struct net_device *br); /* tag_8021q.c 
*/ -int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, - struct dsa_notifier_bridge_info *info); -int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, - struct dsa_notifier_bridge_info *info); int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, struct dsa_notifier_tag_8021q_vlan_info *info); int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 0c2961cbc105..eb38beb10147 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -110,7 +110,7 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds, return err; } - return dsa_tag_8021q_bridge_join(ds, info); + return 0; } static int dsa_switch_sync_vlan_filtering(struct dsa_switch *ds, @@ -186,7 +186,7 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, return err; } - return dsa_tag_8021q_bridge_leave(ds, info); + return 0; } /* Matches for all upstream-facing ports (the CPU port and all upstream-facing diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 114f663332d0..c6555003f5df 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -110,6 +110,15 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); +/* Returns the decoded VBID from the RX VID. */ +static int dsa_tag_8021q_rx_vbid(u16 vid) +{ + u16 vbid_hi = (vid & DSA_8021Q_VBID_HI_MASK) >> DSA_8021Q_VBID_HI_SHIFT; + u16 vbid_lo = (vid & DSA_8021Q_VBID_LO_MASK) >> DSA_8021Q_VBID_LO_SHIFT; + + return (vbid_hi << 2) | vbid_lo; +} + bool vid_is_dsa_8021q_rxvlan(u16 vid) { return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX; @@ -244,11 +253,17 @@ int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, if (dsa_port_is_user(dp)) flags |= BRIDGE_VLAN_INFO_UNTAGGED; + /* Standalone VLANs are PVIDs */ if (vid_is_dsa_8021q_rxvlan(info->vid) && dsa_8021q_rx_switch_id(info->vid) == ds->index && dsa_8021q_rx_source_port(info->vid) == dp->index) flags |= BRIDGE_VLAN_INFO_PVID; + /* And bridging VLANs are PVIDs too on user ports */ + if (dsa_tag_8021q_rx_vbid(info->vid) && + dsa_port_is_user(dp)) + flags |= BRIDGE_VLAN_INFO_PVID; + err = dsa_port_do_tag_8021q_vlan_add(dp, info->vid, flags); if (err) @@ -326,107 +341,52 @@ int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, * +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+ * swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3 */ -static bool -dsa_port_tag_8021q_bridge_match(struct dsa_port *dp, - struct dsa_notifier_bridge_info *info) +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, + struct dsa_bridge bridge) { - /* Don't match on self */ - if (dp->ds->dst->index == info->tree_index && - dp->ds->index == info->sw_index && - dp->index == info->port) - return false; - - if (dsa_port_is_user(dp)) - return dsa_port_offloads_bridge(dp, &info->bridge); - - return false; -} - -int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, - struct dsa_notifier_bridge_info *info) -{ - struct dsa_switch *targeted_ds; - struct dsa_port *targeted_dp; - struct dsa_port *dp; - u16 targeted_rx_vid; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 standalone_vid, bridge_vid; int err; - if (!ds->tag_8021q_ctx) - return 0; - - targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); - targeted_dp = dsa_to_port(targeted_ds, info->port); - targeted_rx_vid = dsa_tag_8021q_rx_vid(targeted_dp); - - dsa_switch_for_each_port(dp, ds) { - u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - - if (!dsa_port_tag_8021q_bridge_match(dp, info)) - continue; + /* Delete the standalone VLAN of the port and replace it with a + * bridging 
VLAN + */ + standalone_vid = dsa_tag_8021q_rx_vid(dp); + bridge_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); - /* Install the RX VID of the targeted port in our VLAN table */ - err = dsa_port_tag_8021q_vlan_add(dp, targeted_rx_vid, true); - if (err) - return err; + err = dsa_port_tag_8021q_vlan_add(dp, bridge_vid, true); + if (err) + return err; - /* Install our RX VID into the targeted port's VLAN table */ - err = dsa_port_tag_8021q_vlan_add(targeted_dp, rx_vid, true); - if (err) - return err; - } + dsa_port_tag_8021q_vlan_del(dp, standalone_vid, false); return 0; } +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_join); -int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, - struct dsa_notifier_bridge_info *info) +void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_bridge bridge) { - struct dsa_switch *targeted_ds; - struct dsa_port *targeted_dp; - struct dsa_port *dp; - u16 targeted_rx_vid; - - if (!ds->tag_8021q_ctx) - return 0; - - targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); - targeted_dp = dsa_to_port(targeted_ds, info->port); - targeted_rx_vid = dsa_tag_8021q_rx_vid(targeted_dp); - - dsa_switch_for_each_port(dp, ds) { - u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - - if (!dsa_port_tag_8021q_bridge_match(dp, info)) - continue; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 standalone_vid, bridge_vid; + int err; - /* Remove the RX VID of the targeted port from our VLAN table */ - dsa_port_tag_8021q_vlan_del(dp, targeted_rx_vid, true); + /* Delete the bridging VLAN of the port and replace it with a + * standalone VLAN + */ + standalone_vid = dsa_tag_8021q_rx_vid(dp); + bridge_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); - /* Remove our RX VID from the targeted port's VLAN table */ - dsa_port_tag_8021q_vlan_del(targeted_dp, rx_vid, true); + err = dsa_port_tag_8021q_vlan_add(dp, standalone_vid, false); + if (err) { + dev_err(ds->dev, + "Failed to delete tag_8021q standalone VLAN %d from port %d: %pe\n", + standalone_vid, port, ERR_PTR(err)); } - return 0; -} - -int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, - struct dsa_bridge bridge) -{ - u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); - - return dsa_port_tag_8021q_vlan_add(dsa_to_port(ds, port), tx_vid, - true); -} -EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_offload); - -void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, - struct dsa_bridge bridge) -{ - u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); - - dsa_port_tag_8021q_vlan_del(dsa_to_port(ds, port), tx_vid, true); + dsa_port_tag_8021q_vlan_del(dp, bridge_vid, true); } -EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_unoffload); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_leave); /* Set up a port's tag_8021q RX and TX VLAN for standalone mode operation */ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) -- cgit v1.2.3 From d7f9787a763f35225287aedb9364c972ae128d18 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:17 +0200 Subject: net: dsa: tag_8021q: add support for imprecise RX based on the VBID The sja1105 switch can't populate the PORT field of the tag_8021q header when sending a frame to the CPU with a non-zero VBID. Similar to dsa_find_designated_bridge_port_by_vid() which performs imprecise RX for VLAN-aware bridges, let's introduce a helper in tag_8021q for performing imprecise RX based on the VLAN that it has allocated for a VLAN-unaware bridge. 
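In a tagger's rcv path this slots in as a fallback ladder from precise to imprecise RX; a sketch abridged from the sja1105_rcv() hunk below, with skb validation elided:

	int source_port = -1, switch_id = -1, vbid = -1;
	u16 vid = 0;

	/* Decode PORT, SWITCH_ID and VBID from the tag_8021q header */
	dsa_8021q_rcv(skb, &source_port, &switch_id, &vbid);

	if (vbid >= 1)
		/* Bridge known, exact port unknown: imprecise RX */
		skb->dev = dsa_tag_8021q_find_port_by_vbid(netdev, vbid);
	else if (source_port == -1 || switch_id == -1)
		/* No tag_8021q data: imprecise RX by bridge VLAN */
		skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid);
	else
		/* VBID 0: the tag pinpoints the source port, precise RX */
		skb->dev = dsa_master_find_slave(netdev, switch_id, source_port);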
Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 6 +++++- net/dsa/tag_8021q.c | 38 ++++++++++++++++++++++++++++++++++++-- net/dsa/tag_ocelot_8021q.c | 2 +- net/dsa/tag_sja1105.c | 22 +++++++++++++--------- 4 files changed, 55 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index f47f227baa27..92f5243b841e 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -41,7 +41,11 @@ void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *vbid); + +struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, + int vbid); u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index c6555003f5df..1cf245a6f18e 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -32,7 +32,7 @@ * VBID - { VID[9], VID[5:4] }: * Virtual bridge ID. If between 1 and 7, packet targets the broadcast * domain of a bridge. If transmitted as zero, packet targets a single - * port. Field only valid on transmit, must be ignored on receive. + * port. * * PORT - VID[3:0]: * Index of switch port. Must be between 0 and 15. @@ -533,7 +533,37 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id) +struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, + int vbid) +{ + struct dsa_port *cpu_dp = master->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct dsa_port *dp; + + if (WARN_ON(!vbid)) + return NULL; + + dsa_tree_for_each_user_port(dp, dst) { + if (!dp->bridge) + continue; + + if (dp->stp_state != BR_STATE_LEARNING && + dp->stp_state != BR_STATE_FORWARDING) + continue; + + if (dp->cpu_dp != cpu_dp) + continue; + + if (dsa_port_bridge_num_get(dp) == vbid) + return dp->slave; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(dsa_tag_8021q_find_port_by_vbid); + +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *vbid) { u16 vid, tci; @@ -550,6 +580,10 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id) *source_port = dsa_8021q_rx_source_port(vid); *switch_id = dsa_8021q_rx_switch_id(vid); + + if (vbid) + *vbid = dsa_tag_8021q_rx_vbid(vid); + skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } EXPORT_SYMBOL_GPL(dsa_8021q_rcv); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index bd6f1d0e5372..1144a87ad0db 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -77,7 +77,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, { int src_port, switch_id; - dsa_8021q_rcv(skb, &src_port, &switch_id); + dsa_8021q_rcv(skb, &src_port, &switch_id, NULL); skb->dev = dsa_master_find_slave(netdev, switch_id, src_port); if (!skb->dev) diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 72d5e0ef8dcf..9c5c00980b06 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -509,7 +509,7 @@ static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb) * packet. 
*/ static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port, - int *switch_id, u16 *vid) + int *switch_id, int *vbid, u16 *vid) { struct vlan_ethhdr *hdr = (struct vlan_ethhdr *)skb_mac_header(skb); u16 vlan_tci; @@ -519,8 +519,8 @@ static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port, else vlan_tci = ntohs(hdr->h_vlan_TCI); - if (vid_is_dsa_8021q_rxvlan(vlan_tci & VLAN_VID_MASK)) - return dsa_8021q_rcv(skb, source_port, switch_id); + if (vid_is_dsa_8021q(vlan_tci & VLAN_VID_MASK)) + return dsa_8021q_rcv(skb, source_port, switch_id, vbid); /* Try our best with imprecise RX */ *vid = vlan_tci & VLAN_VID_MASK; @@ -529,7 +529,7 @@ static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port, static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev) { - int source_port = -1, switch_id = -1; + int source_port = -1, switch_id = -1, vbid = -1; struct sja1105_meta meta = {0}; struct ethhdr *hdr; bool is_link_local; @@ -542,7 +542,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, if (sja1105_skb_has_tag_8021q(skb)) { /* Normal traffic path. */ - sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vbid, &vid); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -561,7 +561,9 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } - if (source_port == -1 || switch_id == -1) + if (vbid >= 1) + skb->dev = dsa_tag_8021q_find_port_by_vbid(netdev, vbid); + else if (source_port == -1 || switch_id == -1) skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); else skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); @@ -686,7 +688,7 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, static struct sk_buff *sja1110_rcv(struct sk_buff *skb, struct net_device *netdev) { - int source_port = -1, switch_id = -1; + int source_port = -1, switch_id = -1, vbid = -1; bool host_only = false; u16 vid = 0; @@ -700,9 +702,11 @@ static struct sk_buff *sja1110_rcv(struct sk_buff *skb, /* Packets with in-band control extensions might still have RX VLANs */ if (likely(sja1105_skb_has_tag_8021q(skb))) - sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vbid, &vid); - if (source_port == -1 || switch_id == -1) + if (vbid >= 1) + skb->dev = dsa_tag_8021q_find_port_by_vbid(netdev, vbid); + else if (source_port == -1 || switch_id == -1) skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); else skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); -- cgit v1.2.3 From 04b67e18ce5b29785578397f6785f28f512d64aa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:20 +0200 Subject: net: dsa: tag_8021q: merge RX and TX VLANs In the old Shared VLAN Learning mode of operation that tag_8021q previously used for forwarding, we needed to have distinct concepts for an RX and a TX VLAN. An RX VLAN could be installed on all ports that were members of a given bridge, so that autonomous forwarding could still work, while a TX VLAN was dedicated for precise packet steering, so it just contained the CPU port and one egress port. Now that tag_8021q uses Independent VLAN Learning and imprecise RX/TX all over, those lines have been blurred and we no longer have the need to do precise TX towards a port that is in a bridge. 
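Concretely, the single VLAN left per standalone port is composed as follows (a sketch using the macro names from the hunks further below):

	/* VID[11:10] = 0b11 (RSV), VID[8:6] = SWITCH_ID, VID[3:0] = PORT.
	 * 0b11 << 10 is 3072 and the low ten bits are free, so all
	 * tag_8021q VIDs now fall in the last 1K of the VLAN space,
	 * 3072-4095.
	 */
	u16 vid = DSA_8021Q_RSV | DSA_8021Q_SWITCH_ID(dp->ds->index) |
		  DSA_8021Q_PORT(dp->index);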
As for standalone ports, it is fine to use the same VLAN ID for both RX and TX. This patch changes the tag_8021q format by shifting the VLAN range it reserves, and halving it. Previously, our DIR bits were encoding the VLAN direction (RX/TX) and were set to either 1 or 2. This meant that tag_8021q reserved 2K VLANs, or 50% of the available range. Change the DIR bits to a hardcoded value of 3 now, which makes tag_8021q reserve only 1K VLANs, and a different range now (the last 1K). This is done so that we leave the old format in place in case we need to return to it. In terms of code, the vid_is_dsa_8021q_rxvlan and vid_is_dsa_8021q_txvlan functions go away. Any vid_is_dsa_8021q is both a TX and an RX VLAN, and they are no longer distinct. For example, felix which did different things for different VLAN types, now needs to handle the RX and the TX logic for the same VLAN. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 120 ++++++++++++----------- drivers/net/dsa/sja1105/sja1105_main.c | 2 +- drivers/net/dsa/sja1105/sja1105_vl.c | 3 +- include/linux/dsa/8021q.h | 8 +- net/dsa/tag_8021q.c | 169 +++++++++------------------------ net/dsa/tag_ocelot_8021q.c | 2 +- net/dsa/tag_sja1105.c | 4 +- 7 files changed, 115 insertions(+), 193 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 57d6e62d7a48..33037ee305b4 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -25,8 +25,10 @@ #include #include "felix.h" -static int felix_tag_8021q_rxvlan_add(struct felix *felix, int port, u16 vid, - bool pvid, bool untagged) +/* Set up VCAP ES0 rules for pushing a tag_8021q VLAN towards the CPU such that + * the tagger can perform RX source port identification. + */ +static int felix_tag_8021q_vlan_add_rx(struct felix *felix, int port, u16 vid) { struct ocelot_vcap_filter *outer_tagging_rule; struct ocelot *ocelot = &felix->ocelot; @@ -64,21 +66,32 @@ static int felix_tag_8021q_rxvlan_add(struct felix *felix, int port, u16 vid, return err; } -static int felix_tag_8021q_txvlan_add(struct felix *felix, int port, u16 vid, - bool pvid, bool untagged) +static int felix_tag_8021q_vlan_del_rx(struct felix *felix, int port, u16 vid) +{ + struct ocelot_vcap_filter *outer_tagging_rule; + struct ocelot_vcap_block *block_vcap_es0; + struct ocelot *ocelot = &felix->ocelot; + + block_vcap_es0 = &ocelot->block[VCAP_ES0]; + + outer_tagging_rule = ocelot_vcap_block_find_filter_by_id(block_vcap_es0, + port, false); + if (!outer_tagging_rule) + return -ENOENT; + + return ocelot_vcap_filter_del(ocelot, outer_tagging_rule); +} + +/* Set up VCAP IS1 rules for stripping the tag_8021q VLAN on TX and VCAP IS2 + * rules for steering those tagged packets towards the correct destination port + */ +static int felix_tag_8021q_vlan_add_tx(struct felix *felix, int port, u16 vid) { struct ocelot_vcap_filter *untagging_rule, *redirect_rule; struct ocelot *ocelot = &felix->ocelot; struct dsa_switch *ds = felix->ds; int upstream, err; - /* tag_8021q.c assumes we are implementing this via port VLAN - * membership, which we aren't. So we don't need to add any VCAP filter - * for the CPU port. 
- */ - if (ocelot->ports[port]->is_dsa_8021q_cpu) - return 0; - untagging_rule = kzalloc(sizeof(struct ocelot_vcap_filter), GFP_KERNEL); if (!untagging_rule) return -ENOMEM; @@ -135,41 +148,7 @@ static int felix_tag_8021q_txvlan_add(struct felix *felix, int port, u16 vid, return 0; } -static int felix_tag_8021q_vlan_add(struct dsa_switch *ds, int port, u16 vid, - u16 flags) -{ - bool untagged = flags & BRIDGE_VLAN_INFO_UNTAGGED; - bool pvid = flags & BRIDGE_VLAN_INFO_PVID; - struct ocelot *ocelot = ds->priv; - - if (vid_is_dsa_8021q_rxvlan(vid)) - return felix_tag_8021q_rxvlan_add(ocelot_to_felix(ocelot), - port, vid, pvid, untagged); - - if (vid_is_dsa_8021q_txvlan(vid)) - return felix_tag_8021q_txvlan_add(ocelot_to_felix(ocelot), - port, vid, pvid, untagged); - - return 0; -} - -static int felix_tag_8021q_rxvlan_del(struct felix *felix, int port, u16 vid) -{ - struct ocelot_vcap_filter *outer_tagging_rule; - struct ocelot_vcap_block *block_vcap_es0; - struct ocelot *ocelot = &felix->ocelot; - - block_vcap_es0 = &ocelot->block[VCAP_ES0]; - - outer_tagging_rule = ocelot_vcap_block_find_filter_by_id(block_vcap_es0, - port, false); - if (!outer_tagging_rule) - return -ENOENT; - - return ocelot_vcap_filter_del(ocelot, outer_tagging_rule); -} - -static int felix_tag_8021q_txvlan_del(struct felix *felix, int port, u16 vid) +static int felix_tag_8021q_vlan_del_tx(struct felix *felix, int port, u16 vid) { struct ocelot_vcap_filter *untagging_rule, *redirect_rule; struct ocelot_vcap_block *block_vcap_is1; @@ -177,9 +156,6 @@ static int felix_tag_8021q_txvlan_del(struct felix *felix, int port, u16 vid) struct ocelot *ocelot = &felix->ocelot; int err; - if (ocelot->ports[port]->is_dsa_8021q_cpu) - return 0; - block_vcap_is1 = &ocelot->block[VCAP_IS1]; block_vcap_is2 = &ocelot->block[VCAP_IS2]; @@ -195,22 +171,54 @@ static int felix_tag_8021q_txvlan_del(struct felix *felix, int port, u16 vid) redirect_rule = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, port, false); if (!redirect_rule) - return 0; + return -ENOENT; return ocelot_vcap_filter_del(ocelot, redirect_rule); } +static int felix_tag_8021q_vlan_add(struct dsa_switch *ds, int port, u16 vid, + u16 flags) +{ + struct ocelot *ocelot = ds->priv; + int err; + + /* tag_8021q.c assumes we are implementing this via port VLAN + * membership, which we aren't. So we don't need to add any VCAP filter + * for the CPU port. 
+ */ + if (!dsa_is_user_port(ds, port)) + return 0; + + err = felix_tag_8021q_vlan_add_rx(ocelot_to_felix(ocelot), port, vid); + if (err) + return err; + + err = felix_tag_8021q_vlan_add_tx(ocelot_to_felix(ocelot), port, vid); + if (err) { + felix_tag_8021q_vlan_del_rx(ocelot_to_felix(ocelot), port, vid); + return err; + } + + return 0; +} + static int felix_tag_8021q_vlan_del(struct dsa_switch *ds, int port, u16 vid) { struct ocelot *ocelot = ds->priv; + int err; + + if (!dsa_is_user_port(ds, port)) + return 0; - if (vid_is_dsa_8021q_rxvlan(vid)) - return felix_tag_8021q_rxvlan_del(ocelot_to_felix(ocelot), - port, vid); + err = felix_tag_8021q_vlan_del_rx(ocelot_to_felix(ocelot), port, vid); + if (err) + return err; - if (vid_is_dsa_8021q_txvlan(vid)) - return felix_tag_8021q_txvlan_del(ocelot_to_felix(ocelot), - port, vid); + err = felix_tag_8021q_vlan_del_tx(ocelot_to_felix(ocelot), port, vid); + if (err) { + felix_tag_8021q_vlan_add_rx(ocelot_to_felix(ocelot), port, vid); + return err; + } return 0; } diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index ab196dc615da..abc67b97bfc4 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2509,7 +2509,7 @@ static int sja1105_bridge_vlan_add(struct dsa_switch *ds, int port, */ if (vid_is_dsa_8021q(vlan->vid)) { NL_SET_ERR_MSG_MOD(extack, - "Range 1024-3071 reserved for dsa_8021q operation"); + "Range 3072-4095 reserved for dsa_8021q operation"); return -EBUSY; } diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index 14e6dd7fb103..1eef60207c6b 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -302,7 +302,7 @@ static u16 sja1105_port_get_tag_8021q_vid(struct dsa_port *dp) unsigned long bridge_num; if (!dp->bridge) - return dsa_tag_8021q_rx_vid(dp); + return dsa_tag_8021q_standalone_vid(dp); bridge_num = dsa_port_bridge_num_get(dp); @@ -407,6 +407,7 @@ static int sja1105_init_virtual_links(struct sja1105_private *priv, vl_lookup[k].vlanid = rule->key.vl.vid; vl_lookup[k].vlanprior = rule->key.vl.pcp; } else { + /* FIXME */ struct dsa_port *dp = dsa_to_port(priv->ds, port); u16 vid = sja1105_port_get_tag_8021q_vid(dp); diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 92f5243b841e..b4e2862633f6 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -49,18 +49,12 @@ struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); -u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp); - -u16 dsa_tag_8021q_rx_vid(const struct dsa_port *dp); +u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); -bool vid_is_dsa_8021q_rxvlan(u16 vid); - -bool vid_is_dsa_8021q_txvlan(u16 vid); - bool vid_is_dsa_8021q(u16 vid); #endif /* _NET_DSA_8021Q_H */ diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 1cf245a6f18e..eac43f5b4e07 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -16,15 +16,11 @@ * * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | * +-----------+-----+-----------------+-----------+-----------------------+ - * | DIR | VBID| SWITCH_ID | VBID | PORT | + * | RSV | VBID| SWITCH_ID | VBID | PORT | * +-----------+-----+-----------------+-----------+-----------------------+ * - * DIR - VID[11:10]: - * Direction flags. 
- * * 1 (0b01) for RX VLAN, - * * 2 (0b10) for TX VLAN. - * These values make the special VIDs of 0, 1 and 4095 to be left - * unused by this coding scheme. + * RSV - VID[11:10]: + * Reserved. Must be set to 3 (0b11). * * SWITCH_ID - VID[8:6]: * Index of switch within DSA tree. Must be between 0 and 7. @@ -38,12 +34,11 @@ * Index of switch port. Must be between 0 and 15. */ -#define DSA_8021Q_DIR_SHIFT 10 -#define DSA_8021Q_DIR_MASK GENMASK(11, 10) -#define DSA_8021Q_DIR(x) (((x) << DSA_8021Q_DIR_SHIFT) & \ - DSA_8021Q_DIR_MASK) -#define DSA_8021Q_DIR_RX DSA_8021Q_DIR(1) -#define DSA_8021Q_DIR_TX DSA_8021Q_DIR(2) +#define DSA_8021Q_RSV_VAL 3 +#define DSA_8021Q_RSV_SHIFT 10 +#define DSA_8021Q_RSV_MASK GENMASK(11, 10) +#define DSA_8021Q_RSV ((DSA_8021Q_RSV_VAL << DSA_8021Q_RSV_SHIFT) & \ + DSA_8021Q_RSV_MASK) #define DSA_8021Q_SWITCH_ID_SHIFT 6 #define DSA_8021Q_SWITCH_ID_MASK GENMASK(8, 6) @@ -72,29 +67,19 @@ u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num) /* The VBID value of 0 is reserved for precise TX, but it is also * reserved/invalid for the bridge_num, so all is well. */ - return DSA_8021Q_DIR_TX | DSA_8021Q_VBID(bridge_num); + return DSA_8021Q_RSV | DSA_8021Q_VBID(bridge_num); } EXPORT_SYMBOL_GPL(dsa_8021q_bridge_tx_fwd_offload_vid); -/* Returns the VID to be inserted into the frame from xmit for switch steering - * instructions on egress. Encodes switch ID and port ID. - */ -u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp) -{ - return DSA_8021Q_DIR_TX | DSA_8021Q_SWITCH_ID(dp->ds->index) | - DSA_8021Q_PORT(dp->index); -} -EXPORT_SYMBOL_GPL(dsa_tag_8021q_tx_vid); - /* Returns the VID that will be installed as pvid for this switch port, sent as * tagged egress towards the CPU port and decoded by the rcv function. */ -u16 dsa_tag_8021q_rx_vid(const struct dsa_port *dp) +u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp) { - return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(dp->ds->index) | + return DSA_8021Q_RSV | DSA_8021Q_SWITCH_ID(dp->ds->index) | DSA_8021Q_PORT(dp->index); } -EXPORT_SYMBOL_GPL(dsa_tag_8021q_rx_vid); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_standalone_vid); /* Returns the decoded switch ID from the RX VID. 
*/ int dsa_8021q_rx_switch_id(u16 vid) @@ -119,21 +104,11 @@ static int dsa_tag_8021q_rx_vbid(u16 vid) return (vbid_hi << 2) | vbid_lo; } -bool vid_is_dsa_8021q_rxvlan(u16 vid) -{ - return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX; -} -EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_rxvlan); - -bool vid_is_dsa_8021q_txvlan(u16 vid) -{ - return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX; -} -EXPORT_SYMBOL_GPL(vid_is_dsa_8021q_txvlan); - bool vid_is_dsa_8021q(u16 vid) { - return vid_is_dsa_8021q_rxvlan(vid) || vid_is_dsa_8021q_txvlan(vid); + u16 rsv = (vid & DSA_8021Q_RSV_MASK) >> DSA_8021Q_RSV_SHIFT; + + return rsv == DSA_8021Q_RSV_VAL; } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); @@ -251,18 +226,8 @@ int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, u16 flags = 0; if (dsa_port_is_user(dp)) - flags |= BRIDGE_VLAN_INFO_UNTAGGED; - - /* Standalone VLANs are PVIDs */ - if (vid_is_dsa_8021q_rxvlan(info->vid) && - dsa_8021q_rx_switch_id(info->vid) == ds->index && - dsa_8021q_rx_source_port(info->vid) == dp->index) - flags |= BRIDGE_VLAN_INFO_PVID; - - /* And bridging VLANs are PVIDs too on user ports */ - if (dsa_tag_8021q_rx_vbid(info->vid) && - dsa_port_is_user(dp)) - flags |= BRIDGE_VLAN_INFO_PVID; + flags |= BRIDGE_VLAN_INFO_UNTAGGED | + BRIDGE_VLAN_INFO_PVID; err = dsa_port_do_tag_8021q_vlan_add(dp, info->vid, flags); @@ -294,52 +259,24 @@ int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, return 0; } -/* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single - * front-panel switch port (here swp0). +/* There are 2 ways of offloading tag_8021q VLANs. * - * Port identification through VLAN (802.1Q) tags has different requirements - * for it to work effectively: - * - On RX (ingress from network): each front-panel port must have a pvid - * that uniquely identifies it, and the egress of this pvid must be tagged - * towards the CPU port, so that software can recover the source port based - * on the VID in the frame. But this would only work for standalone ports; - * if bridged, this VLAN setup would break autonomous forwarding and would - * force all switched traffic to pass through the CPU. So we must also make - * the other front-panel ports members of this VID we're adding, albeit - * we're not making it their PVID (they'll still have their own). - * - On TX (ingress from CPU and towards network) we are faced with a problem. - * If we were to tag traffic (from within DSA) with the port's pvid, all - * would be well, assuming the switch ports were standalone. Frames would - * have no choice but to be directed towards the correct front-panel port. - * But because we also want the RX VLAN to not break bridging, then - * inevitably that means that we have to give them a choice (of what - * front-panel port to go out on), and therefore we cannot steer traffic - * based on the RX VID. So what we do is simply install one more VID on the - * front-panel and CPU ports, and profit off of the fact that steering will - * work just by virtue of the fact that there is only one other port that's - * a member of the VID we're tagging the traffic with - the desired one. + * One is to use a hardware TCAM to push the port's standalone VLAN into the + * frame when forwarding it to the CPU, as an egress modification rule on the + * CPU port. This is preferable because it has no side effects for the + * autonomous forwarding path, and accomplishes tag_8021q's primary goal of + * identifying the source port of each packet based on VLAN ID. 
* - * So at the end, each front-panel port will have one RX VID (also the PVID), - * the RX VID of all other front-panel ports that are in the same bridge, and - * one TX VID. Whereas the CPU port will have the RX and TX VIDs of all - * front-panel ports, and on top of that, is also tagged-input and - * tagged-output (VLAN trunk). + * The other is to commit the tag_8021q VLAN as a PVID to the VLAN table, and + * to configure the port as VLAN-unaware. This is less preferable because + * unique source port identification can only be done for standalone ports; + * under a VLAN-unaware bridge, all ports share the same tag_8021q VLAN as + * PVID, and under a VLAN-aware bridge, packets received by software will not + * have tag_8021q VLANs appended, just bridge VLANs. * - * CPU port CPU port - * +-------------+-----+-------------+ +-------------+-----+-------------+ - * | RX VID | | | | TX VID | | | - * | of swp0 | | | | of swp0 | | | - * | +-----+ | | +-----+ | - * | ^ T | | | Tagged | - * | | | | | ingress | - * | +-------+---+---+-------+ | | +-----------+ | - * | | | | | | | | Untagged | - * | | U v U v U v | | v egress | - * | +-----+ +-----+ +-----+ +-----+ | | +-----+ +-----+ +-----+ +-----+ | - * | | | | | | | | | | | | | | | | | | | | - * | |PVID | | | | | | | | | | | | | | | | | | - * +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+ - * swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3 + * For tag_8021q implementations of the second type, this method is used to + * replace the standalone tag_8021q VLAN of a port with the tag_8021q VLAN to + * be used for VLAN-unaware bridging. */ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, struct dsa_bridge bridge) @@ -351,7 +288,7 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, /* Delete the standalone VLAN of the port and replace it with a * bridging VLAN */ - standalone_vid = dsa_tag_8021q_rx_vid(dp); + standalone_vid = dsa_tag_8021q_standalone_vid(dp); bridge_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); err = dsa_port_tag_8021q_vlan_add(dp, bridge_vid, true); @@ -374,7 +311,7 @@ void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, /* Delete the bridging VLAN of the port and replace it with a * standalone VLAN */ - standalone_vid = dsa_tag_8021q_rx_vid(dp); + standalone_vid = dsa_tag_8021q_standalone_vid(dp); bridge_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); err = dsa_port_tag_8021q_vlan_add(dp, standalone_vid, false); @@ -388,13 +325,12 @@ void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, } EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_leave); -/* Set up a port's tag_8021q RX and TX VLAN for standalone mode operation */ +/* Set up a port's standalone tag_8021q VLAN */ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) { struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; struct dsa_port *dp = dsa_to_port(ds, port); - u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 vid = dsa_tag_8021q_standalone_vid(dp); struct net_device *master; int err; @@ -406,30 +342,16 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) master = dp->cpu_dp->master; - /* Add this user port's RX VID to the membership list of all others - * (including itself). This is so that bridging will not be hindered. - * L2 forwarding rules still take precedence when there are no VLAN - * restrictions, so there are no concerns about leaking traffic. 
- */ - err = dsa_port_tag_8021q_vlan_add(dp, rx_vid, false); + err = dsa_port_tag_8021q_vlan_add(dp, vid, false); if (err) { dev_err(ds->dev, - "Failed to apply RX VID %d to port %d: %pe\n", - rx_vid, port, ERR_PTR(err)); + "Failed to apply standalone VID %d to port %d: %pe\n", + vid, port, ERR_PTR(err)); return err; } - /* Add @rx_vid to the master's RX filter. */ - vlan_vid_add(master, ctx->proto, rx_vid); - - /* Finally apply the TX VID on this port and on the CPU port */ - err = dsa_port_tag_8021q_vlan_add(dp, tx_vid, false); - if (err) { - dev_err(ds->dev, - "Failed to apply TX VID %d on port %d: %pe\n", - tx_vid, port, ERR_PTR(err)); - return err; - } + /* Add the VLAN to the master's RX filter. */ + vlan_vid_add(master, ctx->proto, vid); return err; } @@ -438,8 +360,7 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port) { struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; struct dsa_port *dp = dsa_to_port(ds, port); - u16 rx_vid = dsa_tag_8021q_rx_vid(dp); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 vid = dsa_tag_8021q_standalone_vid(dp); struct net_device *master; /* The CPU port is implicitly configured by @@ -450,11 +371,9 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port) master = dp->cpu_dp->master; - dsa_port_tag_8021q_vlan_del(dp, rx_vid, false); - - vlan_vid_del(master, ctx->proto, rx_vid); + dsa_port_tag_8021q_vlan_del(dp, vid, false); - dsa_port_tag_8021q_vlan_del(dp, tx_vid, false); + vlan_vid_del(master, ctx->proto, vid); } static int dsa_tag_8021q_setup(struct dsa_switch *ds) diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 1144a87ad0db..37ccf00404ea 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -62,7 +62,7 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct dsa_port *dp = dsa_slave_to_port(netdev); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 tx_vid = dsa_tag_8021q_standalone_vid(dp); struct ethhdr *hdr = eth_hdr(skb); if (ocelot_ptp_rew_op(skb) || is_link_local_ether_addr(hdr->h_dest)) diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 9c5c00980b06..f3832ac54098 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -267,7 +267,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct dsa_port *dp = dsa_slave_to_port(netdev); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 tx_vid = dsa_tag_8021q_standalone_vid(dp); if (skb->offload_fwd_mark) return sja1105_imprecise_xmit(skb, netdev); @@ -295,7 +295,7 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb, struct dsa_port *dp = dsa_slave_to_port(netdev); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + u16 tx_vid = dsa_tag_8021q_standalone_vid(dp); __be32 *tx_trailer; __be16 *tx_header; int trailer_pos; -- cgit v1.2.3 From b6362bdf750b4ba266d4a10156174ec52460e73d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:21 +0200 Subject: net: dsa: tag_8021q: rename dsa_8021q_bridge_tx_fwd_offload_vid The dsa_8021q_bridge_tx_fwd_offload_vid is no longer used just for bridge TX forwarding offload, it is the private VLAN reserved for VLAN-unaware bridging in a way that is compatible with FDB isolation. So just rename it dsa_tag_8021q_bridge_vid. 
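With the rename, the bridge join path reads as a plain swap between the two VID helpers; a sketch abridged from dsa_tag_8021q_bridge_join(), error handling elided:

	u16 standalone_vid = dsa_tag_8021q_standalone_vid(dp);
	u16 bridge_vid = dsa_tag_8021q_bridge_vid(bridge.num);

	/* Replace the port's standalone VLAN with the per-bridge one */
	dsa_port_tag_8021q_vlan_add(dp, bridge_vid, true);
	dsa_port_tag_8021q_vlan_del(dp, standalone_vid, false);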
Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_vl.c | 2 +- include/linux/dsa/8021q.h | 2 +- net/dsa/tag_8021q.c | 8 ++++---- net/dsa/tag_sja1105.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/dsa/sja1105/sja1105_vl.c b/drivers/net/dsa/sja1105/sja1105_vl.c index 1eef60207c6b..b7e95d60a6e4 100644 --- a/drivers/net/dsa/sja1105/sja1105_vl.c +++ b/drivers/net/dsa/sja1105/sja1105_vl.c @@ -306,7 +306,7 @@ static u16 sja1105_port_get_tag_8021q_vid(struct dsa_port *dp) bridge_num = dsa_port_bridge_num_get(dp); - return dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); + return dsa_tag_8021q_bridge_vid(bridge_num); } static int sja1105_init_virtual_links(struct sja1105_private *priv, diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index b4e2862633f6..3ed117e299ec 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -47,7 +47,7 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, int vbid); -u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); +u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num); u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index eac43f5b4e07..a786569203f0 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -62,14 +62,14 @@ #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ DSA_8021Q_PORT_MASK) -u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num) +u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num) { /* The VBID value of 0 is reserved for precise TX, but it is also * reserved/invalid for the bridge_num, so all is well. */ return DSA_8021Q_RSV | DSA_8021Q_VBID(bridge_num); } -EXPORT_SYMBOL_GPL(dsa_8021q_bridge_tx_fwd_offload_vid); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_vid); /* Returns the VID that will be installed as pvid for this switch port, sent as * tagged egress towards the CPU port and decoded by the rcv function. @@ -289,7 +289,7 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, * bridging VLAN */ standalone_vid = dsa_tag_8021q_standalone_vid(dp); - bridge_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); + bridge_vid = dsa_tag_8021q_bridge_vid(bridge.num); err = dsa_port_tag_8021q_vlan_add(dp, bridge_vid, true); if (err) @@ -312,7 +312,7 @@ void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, * standalone VLAN */ standalone_vid = dsa_tag_8021q_standalone_vid(dp); - bridge_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge.num); + bridge_vid = dsa_tag_8021q_bridge_vid(bridge.num); err = dsa_port_tag_8021q_vlan_add(dp, standalone_vid, false); if (err) { diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index f3832ac54098..83e4136516b0 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -226,7 +226,7 @@ static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb, * TX VLAN that targets the bridge's entire broadcast domain, * instead of just the specific port. 
*/ - tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); + tx_vid = dsa_tag_8021q_bridge_vid(bridge_num); return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), tx_vid); } -- cgit v1.2.3 From a5081bad2eac5108593ac36b6551201f0df9e897 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 26 Feb 2022 14:56:22 +0000 Subject: net: phylink: remove phylink_set_pcs() As all users of phylink_set_pcs() have now been updated to use the mac_select_pcs() method, it can be removed from the phylink kernel API and its functionality moved into phylink_major_config(). Removing phylink_set_pcs() gives us a single approach for attaching a PCS within phylink. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 44 ++++++++++++-------------------------------- include/linux/phylink.h | 1 - 2 files changed, 12 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 28aa23533107..8d1cd2b9ba5f 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -815,8 +815,18 @@ static void phylink_major_config(struct phylink *pl, bool restart, /* If we have a new PCS, switch to the new PCS after preparing the MAC * for the change. */ - if (pcs) - phylink_set_pcs(pl, pcs); + if (pcs) { + pl->pcs = pcs; + pl->pcs_ops = pcs->ops; + + if (!pl->phylink_disable_state && + pl->cfg_link_an_mode == MLO_AN_INBAND) { + if (pcs->poll) + mod_timer(&pl->link_poll, jiffies + HZ); + else + del_timer(&pl->link_poll); + } + } phylink_mac_config(pl, state); @@ -1286,36 +1296,6 @@ struct phylink *phylink_create(struct phylink_config *config, } EXPORT_SYMBOL_GPL(phylink_create); -/** - * phylink_set_pcs() - set the current PCS for phylink to use - * @pl: a pointer to a &struct phylink returned from phylink_create() - * @pcs: a pointer to the &struct phylink_pcs - * - * Bind the MAC PCS to phylink. This may be called after phylink_create(). - * If it is desired to dynamically change the PCS, then the preferred method - * is to use mac_select_pcs(), but it may also be called in mac_prepare() - * or mac_config(). - * - * Please note that there are behavioural changes with the mac_config() - * callback if a PCS is present (denoting a newer setup) so removing a PCS - * is not supported, and if a PCS is going to be used, it must be registered - * by calling phylink_set_pcs() at the latest in the first mac_config() call. 
- */ -void phylink_set_pcs(struct phylink *pl, struct phylink_pcs *pcs) -{ - pl->pcs = pcs; - pl->pcs_ops = pcs->ops; - - if (!pl->phylink_disable_state && - pl->cfg_link_an_mode == MLO_AN_INBAND) { - if (pcs->poll) - mod_timer(&pl->link_poll, jiffies + HZ); - else - del_timer(&pl->link_poll); - } -} -EXPORT_SYMBOL_GPL(phylink_set_pcs); - /** * phylink_destroy() - cleanup and destroy the phylink instance * @pl: a pointer to a &struct phylink returned from phylink_create() diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 9ef9b7047f19..223781622b33 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -532,7 +532,6 @@ void phylink_generic_validate(struct phylink_config *config, struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *, phy_interface_t iface, const struct phylink_mac_ops *mac_ops); -void phylink_set_pcs(struct phylink *, struct phylink_pcs *pcs); void phylink_destroy(struct phylink *); int phylink_connect_phy(struct phylink *, struct phy_device *); -- cgit v1.2.3 From bf08824a0f4776fc0626b82b6924fa1a5643eacb Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Mon, 28 Feb 2022 20:58:56 +0100 Subject: flow_dissector: Add support for HSR Network drivers such as igb or igc call eth_get_headlen() to determine the header length for the skbs they are about to construct in the receive path. When running HSR on top of these drivers, this triggers the BUG_ON() in skb_pull(). The reason is that the skb headlen is not sufficient for HSR to work correctly; skb_pull() notices that. For instance, eth_get_headlen() returns 14 bytes for TCP traffic over HSR, which is not correct. The problem is that the flow dissection code does not take HSR into account. Therefore, add support for it. Reported-by: Anthony Harivel Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220228195856.88187-1-kurt@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/if_hsr.h | 16 ++++++++++++++++ net/core/flow_dissector.c | 17 +++++++++++++++++ net/hsr/hsr_main.h | 16 ---------------- 3 files changed, 33 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_hsr.h b/include/linux/if_hsr.h index 38bbc537d4e4..408539d5ea5f 100644 --- a/include/linux/if_hsr.h +++ b/include/linux/if_hsr.h @@ -9,6 +9,22 @@ enum hsr_version { PRP_V1, }; +/* HSR Tag. + * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB, + * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest, + * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr, + * encapsulated protocol } instead. + * + * Field names as defined in the IEC:2010 standard for HSR.
+ */ +struct hsr_tag { + __be16 path_and_LSDU_size; + __be16 sequence_nr; + __be16 encap_proto; +} __packed; + +#define HSR_HLEN 6 + #if IS_ENABLED(CONFIG_HSR) extern bool is_hsr_master(struct net_device *dev); extern int hsr_get_version(struct net_device *dev, enum hsr_version *ver); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 15833e1d6ea1..34441a32e3be 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1282,6 +1283,22 @@ proto_again: break; } + case htons(ETH_P_HSR): { + struct hsr_tag *hdr, _hdr; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + + proto = hdr->encap_proto; + nhoff += HSR_HLEN; + fdret = FLOW_DISSECT_RET_PROTO_AGAIN; + break; + } + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index ca556bda3467..b158ba409f9a 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -45,22 +45,6 @@ /* PRP V1 life redundancy box MAC address */ #define PRP_TLV_REDBOX_MAC 30 -/* HSR Tag. - * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB, - * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest, - * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr, - * encapsulated protocol } instead. - * - * Field names as defined in the IEC:2010 standard for HSR. - */ -struct hsr_tag { - __be16 path_and_LSDU_size; - __be16 sequence_nr; - __be16 encap_proto; -} __packed; - -#define HSR_HLEN 6 - #define HSR_V1_SUP_LSDUSIZE 52 #define HSR_HSIZE_SHIFT 8 -- cgit v1.2.3 From 9309f97aef6d8250bb484dabeac925c3a7c57716 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:20 +0200 Subject: net: dev: Add hardware stats support Offloading switch device drivers may be able to collect statistics of the traffic taking place in the HW datapath that pertains to a certain soft netdevice, such as VLAN. Add the necessary infrastructure to allow exposing these statistics to the offloaded netdevice in question. The API was shaped by the following considerations: - Collection of HW statistics is not free: there may be a finite number of counters, and the act of counting may have a performance impact. It is therefore necessary to allow toggling whether HW counting should be done for any particular SW netdevice. - As the drivers are loaded and removed, a particular device may get offloaded and unoffloaded again. At the same time, the statistics values need to stay monotonic (modulo the eventual 64-bit wraparound), increasing only to reflect traffic measured in the device. To that end, the netdevice keeps around a lazily-allocated copy of struct rtnl_link_stats64. Device drivers then contribute to the values kept therein at various points. Even as the driver goes away, the struct stays around to maintain the statistics values. - Different HW devices may be able to count different things. The motivation behind this patch in particular is exposure of HW counters on Nvidia Spectrum switches, where the only practical approach to counting traffic on offloaded soft netdevices currently is to use router interface counters, and count L3 traffic. Correspondingly that is the statistics suite added in this patch. Other devices may be able to measure different kinds of traffic, and for that reason, the APIs are built to allow uniform access to different statistics suites. 
- Because soft netdevices and offloading drivers are only loosely bound, a netdevice uses a notifier chain to communicate with the drivers. Several new notifiers, NETDEV_OFFLOAD_XSTATS_*, have been added to carry messages to the offloading drivers. - Devices can have various conditions for when a particular counter is available. As the device is configured and reconfigured, the device offload may become or cease being suitable for counter binding. A netdevice can use a notifier type NETDEV_OFFLOAD_XSTATS_REPORT_USED to ping offloading drivers and determine whether anyone currently implements a given statistics suite. This information can then be propagated to user space. When the driver decides to unoffload a netdevice, it can use a newly-added function, netdev_offload_xstats_report_delta(), to record outstanding collected statistics, before destroying the HW counter. This patch adds a helper, call_netdevice_notifiers_info_robust(), for dispatching a notifier with the possibility of unwind when one of the consumers bails. Given the wish to eventually get rid of the global notifier block altogether, this helper only invokes the per-netns notifier block. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 42 +++++++ include/uapi/linux/if_link.h | 15 +++ net/core/dev.c | 267 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 323 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c79ee2296296..19a27ac361ef 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1950,6 +1950,7 @@ enum netdev_ml_priv_type { * @watchdog_dev_tracker: refcount tracker used by watchdog. * @dev_registered_tracker: tracker for reference held while * registered + * @offload_xstats_l3: L3 HW stats for this netdevice. * * FIXME: cleanup struct net_device such that network protocol info * moves out. 
@@ -2287,6 +2288,7 @@ struct net_device { netdevice_tracker linkwatch_dev_tracker; netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; + struct rtnl_hw_stats64 *offload_xstats_l3; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2727,6 +2729,10 @@ enum netdev_cmd { NETDEV_CVLAN_FILTER_DROP_INFO, NETDEV_SVLAN_FILTER_PUSH_INFO, NETDEV_SVLAN_FILTER_DROP_INFO, + NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + NETDEV_OFFLOAD_XSTATS_REPORT_USED, + NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); @@ -2777,6 +2783,42 @@ struct netdev_notifier_pre_changeaddr_info { const unsigned char *dev_addr; }; +enum netdev_offload_xstats_type { + NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1, +}; + +struct netdev_notifier_offload_xstats_info { + struct netdev_notifier_info info; /* must be first */ + enum netdev_offload_xstats_type type; + + union { + /* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */ + struct netdev_notifier_offload_xstats_rd *report_delta; + /* NETDEV_OFFLOAD_XSTATS_REPORT_USED */ + struct netdev_notifier_offload_xstats_ru *report_used; + }; +}; + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack); +int netdev_offload_xstats_disable(struct net_device *dev, + enum netdev_offload_xstats_type type); +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type); +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *stats, bool *used, + struct netlink_ext_ack *extack); +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd, + const struct rtnl_hw_stats64 *stats); +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru); +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *stats); + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4d62ea6e1288..ef6a62a2e15d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -245,6 +245,21 @@ struct rtnl_link_stats64 { __u64 rx_nohandler; }; +/* Subset of link stats useful for in-HW collection. Meaning of the fields is as + * for struct rtnl_link_stats64. 
+ */ +struct rtnl_hw_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; +}; + /* The struct should be in sync with struct ifmap */ struct rtnl_link_ifmap { __u64 mem_start; diff --git a/net/core/dev.c b/net/core/dev.c index 2d6771075720..c9e54e5ad48d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1622,7 +1622,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) - N(PRE_CHANGEADDR) + N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) + N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA) } #undef N return "UNKNOWN_NETDEV_EVENT"; @@ -1939,6 +1940,32 @@ static int call_netdevice_notifiers_info(unsigned long val, return raw_notifier_call_chain(&netdev_chain, val, info); } +/** + * call_netdevice_notifiers_info_robust - call per-netns notifier blocks + * for and rollback on error + * @val_up: value passed unmodified to notifier function + * @val_down: value passed unmodified to the notifier function when + * recovering from an error on @val_up + * @info: notifier information data + * + * Call all per-netns network notifier blocks, but not notifier blocks on + * the global notifier chain. Parameters and return value are as for + * raw_notifier_call_chain_robust(). + */ + +static int +call_netdevice_notifiers_info_robust(unsigned long val_up, + unsigned long val_down, + struct netdev_notifier_info *info) +{ + struct net *net = dev_net(info->dev); + + ASSERT_RTNL(); + + return raw_notifier_call_chain_robust(&net->netdev_chain, + val_up, val_down, info); +} + static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack) @@ -7728,6 +7755,242 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +static int netdev_offload_xstats_enable_l3(struct net_device *dev, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + int err; + int rc; + + dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3), + GFP_KERNEL); + if (!dev->offload_xstats_l3) + return -ENOMEM; + + rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + err = notifier_to_errno(rc); + if (err) + goto free_stats; + + return 0; + +free_stats: + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; + return err; +} + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return netdev_offload_xstats_enable_l3(dev, extack); + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_enable); + +static void netdev_offload_xstats_disable_l3(struct net_device *dev) +{ + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3, + }; + + call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE, + &info.info); + kfree(dev->offload_xstats_l3); + dev->offload_xstats_l3 = NULL; +} + +int netdev_offload_xstats_disable(struct 
net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + if (!netdev_offload_xstats_enabled(dev, type)) + return -EALREADY; + + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + netdev_offload_xstats_disable_l3(dev); + return 0; + } + + WARN_ON(1); + return -EINVAL; +} +EXPORT_SYMBOL(netdev_offload_xstats_disable); + +static void netdev_offload_xstats_disable_all(struct net_device *dev) +{ + netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3); +} + +static struct rtnl_hw_stats64 * +netdev_offload_xstats_get_ptr(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + switch (type) { + case NETDEV_OFFLOAD_XSTATS_TYPE_L3: + return dev->offload_xstats_l3; + } + + WARN_ON(1); + return NULL; +} + +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type) +{ + ASSERT_RTNL(); + + return netdev_offload_xstats_get_ptr(dev, type); +} +EXPORT_SYMBOL(netdev_offload_xstats_enabled); + +struct netdev_notifier_offload_xstats_ru { + bool used; +}; + +struct netdev_notifier_offload_xstats_rd { + struct rtnl_hw_stats64 stats; + bool used; +}; + +static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest, + const struct rtnl_hw_stats64 *src) +{ + dest->rx_packets += src->rx_packets; + dest->tx_packets += src->tx_packets; + dest->rx_bytes += src->rx_bytes; + dest->tx_bytes += src->tx_bytes; + dest->rx_errors += src->rx_errors; + dest->tx_errors += src->tx_errors; + dest->rx_dropped += src->rx_dropped; + dest->tx_dropped += src->tx_dropped; + dest->multicast += src->multicast; +} + +static int netdev_offload_xstats_get_used(struct net_device *dev, + enum netdev_offload_xstats_type type, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_ru report_used = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_used = &report_used, + }; + int rc; + + WARN_ON(!netdev_offload_xstats_enabled(dev, type)); + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED, + &info.info); + *p_used = report_used.used; + return notifier_to_errno(rc); +} + +static int netdev_offload_xstats_get_stats(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, + bool *p_used, + struct netlink_ext_ack *extack) +{ + struct netdev_notifier_offload_xstats_rd report_delta = {}; + struct netdev_notifier_offload_xstats_info info = { + .info.dev = dev, + .info.extack = extack, + .type = type, + .report_delta = &report_delta, + }; + struct rtnl_hw_stats64 *stats; + int rc; + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return -EINVAL; + + rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, + &info.info); + + /* Cache whatever we got, even if there was an error, otherwise the + * successful stats retrievals would get lost. 
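+ * (Editor's note: deltas that drivers already reported, and typically
+ * zeroed in HW, are folded into the cached counters here, so an error
+ * from one consumer does not lose the other drivers' contributions.)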
+ */ + netdev_hw_stats64_add(stats, &report_delta.stats); + + if (p_stats) + *p_stats = *stats; + *p_used = report_delta.used; + + return notifier_to_errno(rc); +} + +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *p_stats, bool *p_used, + struct netlink_ext_ack *extack) +{ + ASSERT_RTNL(); + + if (p_stats) + return netdev_offload_xstats_get_stats(dev, type, p_stats, + p_used, extack); + else + return netdev_offload_xstats_get_used(dev, type, p_used, + extack); +} +EXPORT_SYMBOL(netdev_offload_xstats_get); + +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta, + const struct rtnl_hw_stats64 *stats) +{ + report_delta->used = true; + netdev_hw_stats64_add(&report_delta->stats, stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_report_delta); + +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used) +{ + report_used->used = true; +} +EXPORT_SYMBOL(netdev_offload_xstats_report_used); + +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *p_stats) +{ + struct rtnl_hw_stats64 *stats; + + ASSERT_RTNL(); + + stats = netdev_offload_xstats_get_ptr(dev, type); + if (WARN_ON(!stats)) + return; + + netdev_hw_stats64_add(stats, p_stats); +} +EXPORT_SYMBOL(netdev_offload_xstats_push_delta); + /** * netdev_get_xmit_slave - Get the xmit slave of master device * @dev: device @@ -10417,6 +10680,8 @@ void unregister_netdevice_many(struct list_head *head) dev_xdp_uninstall(dev); + netdev_offload_xstats_disable_all(dev); + /* Notify protocols, that we are about to destroy * this device. They should clean all the things. */ -- cgit v1.2.3 From 5fd0b838efac16046509f7fb100455d0463b9687 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:23 +0200 Subject: net: rtnetlink: Add UAPI toggle for IFLA_OFFLOAD_XSTATS_L3_STATS The offloaded HW stats are designed to allow per-netdevice enablement and disablement. Add an attribute, IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, which should be carried by the RTM_SETSTATS message, and expresses a desire to toggle L3 offload xstats on or off. As part of the above, add an exported function rtnl_offload_xstats_notify() that drivers can use when they have installed or deinstalled the counters backing the HW stats. At this point, it is possible to enable, disable and query L3 offload xstats on netdevices. (However there is no driver actually implementing these.) Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- include/linux/rtnetlink.h | 3 ++ include/uapi/linux/if_link.h | 1 + include/uapi/linux/rtnetlink.h | 2 ++ net/core/rtnetlink.c | 75 ++++++++++++++++++++++++++++++++---------- 4 files changed, 64 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index bb9cb84114c1..7f970b16da3a 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -134,4 +134,7 @@ extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)); + +extern void rtnl_offload_xstats_notify(struct net_device *dev); + #endif /* __LINUX_RTNETLINK_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b1031f481d2f..ddca20357e7e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1227,6 +1227,7 @@ enum { IFLA_STATS_GET_FILTERS, /* Nest of IFLA_STATS_LINK_xxx, each a u32 with * a filter mask for the corresponding group. */ + IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, /* 0 or 1 as u8 */ __IFLA_STATS_GETSET_MAX, }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 14462dc159fd..51530aade46e 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -767,6 +767,8 @@ enum rtnetlink_groups { #define RTNLGRP_MCTP_IFADDR RTNLGRP_MCTP_IFADDR RTNLGRP_TUNNEL, #define RTNLGRP_TUNNEL RTNLGRP_TUNNEL + RTNLGRP_STATS, +#define RTNLGRP_STATS RTNLGRP_STATS __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d09354514355..a66b6761b88b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -5566,6 +5566,7 @@ rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = { static const struct nla_policy ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = { + [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters, @@ -5773,16 +5774,51 @@ out: return skb->len; } +void rtnl_offload_xstats_notify(struct net_device *dev) +{ + struct rtnl_stats_dump_filters response_filters = {}; + struct net *net = dev_net(dev); + int idxattr = 0, prividx = 0; + struct sk_buff *skb; + int err = -ENOBUFS; + + ASSERT_RTNL(); + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); + + skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), + GFP_KERNEL); + if (!skb) + goto errout; + + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0, + &response_filters, &idxattr, &prividx, NULL); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_STATS, err); +} +EXPORT_SYMBOL(rtnl_offload_xstats_notify); + static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3; struct rtnl_stats_dump_filters response_filters = {}; struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; - int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; - struct sk_buff *nskb; + bool notify = false; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), @@ -5814,24 +5850,29 @@ static int 
rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - nskb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters), - GFP_KERNEL); - if (!nskb) - return -ENOBUFS; + if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) { + u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]); - err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, - NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - 0, &response_filters, &idxattr, &prividx, - extack); - if (err < 0) { - /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(nskb); - } else { - err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); + if (req) + err = netdev_offload_xstats_enable(dev, t_l3, extack); + else + err = netdev_offload_xstats_disable(dev, t_l3); + + if (!err) + notify = true; + else if (err != -EALREADY) + return err; + + response_filters.mask[0] |= + IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS); + response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |= + IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO); } - return err; + if (notify) + rtnl_offload_xstats_notify(dev); + + return 0; } /* Process one rtnetlink message. */ -- cgit v1.2.3 From a1ac9c8acec1605c6b43af418f79facafdced680 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:25 -0800 Subject: net: Add skb->mono_delivery_time to distinguish mono delivery_time from (rcv) timestamp skb->tstamp was first used as the (rcv) timestamp. The major usage is to report it to the user (e.g. SO_TIMESTAMP). Later, skb->tstamp is also set as the (future) delivery_time (e.g. EDT in TCP) during egress and used by the qdisc (e.g. sch_fq) to decide when the skb can be passed to the dev. Currently, there is no way to tell whether skb->tstamp carries the (rcv) timestamp or the delivery_time, so it is always reset to 0 whenever forwarded between egress and ingress. While it makes sense to always clear the (rcv) timestamp in skb->tstamp to avoid confusing sch_fq that expects the delivery_time, it is a performance issue [0] to clear the delivery_time if the skb finally egresses to a fq@phy-dev. For example, when forwarding from egress to ingress and then finally back to egress:

    tcp-sender => veth@netns => veth@hostns => fq@eth0@hostns
                             ^              ^
                             reset          rest

This patch adds one bit skb->mono_delivery_time to flag that skb->tstamp is storing the mono delivery_time (EDT) instead of the (rcv) timestamp. The current use case is to keep the TCP mono delivery_time (EDT) and to be used with sch_fq. A later patch will also allow tc-bpf@ingress to read and change the mono delivery_time. In the future, another bit (e.g. skb->user_delivery_time) can be added for the SCM_TXTIME where the clock base is tracked by sk->sk_clockid. [ This patch is a prep work. The following patches will get the other parts of the stack ready first. Then another patch after that will finally set the skb->mono_delivery_time. ] A skb_set_delivery_time() function is added. It is used by tcp_output.c and during ip[6] fragmentation to assign the delivery_time to the skb->tstamp and also set the skb->mono_delivery_time. A note on the change in ip_send_unicast_reply() in ip_output.c. It is only used by TCP to send reset/ack out of a ctl_sk. Like the new skb_set_delivery_time(), this patch sets the skb->mono_delivery_time to 0 for now as a placeholder. It will be enabled in a later patch. A similar case in tcp_ipv6 can be done with skb_set_delivery_time() in tcp_v6_send_response().
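Before the reference, an editor's sketch (not part of the patch) of the calling convention the new helper establishes; both snippets are condensed from hunks shown further below, where the original skb's clock base must travel with the timestamp to every fragment:

	/* Egress (tcp_output.c): stamp a future mono delivery time (EDT)
	 * that sch_fq will honor. */
	skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);

	/* ip[6] fragmentation: capture the flag before the headers are
	 * rebuilt, then re-apply it to each fragment. */
	bool mono_delivery_time = skb->mono_delivery_time;
	...
	skb_set_delivery_time(skb2, tstamp, mono_delivery_time);

The numbers behind the performance concern are in the reference that follows.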
[0] (slide 22): https://linuxplumbersconf.org/event/11/contributions/953/attachments/867/1658/LPC_2021_BPF_Datapath_Extensions.pdf Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 13 +++++++++++++ net/bridge/netfilter/nf_conntrack_bridge.c | 5 +++-- net/ipv4/ip_output.c | 7 +++++-- net/ipv4/tcp_output.c | 16 +++++++++------- net/ipv6/ip6_output.c | 5 +++-- net/ipv6/netfilter.c | 5 +++-- net/ipv6/tcp_ipv6.c | 2 +- 7 files changed, 37 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d67941f78b92..803ffa63dea6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -795,6 +795,10 @@ typedef unsigned char *sk_buff_data_t; * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required + * @mono_delivery_time: When set, skb->tstamp has the + * delivery_time in mono clock base (i.e. EDT). Otherwise, the + * skb->tstamp has the (rcv) timestamp at ingress and + * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @secmark: security marking @@ -965,6 +969,7 @@ struct sk_buff { __u8 decrypted:1; #endif __u8 slow_gro:1; + __u8 mono_delivery_time:1; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -3983,6 +3988,14 @@ static inline ktime_t net_timedelta(ktime_t t) return ktime_sub(ktime_get_real(), t); } +static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, + bool mono) +{ + skb->tstamp = kt; + /* Setting mono_delivery_time will be enabled later */ + skb->mono_delivery_time = 0; +} + static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index fdbed3158555..ebfb2a5c59e4 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -32,6 +32,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + bool mono_delivery_time = skb->mono_delivery_time; unsigned int hlen, ll_rs, mtu; ktime_t tstamp = skb->tstamp; struct ip_frag_state state; @@ -81,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, if (iter.frag) ip_fraglist_prepare(skb, &iter); - skb->tstamp = tstamp; + skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -112,7 +113,7 @@ slow_path: goto blackhole; } - skb2->tstamp = tstamp; + skb_set_delivery_time(skb2, tstamp, mono_delivery_time); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6df3545c891d..a9588e0c82c5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -761,6 +761,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, { struct iphdr *iph; struct sk_buff *skb2; + bool mono_delivery_time = skb->mono_delivery_time; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; @@ -852,7 +853,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } } - skb->tstamp = tstamp; + skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = output(net, sk, skb); if (!err) @@ -908,7 +909,7 @@ slow_path: /* * Put this fragment into the 
sending queue. */ - skb2->tstamp = tstamp; + skb_set_delivery_time(skb2, tstamp, mono_delivery_time); err = output(net, sk, skb2); if (err) goto fail; @@ -1727,6 +1728,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; + /* Setting mono_delivery_time will be enabled later */ + nskb->mono_delivery_time = 0; ip_push_pending_frames(sk, &fl4); } out: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e76bf1e9251e..2319531267c6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1253,7 +1253,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); if (clone_it) { oskb = skb; @@ -1589,7 +1589,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, skb_split(skb, buff, len); - buff->tstamp = skb->tstamp; + skb_set_delivery_time(buff, skb->tstamp, true); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); @@ -2616,7 +2616,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ - skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; + tp->tcp_wstamp_ns = tp->tcp_clock_cache; + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ @@ -3541,11 +3542,12 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, now = tcp_clock_ns(); #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) - skb->skb_mstamp_ns = cookie_init_timestamp(req, now); + skb_set_delivery_time(skb, cookie_init_timestamp(req, now), + true); else #endif { - skb->skb_mstamp_ns = now; + skb_set_delivery_time(skb, now, true); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3594,7 +3596,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); - skb->skb_mstamp_ns = now; + skb_set_delivery_time(skb, now, true); tcp_add_tx_delay(skb, tp); return skb; @@ -3771,7 +3773,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - syn->skb_mstamp_ns = syn_data->skb_mstamp_ns; + skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c5edc86b18bd..dad4e3d0492e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -813,6 +813,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 
inet6_sk(skb->sk) : NULL; + bool mono_delivery_time = skb->mono_delivery_time; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; ktime_t tstamp = skb->tstamp; @@ -903,7 +904,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb->tstamp = tstamp; + skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -962,7 +963,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - frag->tstamp = tstamp; + skb_set_delivery_time(frag, tstamp, mono_delivery_time); err = output(net, sk, frag); if (err) goto fail; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 6ab710b5a1a8..1da332450d98 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -121,6 +121,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; + bool mono_delivery_time = skb->mono_delivery_time; ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; @@ -186,7 +187,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb->tstamp = tstamp; + skb_set_delivery_time(skb, tstamp, mono_delivery_time); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -219,7 +220,7 @@ slow_path: goto blackhole; } - skb2->tstamp = tstamp; + skb_set_delivery_time(skb2, tstamp, mono_delivery_time); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e98af869ff3a..cb2bb7d2e907 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -940,7 +940,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 } else { mark = sk->sk_mark; } - buff->tstamp = tcp_transmit_time(sk); + skb_set_delivery_time(buff, tcp_transmit_time(sk), true); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; -- cgit v1.2.3 From de799101519aad23c6096041ba2744d7b5517e6a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:31 -0800 Subject: net: Add skb_clear_tstamp() to keep the mono delivery_time Right now, skb->tstamp is reset to 0 whenever the skb is forwarded. If skb->tstamp has the mono delivery_time, clearing it can hurt the performance when it finally transmits out to fq@phy-dev. The earlier patch added a skb->mono_delivery_time bit to flag the skb->tstamp carrying the mono delivery_time. This patch adds skb_clear_tstamp() helper which keeps the mono delivery_time and clears everything else. The delivery_time clearing will be postponed until the stack knows the skb will be delivered locally. It will be done in a latter patch. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- drivers/net/loopback.c | 2 +- include/linux/skbuff.h | 10 +++++++++- net/bridge/br_forward.c | 2 +- net/core/filter.c | 6 +++--- net/core/skbuff.c | 2 +- net/ipv4/ip_forward.c | 2 +- net/ipv6/ip6_output.c | 2 +- net/netfilter/ipvs/ip_vs_xmit.c | 6 +++--- net/netfilter/nf_dup_netdev.c | 2 +- net/netfilter/nf_flow_table_ip.c | 4 ++-- net/netfilter/nft_fwd_netdev.c | 2 +- net/openvswitch/vport.c | 2 +- net/xfrm/xfrm_interface.c | 2 +- 13 files changed, 26 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index d05f86fe78c9..720394c0639b 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -74,7 +74,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb, skb_tx_timestamp(skb); /* do not fool net_timestamp_check() with various clock bases */ - skb->tstamp = 0; + skb_clear_tstamp(skb); skb_orphan(skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 803ffa63dea6..27a28920e7b3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3996,6 +3996,14 @@ static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, skb->mono_delivery_time = 0; } +static inline void skb_clear_tstamp(struct sk_buff *skb) +{ + if (skb->mono_delivery_time) + return; + + skb->tstamp = 0; +} + static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; @@ -4852,7 +4860,7 @@ static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) - skb->tstamp = 0; + skb_clear_tstamp(skb); #endif } diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index ec646656dbf1..02bb620d3b8d 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -62,7 +62,7 @@ EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit); int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb->tstamp = 0; + skb_clear_tstamp(skb); return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING, net, sk, skb, NULL, skb->dev, br_dev_queue_push_xmit); diff --git a/net/core/filter.c b/net/core/filter.c index 65869fd510e8..cfcf9b4d1ec2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2107,7 +2107,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); @@ -2176,7 +2176,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); @@ -2274,7 +2274,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, } skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { skb = skb_expand_head(skb, hh_len); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b32c5d782fe1..9abb0028309f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5381,7 +5381,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) ipvs_reset(skb); skb->mark = 0; - skb->tstamp = 0; + skb_clear_tstamp(skb); } EXPORT_SYMBOL_GPL(skb_scrub_packet); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 00ec819f949b..92ba3350274b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -79,7 +79,7 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s if 
(unlikely(opt->optlen)) ip_forward_options(skb); - skb->tstamp = 0; + skb_clear_tstamp(skb); return dst_output(net, sk, skb); } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index dad4e3d0492e..50db9b20d746 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -440,7 +440,7 @@ static inline int ip6_forward_finish(struct net *net, struct sock *sk, } #endif - skb->tstamp = 0; + skb_clear_tstamp(skb); return dst_output(net, sk, skb); } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index d2e5a8f644b8..029171379884 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -610,7 +610,7 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, nf_reset_ct(skb); skb_forward_csum(skb); if (skb->dev) - skb->tstamp = 0; + skb_clear_tstamp(skb); } return ret; } @@ -652,7 +652,7 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, if (!local) { skb_forward_csum(skb); if (skb->dev) - skb->tstamp = 0; + skb_clear_tstamp(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -674,7 +674,7 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); if (skb->dev) - skb->tstamp = 0; + skb_clear_tstamp(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a579e59ee5c5..7873bd1389c3 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -19,7 +19,7 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev) skb_push(skb, skb->mac_len); skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); dev_queue_xmit(skb); } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 889cf88d3dba..f1d387129f02 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -376,7 +376,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, nf_flow_nat_ip(flow, skb, thoff, dir, iph); ip_decrease_ttl(iph); - skb->tstamp = 0; + skb_clear_tstamp(skb); if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); @@ -611,7 +611,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, nf_flow_nat_ipv6(flow, skb, dir, ip6h); ip6h->hop_limit--; - skb->tstamp = 0; + skb_clear_tstamp(skb); if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 619e394a91de..08e7a289738e 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -145,7 +145,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr, return; skb->dev = dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); neigh_xmit(neigh_table, dev, addr, skb); out: regs->verdict.code = verdict; diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index cf2ce5812489..82a74f998966 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -507,7 +507,7 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) } skb->dev = vport->dev; - skb->tstamp = 0; + skb_clear_tstamp(skb); vport->ops->send(skb); return; diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 57448fc519fc..4991e99ced9a 100644 --- a/net/xfrm/xfrm_interface.c +++ 
b/net/xfrm/xfrm_interface.c @@ -190,7 +190,7 @@ static void xfrmi_dev_uninit(struct net_device *dev) static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { - skb->tstamp = 0; + skb_clear_tstamp(skb); skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; -- cgit v1.2.3 From 27942a15209f564ed8ee2a9e126cb7b105181355 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:38 -0800 Subject: net: Handle delivery_time in skb->tstamp during network tapping with af_packet A later patch will set the skb->mono_delivery_time to flag that skb->tstamp is used as the mono delivery_time (EDT) instead of the (rcv) timestamp. skb_clear_tstamp() will then keep this delivery_time during forwarding. This patch makes network tapping (with af_packet) handle the delivery_time stored in skb->tstamp. Regardless of tapping at the ingress or egress, the tapped skb is received by the af_packet socket, so it is ingress to the af_packet socket and it expects the (rcv) timestamp. When tapping at egress, dev_queue_xmit_nit() is used. It already expects that skb->tstamp may carry the delivery_time, so it does skb_clone()+net_timestamp_set() to ensure the cloned skb has the (rcv) timestamp before passing to the af_packet sk. This patch only adds clearing of the skb->mono_delivery_time bit in net_timestamp_set(). When tapping at ingress, the code currently expects skb->tstamp to be either 0 or the (rcv) timestamp. That is, the ingress tapping path already copes with skb->tstamp being 0 and will get the (rcv) timestamp by ktime_get_real() when needed. There are two cases for tapping at ingress: One case is af_packet queues the skb to its sk_receive_queue. The skb is either not shared, or a new clone is created. The newly added skb_clear_delivery_time() is called to clear the delivery_time (if any) and set the (rcv) timestamp if needed before the skb is queued to the sk_receive_queue. In the other case, the ingress skb is directly copied to the rx_ring and tpacket_get_timestamp() is used to get the (rcv) timestamp. The newly added skb_tstamp() is used in tpacket_get_timestamp() to check the skb->mono_delivery_time bit before returning skb->tstamp. As mentioned earlier, tapping@ingress already expects that the skb may not have the (rcv) timestamp (because no sk has asked for it) and handles this case by directly calling ktime_get_real(). Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 24 ++++++++++++++++++++++++ net/core/dev.c | 4 +++- net/packet/af_packet.c | 4 +++- 3 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27a28920e7b3..7e2d796ece80 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3996,6 +3996,22 @@ static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, skb->mono_delivery_time = 0; } +DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); + +/* It is used in the ingress path to clear the delivery_time. + * If needed, set the skb->tstamp to the (rcv) timestamp.
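+ * (Editor's note: contrast with skb_clear_tstamp() just below, which
+ * keeps a mono delivery_time for the forwarding path; this helper drops
+ * it because the skb is about to be received by a local socket.)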
+ */ +static inline void skb_clear_delivery_time(struct sk_buff *skb) +{ + if (skb->mono_delivery_time) { + skb->mono_delivery_time = 0; + if (static_branch_unlikely(&netstamp_needed_key)) + skb->tstamp = ktime_get_real(); + else + skb->tstamp = 0; + } +} + static inline void skb_clear_tstamp(struct sk_buff *skb) { if (skb->mono_delivery_time) @@ -4004,6 +4020,14 @@ static inline void skb_clear_tstamp(struct sk_buff *skb) skb->tstamp = 0; } +static inline ktime_t skb_tstamp(const struct sk_buff *skb) +{ + if (skb->mono_delivery_time) + return 0; + + return skb->tstamp; +} + static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; diff --git a/net/core/dev.c b/net/core/dev.c index c9e54e5ad48d..e128f26711eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2047,7 +2047,8 @@ void net_dec_egress_queue(void) EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif -static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); +EXPORT_SYMBOL(netstamp_needed_key); #ifdef CONFIG_JUMP_LABEL static atomic_t netstamp_needed_deferred; static atomic_t netstamp_wanted; @@ -2108,6 +2109,7 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; + skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) __net_timestamp(skb); } diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ab87f22cc7ec..1b93ce1a5600 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -460,7 +460,7 @@ static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, return TP_STATUS_TS_RAW_HARDWARE; if ((flags & SOF_TIMESTAMPING_SOFTWARE) && - ktime_to_timespec64_cond(skb->tstamp, ts)) + ktime_to_timespec64_cond(skb_tstamp(skb), ts)) return TP_STATUS_TS_SOFTWARE; return 0; @@ -2199,6 +2199,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; sock_skb_set_dropcount(sk, skb); + skb_clear_delivery_time(skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); @@ -2377,6 +2378,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; + skb_clear_delivery_time(copy_skb); __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); -- cgit v1.2.3 From d93376f503c7a586707925957592c0f16f4db0b1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:44 -0800 Subject: net: Clear mono_delivery_time bit in __skb_tstamp_tx() In __skb_tstamp_tx(), it may clone the egress skb and queues the clone to the sk_error_queue. The outgoing skb may have the mono delivery_time while the (rcv) timestamp is expected for the clone, so the skb->mono_delivery_time bit needs to be cleared from the clone. This patch adds the skb->mono_delivery_time clearing to the existing __net_timestamp() and use it in __skb_tstamp_tx(). The __net_timestamp() fast path usage in dev.c is changed to directly call ktime_get_real() since the mono_delivery_time bit is not set at that point. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 + net/core/dev.c | 4 ++-- net/core/skbuff.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7e2d796ece80..8e8a4af4f9e2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3981,6 +3981,7 @@ static inline void skb_get_new_timestampns(const struct sk_buff *skb, static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); + skb->mono_delivery_time = 0; } static inline ktime_t net_timedelta(ktime_t t) diff --git a/net/core/dev.c b/net/core/dev.c index e128f26711eb..5db2443c2371 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2111,13 +2111,13 @@ static inline void net_timestamp_set(struct sk_buff *skb) skb->tstamp = 0; skb->mono_delivery_time = 0; if (static_branch_unlikely(&netstamp_needed_key)) - __net_timestamp(skb); + skb->tstamp = ktime_get_real(); } #define net_timestamp_check(COND, SKB) \ if (static_branch_unlikely(&netstamp_needed_key)) { \ if ((COND) && !(SKB)->tstamp) \ - __net_timestamp(SKB); \ + (SKB)->tstamp = ktime_get_real(); \ } \ bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9abb0028309f..e5082836295b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4851,7 +4851,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (hwtstamps) *skb_hwtstamps(skb) = *hwtstamps; else - skb->tstamp = ktime_get_real(); + __net_timestamp(skb); __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats); } -- cgit v1.2.3 From d98d58a002619b5c165f1eedcd731e2fe2c19088 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:50 -0800 Subject: net: Set skb->mono_delivery_time and clear it after sch_handle_ingress() The previous patches handled the delivery_time before sch_handle_ingress(). This patch can now set the skb->mono_delivery_time to flag the skb->tstamp is used as the mono delivery_time (EDT) instead of the (rcv) timestamp and also clear it with skb_clear_delivery_time() after sch_handle_ingress(). This will make the bpf_redirect_*() to keep the mono delivery_time and used by a qdisc (fq) of the egress-ing interface. A latter patch will postpone the skb_clear_delivery_time() until the stack learns that the skb is being delivered locally and that will make other kernel forwarding paths (ip[6]_forward) able to keep the delivery_time also. Thus, like the previous patches on using the skb->mono_delivery_time bit, calling skb_clear_delivery_time() is not limited within the CONFIG_NET_INGRESS to avoid too many code churns among this set. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 3 +-- net/core/dev.c | 8 ++++++-- net/ipv4/ip_output.c | 3 +-- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8e8a4af4f9e2..0f5fd53059cd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3993,8 +3993,7 @@ static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, bool mono) { skb->tstamp = kt; - /* Setting mono_delivery_time will be enabled later */ - skb->mono_delivery_time = 0; + skb->mono_delivery_time = kt && mono; } DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); diff --git a/net/core/dev.c b/net/core/dev.c index 5db2443c2371..4b572bbbd07e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5220,8 +5220,10 @@ another_round: goto out; } - if (skb_skip_tc_classify(skb)) + if (skb_skip_tc_classify(skb)) { + skb_clear_delivery_time(skb); goto skip_classify; + } if (pfmemalloc) goto skip_taps; @@ -5250,12 +5252,14 @@ skip_taps: goto another_round; if (!skb) goto out; + skb_clear_delivery_time(skb); nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; - } + } else #endif + skb_clear_delivery_time(skb); skb_reset_redirect(skb); skip_classify: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a9588e0c82c5..00b4bf26fd93 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1728,8 +1728,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - /* Setting mono_delivery_time will be enabled later */ - nskb->mono_delivery_time = 0; + nskb->mono_delivery_time = !!transmit_time; ip_push_pending_frames(sk, &fl4); } out: -- cgit v1.2.3 From b6561f8491ca899e5a08311796085c9738d631ae Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:56:09 -0800 Subject: net: ipv6: Get rcv timestamp if needed when handling hop-by-hop IOAM option IOAM is a hop-by-hop option with a temporary IANA allocation (49). Since it is hop-by-hop, it is done before the input routing decision. One of the traced data fields is the (rcv) timestamp. When the locally generated skb is looping from egress to ingress over a virtual interface (e.g. veth, loopback...), skb->tstamp may have the delivery time before it is known that it will be delivered locally and received by another sk. Like handling the network tapping (tcpdump) in the earlier patch, this patch gets the timestamp if needed without overwriting the delivery_time in the skb->tstamp. skb_tstamp_cond() is added to do the ktime_get_real() with an extra cond arg to check on top of the netstamp_needed_key static key. skb_tstamp_cond() will also be used in a later patch and it needs the netstamp_needed_key check. Signed-off-by: Martin KaFai Lau Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 11 +++++++++++ net/ipv6/ioam6.c | 19 +++++++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0f5fd53059cd..4b5b926a81f2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4028,6 +4028,17 @@ static inline ktime_t skb_tstamp(const struct sk_buff *skb) return skb->tstamp; } +static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) +{ + if (!skb->mono_delivery_time && skb->tstamp) + return skb->tstamp; + + if (static_branch_unlikely(&netstamp_needed_key) || cond) + return ktime_get_real(); + + return 0; +} + static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; } diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index e159eb4328a8..1098131ed90c 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -635,7 +635,8 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_schema *sc, u8 sclen, bool is_input) { - struct __kernel_sock_timeval ts; + struct timespec64 ts; + ktime_t tstamp; u64 raw64; u32 raw32; u16 raw16; @@ -680,10 +681,9 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (!skb->dev) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { - if (!skb->tstamp) - __net_timestamp(skb); + tstamp = skb_tstamp_cond(skb, true); + ts = ktime_to_timespec64(tstamp); - skb_get_new_timestamp(skb, &ts); *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec); } data += sizeof(__be32); @@ -694,13 +694,12 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (!skb->dev) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { - if (!skb->tstamp) - __net_timestamp(skb); + if (!trace->type.bit2) { + tstamp = skb_tstamp_cond(skb, true); + ts = ktime_to_timespec64(tstamp); + } - if (!trace->type.bit2) - skb_get_new_timestamp(skb, &ts); - - *(__be32 *)data = cpu_to_be32((u32)ts.tv_usec); + *(__be32 *)data = cpu_to_be32((u32)(ts.tv_nsec / NSEC_PER_USEC)); } data += sizeof(__be32); } -- cgit v1.2.3 From 7449197d600d30d038b1ce6285ab4747096eac7f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:56:28 -0800 Subject: bpf: Keep the (rcv) timestamp behavior for the existing tc-bpf@ingress The current tc-bpf@ingress reads and writes the __sk_buff->tstamp as a (rcv) timestamp which currently could either be 0 (not available) or ktime_get_real(). This patch keeps backward compatibility with the (rcv) timestamp expectation at ingress. If the skb->tstamp has the delivery_time, the bpf insn rewrite will read 0 for tc-bpf running at ingress as it is not available. When writing at ingress, it will also clear the skb->mono_delivery_time bit.

/* BPF_READ: a = __sk_buff->tstamp */
if (!skb->tc_at_ingress || !skb->mono_delivery_time)
	a = skb->tstamp;
else
	a = 0;

/* BPF_WRITE: __sk_buff->tstamp = a */
if (skb->tc_at_ingress)
	skb->mono_delivery_time = 0;
skb->tstamp = a;

[ A note on the BPF_CGROUP_INET_INGRESS which can also access skb->tstamp. At that point, the skb is delivered locally and skb_clear_delivery_time() has already been done, so the skb->tstamp will only have the (rcv) timestamp. ] If the tc-bpf@egress writes 0 to skb->tstamp, the skb->mono_delivery_time has to be cleared also. It could be done together during convert_ctx_access(). However, a later patch will also expose the skb->mono_delivery_time bit as __sk_buff->delivery_time_type. Changing the delivery_time_type in the background may surprise the user, e.g.
the 2nd read on __sk_buff->delivery_time_type may need a READ_ONCE() to avoid compiler optimization. Thus, in expecting the needs in the latter patch, this patch does a check on !skb->tstamp after running the tc-bpf and clears the skb->mono_delivery_time bit if needed. The earlier discussion on v4 [0]. The bpf insn rewrite requires the skb's mono_delivery_time bit and tc_at_ingress bit. They are moved up in sk_buff so that bpf rewrite can be done at a fixed offset. tc_skip_classify is moved together with tc_at_ingress. To get one bit for mono_delivery_time, csum_not_inet is moved down and this bit is currently used by sctp. [0]: https://lore.kernel.org/bpf/20220217015043.khqwqklx45c4m4se@kafai-mbp.dhcp.thefacebook.com/ Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++----- net/core/filter.c | 71 +++++++++++++++++++++++++++++++++++++++++++------- net/sched/act_bpf.c | 2 ++ net/sched/cls_bpf.c | 2 ++ 4 files changed, 77 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4b5b926a81f2..5445860e1ba6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -941,8 +941,12 @@ struct sk_buff { __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */ __u8 csum_complete_sw:1; __u8 csum_level:2; - __u8 csum_not_inet:1; __u8 dst_pending_confirm:1; + __u8 mono_delivery_time:1; +#ifdef CONFIG_NET_CLS_ACT + __u8 tc_skip_classify:1; + __u8 tc_at_ingress:1; +#endif #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif @@ -953,10 +957,6 @@ struct sk_buff { #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; -#endif -#ifdef CONFIG_NET_CLS_ACT - __u8 tc_skip_classify:1; - __u8 tc_at_ingress:1; #endif __u8 redirected:1; #ifdef CONFIG_NET_REDIRECT @@ -969,7 +969,7 @@ struct sk_buff { __u8 decrypted:1; #endif __u8 slow_gro:1; - __u8 mono_delivery_time:1; + __u8 csum_not_inet:1; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -1047,10 +1047,16 @@ struct sk_buff { /* if you move pkt_vlan_present around you also must adapt these constants */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_VLAN_PRESENT_BIT 7 +#define TC_AT_INGRESS_MASK (1 << 0) +#define SKB_MONO_DELIVERY_TIME_MASK (1 << 2) #else #define PKT_VLAN_PRESENT_BIT 0 +#define TC_AT_INGRESS_MASK (1 << 7) +#define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif #define PKT_VLAN_PRESENT_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) +#define TC_AT_INGRESS_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) +#define SKB_MONO_DELIVERY_TIME_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) #ifdef __KERNEL__ /* diff --git a/net/core/filter.c b/net/core/filter.c index cfcf9b4d1ec2..5072733743e9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8859,6 +8859,65 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, return insn; } +static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + +#ifdef CONFIG_NET_CLS_ACT + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5); + /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp, + * so check the skb->mono_delivery_time. 
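+ * (Editor's note: when the bit is set there is no (rcv) timestamp to
+ * give, so the rewritten sequence loads 0 rather than leaking the EDT
+ * value to the ingress program.)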
+ */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */ + *insn++ = BPF_MOV64_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); +#endif + + *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + +static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->src_reg; + __u8 skb_reg = si->dst_reg; + +#ifdef CONFIG_NET_CLS_ACT + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3); + /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp. + * Clear the skb->mono_delivery_time. + */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + ~SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); +#endif + + /* skb->tstamp = tstamp */ + *insn++ = BPF_STX_MEM(BPF_DW, skb_reg, value_reg, + offsetof(struct sk_buff, tstamp)); + return insn; +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -9167,17 +9226,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) - *insn++ = BPF_STX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); + insn = bpf_convert_tstamp_write(si, insn); else - *insn++ = BPF_LDX_MEM(BPF_DW, - si->dst_reg, si->src_reg, - bpf_target_off(struct sk_buff, - tstamp, 8, - target_size)); + insn = bpf_convert_tstamp_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index a77d8908e737..fea2d78b9ddc 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -53,6 +53,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(filter, skb); } + if (unlikely(!skb->tstamp && skb->mono_delivery_time)) + skb->mono_delivery_time = 0; if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index df19a847829e..c85b85a192bf 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -102,6 +102,8 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); } + if (unlikely(!skb->tstamp && skb->mono_delivery_time)) + skb->mono_delivery_time = 0; if (prog->exts_integrated) { res->class = 0; -- cgit v1.2.3 From 8d21ec0e46ed6e39994accff8eb4f2be3d2e76b5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:56:34 -0800 Subject: bpf: Add __sk_buff->delivery_time_type and bpf_skb_set_skb_delivery_time() * __sk_buff->delivery_time_type: This patch adds __sk_buff->delivery_time_type. It tells if the delivery_time is stored in __sk_buff->tstamp or not. It will be most useful for ingress to tell if the __sk_buff->tstamp has the (rcv) timestamp or delivery_time. 
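An editor's sketch (not part of the patch) of the intended tc-bpf@ingress usage, combining the new field with the new helper before a redirect. PHY_IFINDEX, the program and section names are illustrative, the helper prototypes are assumed to come from headers generated against this kernel's UAPI, and per the helper's documentation below the skb must be IPv4 or IPv6:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define PHY_IFINDEX 2			/* hypothetical egress device */

SEC("tc")
int promote_to_edt(struct __sk_buff *skb)
{
	/* A forwarded mono EDT reads as BPF_SKB_DELIVERY_TIME_MONO;
	 * a plain (rcv) timestamp (or 0) reads as _NONE at ingress. */
	if (skb->delivery_time_type == BPF_SKB_DELIVERY_TIME_NONE)
		/* Stamp "send now" as a mono EDT so sch_fq on the
		 * egress device can consume it after the redirect. */
		bpf_skb_set_delivery_time(skb, bpf_ktime_get_ns(),
					  BPF_SKB_DELIVERY_TIME_MONO);

	return bpf_redirect(PHY_IFINDEX, 0);
}

char LICENSE[] SEC("license") = "GPL";

Returning to the field's semantics: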
If delivery_time_type is 0 (BPF_SKB_DELIVERY_TIME_NONE), it has the (rcv) timestamp. Two non-zero types are defined for the delivery_time_type, BPF_SKB_DELIVERY_TIME_MONO and BPF_SKB_DELIVERY_TIME_UNSPEC. For UNSPEC, it can only happen in egress because only mono delivery_time can be forwarded to ingress now. The clock of UNSPEC delivery_time can be deduced from the skb->sk->sk_clockid which is how the sch_etf doing it also. * Provide forwarded delivery_time to tc-bpf@ingress: With the help of the new delivery_time_type, the tc-bpf has a way to tell if the __sk_buff->tstamp has the (rcv) timestamp or the delivery_time. During bpf load time, the verifier will learn if the bpf prog has accessed the new __sk_buff->delivery_time_type. If it does, it means the tc-bpf@ingress is expecting the skb->tstamp could have the delivery_time. The kernel will then read the skb->tstamp as-is during bpf insn rewrite without checking the skb->mono_delivery_time. This is done by adding a new prog->delivery_time_access bit. The same goes for writing skb->tstamp. * bpf_skb_set_delivery_time(): The bpf_skb_set_delivery_time() helper is added to allow setting both delivery_time and the delivery_time_type at the same time. If the tc-bpf does not need to change the delivery_time_type, it can directly write to the __sk_buff->tstamp as the existing tc-bpf has already been doing. It will be most useful at ingress to change the __sk_buff->tstamp from the (rcv) timestamp to a mono delivery_time and then bpf_redirect_*(). bpf only has mono clock helper (bpf_ktime_get_ns), and the current known use case is the mono EDT for fq, and only mono delivery time can be kept during forward now, so bpf_skb_set_delivery_time() only supports setting BPF_SKB_DELIVERY_TIME_MONO. It can be extended later when use cases come up and the forwarding path also supports other clock bases. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/filter.h | 3 +- include/uapi/linux/bpf.h | 41 +++++++++- net/core/filter.c | 169 ++++++++++++++++++++++++++++++++--------- tools/include/uapi/linux/bpf.h | 41 +++++++++- 4 files changed, 216 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1cb1af917617..9bf26307247f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -572,7 +572,8 @@ struct bpf_prog { has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ - call_get_func_ip:1; /* Do we call get_func_ip() */ + call_get_func_ip:1, /* Do we call get_func_ip() */ + delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index afe3d0d7f5f2..4eebea830613 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5086,6 +5086,37 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. + * + * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * Description + * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also + * change the __sk_buff->delivery_time_type to *dtime_type*. 
+ * + * When setting a delivery time (non zero *dtime*) to + * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* + * is supported. It is the only delivery_time_type that will be + * kept after bpf_redirect_*(). + * + * If there is no need to change the __sk_buff->delivery_time_type, + * the delivery time can be directly written to __sk_buff->tstamp + * instead. + * + * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE + * can be used to clear any delivery time stored in + * __sk_buff->tstamp. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5280,6 +5311,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ + FN(skb_set_delivery_time), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5469,6 +5501,12 @@ union { \ __u64 :64; \ } __attribute__((aligned(8))) +enum { + BPF_SKB_DELIVERY_TIME_NONE, + BPF_SKB_DELIVERY_TIME_UNSPEC, + BPF_SKB_DELIVERY_TIME_MONO, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -5509,7 +5547,8 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u32 :32; /* Padding, future use. */ + __u8 delivery_time_type; + __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; diff --git a/net/core/filter.c b/net/core/filter.c index 5072733743e9..88767f7da150 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7388,6 +7388,43 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb, + u64, dtime, u32, dtime_type) +{ + /* skb_clear_delivery_time() is done for inet protocol */ + if (skb->protocol != htons(ETH_P_IP) && + skb->protocol != htons(ETH_P_IPV6)) + return -EOPNOTSUPP; + + switch (dtime_type) { + case BPF_SKB_DELIVERY_TIME_MONO: + if (!dtime) + return -EINVAL; + skb->tstamp = dtime; + skb->mono_delivery_time = 1; + break; + case BPF_SKB_DELIVERY_TIME_NONE: + if (dtime) + return -EINVAL; + skb->tstamp = 0; + skb->mono_delivery_time = 0; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = { + .func = bpf_skb_set_delivery_time, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -7749,6 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; + case BPF_FUNC_skb_set_delivery_time: + return &bpf_skb_set_delivery_time_proto; #endif default: return bpf_sk_base_func_proto(func_id); @@ -8088,7 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; - case offsetofend(struct __sk_buff, gso_size) ... 
offsetof(struct __sk_buff, hwtstamp) - 1: + case offsetof(struct __sk_buff, delivery_time_type): + return false; + case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: @@ -8443,6 +8484,15 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; + case offsetof(struct __sk_buff, delivery_time_type): + /* The convert_ctx_access() on reading and writing + * __sk_buff->tstamp depends on whether the bpf prog + * has used __sk_buff->delivery_time_type or not. + * Thus, we need to set prog->delivery_time_access + * earlier during is_valid_access() here. + */ + ((struct bpf_prog *)prog)->delivery_time_access = 1; + return size == sizeof(__u8); } return bpf_skb_is_valid_access(off, size, type, prog, info); @@ -8838,6 +8888,45 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + __u8 value_reg = si->dst_reg; + __u8 skb_reg = si->src_reg; + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* value_reg = BPF_SKB_DELIVERY_TIME_MONO */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO); + *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5); + + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg, + offsetof(struct sk_buff, tstamp)); + *insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2); + /* value_reg = BPF_SKB_DELIVERY_TIME_NONE */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE); + *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1); + +#ifdef CONFIG_NET_CLS_ACT + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* At ingress, value_reg = 0 */ + *insn++ = BPF_MOV32_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); +#endif + + /* value_reg = BPF_SKB_DELIVERY_TIME_UNSPEC */ + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC); + + /* 15 insns with CONFIG_NET_CLS_ACT */ + return insn; +} + static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, struct bpf_insn *insn) { @@ -8859,29 +8948,32 @@ static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, return insn; } -static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si, +static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, + const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_CLS_ACT - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5); - /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp, - * so check the skb->mono_delivery_time.
- */ - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); - /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */ - *insn++ = BPF_MOV64_IMM(value_reg, 0); - *insn++ = BPF_JMP_A(1); + if (!prog->delivery_time_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5); + /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp, + * so check the skb->mono_delivery_time. + */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); + /* skb->mono_delivery_time is set, read 0 as the (rcv) timestamp. */ + *insn++ = BPF_MOV64_IMM(value_reg, 0); + *insn++ = BPF_JMP_A(1); + } #endif *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg, @@ -8889,27 +8981,30 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_insn *si, return insn; } -static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_insn *si, +static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, + const struct bpf_insn *si, struct bpf_insn *insn) { __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_CLS_ACT - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3); - /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp. - * Clear the skb->mono_delivery_time. - */ - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - ~SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); + if (!prog->delivery_time_access) { + __u8 tmp_reg = BPF_REG_AX; + + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); + *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3); + /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp. + * Clear the skb->mono_delivery_time. 
+ */ + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, + ~SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, + SKB_MONO_DELIVERY_TIME_OFFSET); + } #endif /* skb->tstamp = tstamp */ @@ -9226,9 +9321,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8); if (type == BPF_WRITE) - insn = bpf_convert_tstamp_write(si, insn); + insn = bpf_convert_tstamp_write(prog, si, insn); else - insn = bpf_convert_tstamp_read(si, insn); + insn = bpf_convert_tstamp_read(prog, si, insn); + break; + + case offsetof(struct __sk_buff, delivery_time_type): + insn = bpf_convert_dtime_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index afe3d0d7f5f2..4eebea830613 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5086,6 +5086,37 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. + * + * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * Description + * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also + * change the __sk_buff->delivery_time_type to *dtime_type*. + * + * When setting a delivery time (non zero *dtime*) to + * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* + * is supported. It is the only delivery_time_type that will be + * kept after bpf_redirect_*(). + * + * If there is no need to change the __sk_buff->delivery_time_type, + * the delivery time can be directly written to __sk_buff->tstamp + * instead. + * + * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE + * can be used to clear any delivery time stored in + * __sk_buff->tstamp. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5280,6 +5311,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ + FN(skb_set_delivery_time), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5469,6 +5501,12 @@ union { \ __u64 :64; \ } __attribute__((aligned(8))) +enum { + BPF_SKB_DELIVERY_TIME_NONE, + BPF_SKB_DELIVERY_TIME_UNSPEC, + BPF_SKB_DELIVERY_TIME_MONO, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -5509,7 +5547,8 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u32 :32; /* Padding, future use. */ + __u8 delivery_time_type; + __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; -- cgit v1.2.3 From 98b4d7a4e7374a44c4afd9f08330e72f6ad0d644 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:40 +0800 Subject: net: dev: use kfree_skb_reason() for sch_handle_egress() Replace kfree_skb() used in sch_handle_egress() with kfree_skb_reason(). The drop reason SKB_DROP_REASON_TC_EGRESS is introduced. 
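As a rough sketch of how such reasons surface to userspace, a BPF program attached to the existing skb:kfree_skb tracepoint could count drops per reason; the context layout below is assumed from the tracepoint's format file and should be verified against /sys/kernel/debug/tracing/events/skb/kfree_skb/format:

	/* Illustrative only: per-reason drop counter on skb:kfree_skb. */
	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct kfree_skb_ctx {
		__u64 pad;		/* common tracepoint header */
		void *skbaddr;
		void *location;
		__u16 protocol;
		int reason;		/* enum skb_drop_reason */
	};

	struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__uint(max_entries, 64);
		__type(key, int);
		__type(value, __u64);
	} drop_counts SEC(".maps");

	SEC("tracepoint/skb/kfree_skb")
	int count_drop(struct kfree_skb_ctx *ctx)
	{
		int reason = ctx->reason;
		__u64 init = 1, *cnt;

		cnt = bpf_map_lookup_elem(&drop_counts, &reason);
		if (cnt)
			__sync_fetch_and_add(cnt, 1);
		else
			bpf_map_update_elem(&drop_counts, &reason, &init, BPF_ANY);
		return 0;
	}

	char _license[] SEC("license") = "GPL";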
Considering the code path of tc egress, we make it distinct from the drop reason SKB_DROP_REASON_QDISC_DROP in the next commit. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + net/core/dev.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5445860e1ba6..1ffe64616741 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -394,6 +394,7 @@ enum skb_drop_reason { * entry is full */ SKB_DROP_REASON_NEIGH_DEAD, /* neigh entry is dead */ + SKB_DROP_REASON_TC_EGRESS, /* dropped in TC egress HOOK */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 1977f301260d..53755e8191a1 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -45,6 +45,7 @@ EM(SKB_DROP_REASON_NEIGH_FAILED, NEIGH_FAILED) \ EM(SKB_DROP_REASON_NEIGH_QUEUEFULL, NEIGH_QUEUEFULL) \ EM(SKB_DROP_REASON_NEIGH_DEAD, NEIGH_DEAD) \ + EM(SKB_DROP_REASON_TC_EGRESS, TC_EGRESS) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/core/dev.c b/net/core/dev.c index 96bcc003e018..ef0b1992cf13 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3889,7 +3889,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); *ret = NET_XMIT_DROP; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: -- cgit v1.2.3 From 215b0f1963d4e34fccac6992b3debe26f78a6eb8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:41 +0800 Subject: net: skb: introduce the function kfree_skb_list_reason() To report reasons of skb drops, introduce the function kfree_skb_list_reason() and make kfree_skb_list() an inline call to it. This function will be used in the next commit in __dev_xmit_skb(). Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 +++++++- net/core/skbuff.c | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1ffe64616741..1f2de153755c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1202,10 +1202,16 @@ static inline void kfree_skb(struct sk_buff *skb) } void skb_release_head_state(struct sk_buff *skb); -void kfree_skb_list(struct sk_buff *segs); +void kfree_skb_list_reason(struct sk_buff *segs, + enum skb_drop_reason reason); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); +static inline void kfree_skb_list(struct sk_buff *segs) +{ + kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED); +} + #ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); #else diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 23f3ba343661..10bde7c6db44 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -777,16 +777,17 @@ void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) } EXPORT_SYMBOL(kfree_skb_reason); -void kfree_skb_list(struct sk_buff *segs) +void kfree_skb_list_reason(struct sk_buff *segs, + enum skb_drop_reason reason) { while (segs) { struct sk_buff *next = segs->next; - kfree_skb(segs); + kfree_skb_reason(segs, reason); segs = next; } } -EXPORT_SYMBOL(kfree_skb_list); +EXPORT_SYMBOL(kfree_skb_list_reason); /* Dump skb information and contents.
* -- cgit v1.2.3 From 7faef0547f4c29031a68d058918b031a8e520d49 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:42 +0800 Subject: net: dev: add skb drop reasons to __dev_xmit_skb() Add reasons for skb drops to __dev_xmit_skb() by replacing kfree_skb_list() with kfree_skb_list_reason(). The drop reason SKB_DROP_REASON_QDISC_DROP is introduced for qdisc enqueue failures. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ include/trace/events/skb.h | 1 + net/core/dev.c | 5 +++-- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1f2de153755c..769d54ba2f3b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -395,6 +395,10 @@ enum skb_drop_reason { */ SKB_DROP_REASON_NEIGH_DEAD, /* neigh entry is dead */ SKB_DROP_REASON_TC_EGRESS, /* dropped in TC egress HOOK */ + SKB_DROP_REASON_QDISC_DROP, /* dropped by qdisc when packet + * outputting (failed to enqueue to + * current qdisc) + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 53755e8191a1..dbf3e2e3c1b4 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -46,6 +46,7 @@ EM(SKB_DROP_REASON_NEIGH_QUEUEFULL, NEIGH_QUEUEFULL) \ EM(SKB_DROP_REASON_NEIGH_DEAD, NEIGH_DEAD) \ EM(SKB_DROP_REASON_TC_EGRESS, TC_EGRESS) \ + EM(SKB_DROP_REASON_QDISC_DROP, QDISC_DROP) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/core/dev.c b/net/core/dev.c index ef0b1992cf13..e2c107643ee5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3759,7 +3759,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, no_lock_out: if (unlikely(to_free)) - kfree_skb_list(to_free); + kfree_skb_list_reason(to_free, + SKB_DROP_REASON_QDISC_DROP); return rc; } @@ -3814,7 +3815,7 @@ no_lock_out: } spin_unlock(root_lock); if (unlikely(to_free)) - kfree_skb_list(to_free); + kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP); if (unlikely(contended)) spin_unlock(&q->busylock); return rc; -- cgit v1.2.3 From 44f0bd40803c0e04f1c8cd59df3c7acce783ae9c Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:43 +0800 Subject: net: dev: use kfree_skb_reason() for enqueue_to_backlog() Replace kfree_skb() used in enqueue_to_backlog() with kfree_skb_reason(). The skb drop reason SKB_DROP_REASON_CPU_BACKLOG is introduced for the case of failing to enqueue the skb to the per CPU backlog queue. The further reason can be backlog queue full or RPS flow limitation, and I think we needn't make further distinctions. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ include/trace/events/skb.h | 1 + net/core/dev.c | 5 ++++- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 769d54ba2f3b..44ecbfff2b69 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -399,6 +399,12 @@ enum skb_drop_reason { * outputting (failed to enqueue to * current qdisc) */ + SKB_DROP_REASON_CPU_BACKLOG, /* failed to enqueue the skb to + * the per CPU backlog queue.
This + * can be caused by backlog queue + * full (see netdev_max_backlog in + * net.rst) or RPS flow limit + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index dbf3e2e3c1b4..3bb90ca893ae 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -47,6 +47,7 @@ EM(SKB_DROP_REASON_NEIGH_DEAD, NEIGH_DEAD) \ EM(SKB_DROP_REASON_TC_EGRESS, TC_EGRESS) \ EM(SKB_DROP_REASON_QDISC_DROP, QDISC_DROP) \ + EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/core/dev.c b/net/core/dev.c index e2c107643ee5..460ac941a631 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4570,10 +4570,12 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) static int enqueue_to_backlog(struct sk_buff *skb, int cpu, unsigned int *qtail) { + enum skb_drop_reason reason; struct softnet_data *sd; unsigned long flags; unsigned int qlen; + reason = SKB_DROP_REASON_NOT_SPECIFIED; sd = &per_cpu(softnet_data, cpu); rps_lock_irqsave(sd, &flags); @@ -4596,13 +4598,14 @@ enqueue: napi_schedule_rps(sd); goto enqueue; } + reason = SKB_DROP_REASON_CPU_BACKLOG; drop: sd->dropped++; rps_unlock_irq_restore(sd, &flags); atomic_long_inc(&skb->dev->rx_dropped); - kfree_skb(skb); + kfree_skb_reason(skb, reason); return NET_RX_DROP; } -- cgit v1.2.3 From 7e726ed81e1ddd5fdc431e02b94fcfe2a9876d42 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:44 +0800 Subject: net: dev: use kfree_skb_reason() for do_xdp_generic() Replace kfree_skb() used in do_xdp_generic() with kfree_skb_reason(). The drop reason SKB_DROP_REASON_XDP is introduced for this case. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + net/core/dev.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 44ecbfff2b69..7a38d0f90cef 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -405,6 +405,7 @@ enum skb_drop_reason { * full (see netdev_max_backlog in * net.rst) or RPS flow limit */ + SKB_DROP_REASON_XDP, /* dropped by XDP in input path */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 3bb90ca893ae..8c4c343c830f 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -48,6 +48,7 @@ EM(SKB_DROP_REASON_TC_EGRESS, TC_EGRESS) \ EM(SKB_DROP_REASON_QDISC_DROP, QDISC_DROP) \ EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG) \ + EM(SKB_DROP_REASON_XDP, XDP) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/core/dev.c b/net/core/dev.c index 460ac941a631..f449834a63df 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4825,7 +4825,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); -- cgit v1.2.3 From a568aff26ac03ee9eb1482683514914a5ec3b4c3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:45 +0800 Subject: net: dev: use kfree_skb_reason() for sch_handle_ingress() Replace kfree_skb() used in sch_handle_ingress() with kfree_skb_reason(). The following drop reason is introduced: SKB_DROP_REASON_TC_INGRESS Signed-off-by: Menglong Dong Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + net/core/dev.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7a38d0f90cef..6b14b2d297d2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -406,6 +406,7 @@ enum skb_drop_reason { * net.rst) or RPS flow limit */ SKB_DROP_REASON_XDP, /* dropped by XDP in input path */ + SKB_DROP_REASON_TC_INGRESS, /* dropped in TC ingress HOOK */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 8c4c343c830f..514dd2de8776 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -49,6 +49,7 @@ EM(SKB_DROP_REASON_QDISC_DROP, QDISC_DROP) \ EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG) \ EM(SKB_DROP_REASON_XDP, XDP) \ + EM(SKB_DROP_REASON_TC_INGRESS, TC_INGRESS) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/core/dev.c b/net/core/dev.c index f449834a63df..ba7580ccee94 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5041,7 +5041,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, break; case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: -- cgit v1.2.3 From 6c2728b7c14164928cb7cb9c847dead101b2d503 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:46 +0800 Subject: net: dev: use kfree_skb_reason() for __netif_receive_skb_core() Add reason for skb drops to __netif_receive_skb_core() when no packet_type is found to handle the skb. For this purpose, the drop reason SKB_DROP_REASON_PTYPE_ABSENT is introduced. Taking ether packets for example, this case mainly happens when the L3 protocol is not supported. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ include/trace/events/skb.h | 1 + net/core/dev.c | 8 +++++--- 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6b14b2d297d2..2be263184d1e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -407,6 +407,11 @@ enum skb_drop_reason { */ SKB_DROP_REASON_XDP, /* dropped by XDP in input path */ SKB_DROP_REASON_TC_INGRESS, /* dropped in TC ingress HOOK */ + SKB_DROP_REASON_PTYPE_ABSENT, /* no packet_type found to handle + * the skb. For an ether packet, + * this means that L3 protocol is + * not supported + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 514dd2de8776..c0769d943f8e 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -50,6 +50,7 @@ EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG) \ EM(SKB_DROP_REASON_XDP, XDP) \ EM(SKB_DROP_REASON_TC_INGRESS, TC_INGRESS) \ + EM(SKB_DROP_REASON_PTYPE_ABSENT, PTYPE_ABSENT) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM diff --git a/net/core/dev.c b/net/core/dev.c index ba7580ccee94..ba69ddf85af6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5358,11 +5358,13 @@ check_vlan_id: *ppt_prev = pt_prev; } else { drop: - if (!deliver_exact) { atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb_reason(skb, SKB_DROP_REASON_PTYPE_ABSENT); + } else { atomic_long_inc(&skb->dev->rx_nohandler); + kfree_skb(skb); + } /* Jamal, now you will not able to escape explaining * me how you were going to use this.
:-) */ -- cgit v1.2.3 From 25b35dd28138f61f9a0fb8b76c0483761fd228bd Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 5 Mar 2022 04:16:38 +0530 Subject: bpf: Add check_func_arg_reg_off function Lift the list of register types allowed for having fixed and variable offsets when passed as helper function arguments into a common helper, so that they can be reused for kfunc checks in later commits. Keeping a common helper aids maintainability and allows us to follow the same consistent rules across helpers and kfuncs. Also, convert check_func_arg to use this function. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220304224645.3677453-2-memxor@gmail.com --- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/verifier.c | 69 ++++++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7a7be8c057f2..38b24ee8d8c2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -521,6 +521,9 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + enum bpf_arg_type arg_type); int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a57db4b2803c..e37eb6020253 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5359,6 +5359,44 @@ found: return 0; } +int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + enum bpf_arg_type arg_type) +{ + enum bpf_reg_type type = reg->type; + bool fixed_off_ok = false; + + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type != ARG_PTR_TO_ALLOC_MEM) + return 0; + break; + /* All the rest must be rejected, except PTR_TO_BTF_ID which allows + * fixed offset. + */ + case PTR_TO_BTF_ID: + fixed_off_ok = true; + break; + default: + break; + } + return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); +} + static int check_func_arg(struct bpf_verifier_env *env, u32 arg, struct bpf_call_arg_meta *meta, const struct bpf_func_proto *fn) @@ -5408,34 +5446,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - switch ((u32)type) { - case SCALAR_VALUE: - /* Pointer types where reg offset is explicitly allowed: */ - case PTR_TO_PACKET: - case PTR_TO_PACKET_META: - case PTR_TO_MAP_KEY: - case PTR_TO_MAP_VALUE: - case PTR_TO_MEM: - case PTR_TO_MEM | MEM_RDONLY: - case PTR_TO_MEM | MEM_ALLOC: - case PTR_TO_BUF: - case PTR_TO_BUF | MEM_RDONLY: - case PTR_TO_STACK: - /* Some of the argument types nevertheless require a - * zero register offset. 
- */ - if (arg_type == ARG_PTR_TO_ALLOC_MEM) - goto force_off_check; - break; - /* All the rest must be rejected: */ - default: -force_off_check: - err = __check_ptr_off_reg(env, reg, regno, - type == PTR_TO_BTF_ID); - if (err < 0) - return err; - break; - } + err = check_func_arg_reg_off(env, reg, regno, arg_type); + if (err) + return err; skip_type_check: if (reg->ref_obj_id) { -- cgit v1.2.3 From 24d5bb806c7e2c0b9972564fd493069f612d90dd Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 5 Mar 2022 04:16:41 +0530 Subject: bpf: Harden register offset checks for release helpers and kfuncs Let's ensure that the PTR_TO_BTF_ID reg being passed in to release BPF helpers and kfuncs always has its offset set to 0. While not a real problem now, there's a very real possibility this will become a problem when more and more kfuncs are exposed, and more BPF helpers are added which can release PTR_TO_BTF_ID. Previous commits already protected against non-zero var_off. One of the cases we are concerned about now is when we have a type that can be returned by e.g. an acquire kfunc: struct foo { int a; int b; struct bar br; }; ... and struct bar is also a type that can be returned by another acquire kfunc. Then, doing the following sequence: struct foo *f = bpf_get_foo(); // acquire kfunc if (!f) return 0; bpf_put_bar(&f->br); // release kfunc ... would work with the current code, since the btf_struct_ids_match takes reg->off into account for matching pointer type with release kfunc argument type, but would obviously be incorrect, and most likely lead to a kernel crash. A test has been included later to prevent regressions in this area. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220304224645.3677453-5-memxor@gmail.com --- include/linux/bpf_verifier.h | 3 ++- kernel/bpf/btf.c | 33 +++++++++++++++++++-------------- kernel/bpf/verifier.c | 25 ++++++++++++++++++++++--- 3 files changed, 43 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 38b24ee8d8c2..c1fc4af47f69 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -523,7 +523,8 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type); + enum bpf_arg_type arg_type, + bool is_release_func); int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7f6a0ae5028b..162807e3b4a5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5753,6 +5753,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + /* Only kfunc can be release func */ + if (is_kfunc) + rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RELEASE, func_id); /* check that BTF function arguments match actual types that the * verifier sees.
*/ @@ -5777,7 +5781,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); + ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel); if (ret < 0) return ret; @@ -5809,7 +5813,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - /* Ensure only one argument is referenced PTR_TO_BTF_ID */ + /* Ensure only one argument is referenced + * PTR_TO_BTF_ID, check_func_arg_reg_off relies + * on only one referenced register being allowed + * for kfuncs. + */ if (reg->ref_obj_id) { if (ref_obj_id) { bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -5891,18 +5899,15 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, /* Either both are set, or neither */ WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); - if (is_kfunc) { - rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), - BTF_KFUNC_TYPE_RELEASE, func_id); - /* We already made sure ref_obj_id is set only for one argument */ - if (rel && !ref_obj_id) { - bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", - func_name); - return -EINVAL; - } - /* Allow (!rel && ref_obj_id), so that passing such referenced PTR_TO_BTF_ID to - * other kfuncs works - */ + /* We already made sure ref_obj_id is set only for one argument. We do + * allow (!rel && ref_obj_id), so that passing such referenced + * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when + * is_kfunc is true. + */ + if (rel && !ref_obj_id) { + bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; } /* returns argument register number > 0 in case of reference release kfunc */ return rel ? ref_regno : 0; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 455b4ab69e47..fe9a513e2314 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5367,10 +5367,11 @@ found: int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type) + enum bpf_arg_type arg_type, + bool is_release_func) { + bool fixed_off_ok = false, release_reg; enum bpf_reg_type type = reg->type; - bool fixed_off_ok = false; switch ((u32)type) { case SCALAR_VALUE: @@ -5395,6 +5396,21 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, * fixed offset. */ case PTR_TO_BTF_ID: + /* When referenced PTR_TO_BTF_ID is passed to release function, + * its fixed offset must be 0. We rely on the property that + * only one referenced register can be passed to BPF helpers and + * kfuncs. In the other cases, fixed offset can be non-zero. + */ + release_reg = is_release_func && reg->ref_obj_id; + if (release_reg && reg->off) { + verbose(env, "R%d must have zero offset when passed to release func\n", + regno); + return -EINVAL; + } + /* For release_reg == true, fixed_off_ok must be false, but we + * already checked and rejected reg->off != 0 above, so set to + * true to allow fixed offset for all other cases.
+ */ fixed_off_ok = true; break; default: @@ -5452,11 +5468,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - err = check_func_arg_reg_off(env, reg, regno, arg_type); + err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id)); if (err) return err; skip_type_check: + /* check_func_arg_reg_off relies on only one referenced register being + * allowed for BPF helpers. + */ if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", -- cgit v1.2.3 From f014a00bbeb09cea16017b82448d32a468a6b96f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 5 Mar 2022 04:16:42 +0530 Subject: compiler-clang.h: Add __diag infrastructure for clang Add __diag macros similar to those in compiler-gcc.h, so that warnings that need to be adjusted for specific cases but not globally can be ignored when building with clang. Signed-off-by: Nathan Chancellor Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220304224645.3677453-6-memxor@gmail.com [ Kartikeya: wrote commit message ] --- include/linux/compiler-clang.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 3c4de9b6c6e3..f1aa41d520bd 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -68,3 +68,25 @@ #define __nocfi __attribute__((__no_sanitize__("cfi"))) #define __cficanonical __attribute__((__cfi_canonical_jump_table__)) + +/* + * Turn individual warnings and errors on and off locally, depending + * on version. + */ +#define __diag_clang(version, severity, s) \ + __diag_clang_ ## version(__diag_clang_ ## severity s) + +/* Severity used in pragma directives */ +#define __diag_clang_ignore ignored +#define __diag_clang_warn warning +#define __diag_clang_error error + +#define __diag_str1(s) #s +#define __diag_str(s) __diag_str1(s) +#define __diag(s) _Pragma(__diag_str(clang diagnostic s)) + +#if CONFIG_CLANG_VERSION >= 110000 +#define __diag_clang_11(s) __diag(s) +#else +#define __diag_clang_11(s) +#endif -- cgit v1.2.3 From 4d1ea705d797e66edd70ffa708b83888a210a437 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 5 Mar 2022 04:16:43 +0530 Subject: compiler_types.h: Add unified __diag_ignore_all for GCC/LLVM Add a __diag_ignore_all macro, to ignore warnings for both GCC and LLVM, without having to specify the compiler type and version. By default, GCC 8 and clang 11 are used. This will be used by bpf subsystem to ignore -Wmissing-prototypes warning for functions that are meant to be global functions so that they are in vmlinux BTF, but don't have a prototype. 
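A minimal sketch of the intended usage pattern, assuming a hypothetical global function bpf_demo_kfunc() that should land in vmlinux BTF without a header declaration:

	/* Sketch only: bpf_demo_kfunc is a made-up name for illustration. */
	#include <linux/compiler_types.h>

	__diag_push();
	__diag_ignore_all("-Wmissing-prototypes",
			  "Global functions as their definitions will be in BTF");

	/* Deliberately has no prototype; the pragma above silences the
	 * -Wmissing-prototypes warning on both GCC and clang.
	 */
	int bpf_demo_kfunc(int x)
	{
		return x + 1;
	}

	__diag_pop();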
Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220304224645.3677453-7-memxor@gmail.com --- include/linux/compiler-clang.h | 3 +++ include/linux/compiler-gcc.h | 3 +++ include/linux/compiler_types.h | 4 ++++ 3 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index f1aa41d520bd..babb1347148c 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -90,3 +90,6 @@ #else #define __diag_clang_11(s) #endif + +#define __diag_ignore_all(option, comment) \ + __diag_clang(11, ignore, option) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index ccbbd31b3aae..d364c98a4a80 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -151,6 +151,9 @@ #define __diag_GCC_8(s) #endif +#define __diag_ignore_all(option, comment) \ + __diag_GCC(8, ignore, option) + /* * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size" * attribute) do not work, and must be disabled. diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3f31ff400432..8e5d2f50f951 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -371,4 +371,8 @@ struct ftrace_likely_data { #define __diag_error(compiler, version, option, comment) \ __diag_ ## compiler(version, error, option) +#ifndef __diag_ignore_all +#define __diag_ignore_all(option, comment) +#endif + #endif /* __LINUX_COMPILER_TYPES_H */ -- cgit v1.2.3 From 9216c916237805c93d054ed022afb172ddbc3ed1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Fri, 4 Mar 2022 11:16:55 -0800 Subject: compiler_types: Define __percpu as __attribute__((btf_type_tag("percpu"))) This is similar to commit 7472d5a642c9 ("compiler_types: define __user as __attribute__((btf_type_tag("user")))"), where a type tag "user" was introduced to identify the pointers that point to user memory. With that change, the newest compiler toolchain can encode __user information into vmlinux BTF, which can be used by the BPF verifier to enforce safe program behaviors. Similarly, we have the __percpu attribute, which is mainly used to indicate that memory is allocated in the percpu region. The __percpu pointers in the kernel are supposed to be used together with functions like per_cpu_ptr() and this_cpu_ptr(), which perform the necessary calculation on the pointer's base address. Without the btf_type_tag introduced in this patch, __percpu pointers will be treated as regular memory pointers in vmlinux BTF, and BPF programs are allowed to directly dereference them, generating incorrect behaviors. Now with the "percpu" btf_type_tag, the BPF verifier is able to differentiate __percpu pointers from regular pointers and forbids unexpected behaviors like direct load. The following is an example similar to the one given in commit 7472d5a642c9: [$ ~] cat test.c #define __percpu __attribute__((btf_type_tag("percpu"))) int foo(int __percpu *arg) { return *arg; } [$ ~] clang -O2 -g -c test.c [$ ~] pahole -JV test.o ... File test.o: [1] INT int size=4 nr_bits=32 encoding=SIGNED [2] TYPE_TAG percpu type_id=1 [3] PTR (anon) type_id=2 [4] FUNC_PROTO (anon) return=1 args=(3 arg) [5] FUNC foo type_id=4 [$ ~] for the function argument "int __percpu *arg", its type is described as PTR -> TYPE_TAG(percpu) -> INT The kernel can use this information for bpf verification or other use cases. Like commit 7472d5a642c9, this feature requires clang (>= clang14) and pahole (>= 1.23).
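On the BPF side, the pattern this tagging enables looks roughly like the sketch below; bpf_prog_active is only an example of an int __percpu kernel symbol, and bpf_per_cpu_ptr() is the helper that converts the tagged pointer into a plain one:

	/* Illustrative only: with the "percpu" tag, direct dereference of a
	 * __percpu pointer is rejected by the verifier; it must first pass
	 * through bpf_per_cpu_ptr() or bpf_this_cpu_ptr().
	 */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	extern const int bpf_prog_active __ksym;	/* int __percpu in the kernel */

	SEC("raw_tp/sys_enter")
	int read_percpu(void *ctx)
	{
		const int *p;

		p = bpf_per_cpu_ptr(&bpf_prog_active, 0);	/* cpu 0 */
		if (!p)		/* NULL for an invalid cpu */
			return 0;

		bpf_printk("bpf_prog_active on cpu0: %d", *p);
		return 0;
	}

	char _license[] SEC("license") = "GPL";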
Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220304191657.981240-3-haoluo@google.com --- include/linux/compiler_types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 8e5d2f50f951..b9a8ae9440c7 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -38,7 +38,12 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __user # endif # define __iomem -# define __percpu +# if defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ + __has_attribute(btf_type_tag) +# define __percpu __attribute__((btf_type_tag("percpu"))) +# else +# define __percpu +# endif # define __rcu # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 -- cgit v1.2.3 From 5844101a1be9b8636024cb31c865ef13c7cc6db3 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Fri, 4 Mar 2022 11:16:56 -0800 Subject: bpf: Reject programs that try to load __percpu memory. With the introduction of the btf_type_tag "percpu", we can add a MEM_PERCPU to identify those pointers that point to percpu memory. The ability to differentiate percpu pointers from regular memory pointers has two benefits: 1. It forbids unexpected use of percpu pointers, such as direct loads. In the kernel, there are special functions used for accessing percpu memory. Directly loading percpu memory is meaningless. We already have BPF helpers like bpf_per_cpu_ptr() and bpf_this_cpu_ptr() that wrap the kernel percpu functions. So we can now convert percpu pointers into regular pointers in a safe way. 2. Previously, bpf_per_cpu_ptr() and bpf_this_cpu_ptr() only work on PTR_TO_PERCPU_BTF_ID, a special reg_type which describes static percpu variables in the kernel (we rely on pahole to encode them into vmlinux BTF). Now, since we can identify __percpu tagged pointers, we can also identify dynamically allocated percpu memory. It means we can use bpf_xxx_cpu_ptr() on dynamic percpu memory. This would be very convenient when accessing fields like "cgroup->rstat_cpu". Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220304191657.981240-4-haoluo@google.com --- include/linux/bpf.h | 11 +++++++++-- kernel/bpf/btf.c | 8 +++++++- kernel/bpf/verifier.c | 24 ++++++++++++++---------- 3 files changed, 30 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f19abc59b6cd..88449fbbe063 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -334,7 +334,15 @@ enum bpf_type_flag { /* MEM is in user address space. */ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_USER, + /* MEM is a percpu memory. MEM_PERCPU tags PTR_TO_BTF_ID. When tagged + * with MEM_PERCPU, PTR_TO_BTF_ID _cannot_ be directly accessed. In + * order to drop this tag, it must be passed into bpf_per_cpu_ptr() + * or bpf_this_cpu_ptr(), which will return the pointer corresponding + * to the specified cpu. + */ + MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_PERCPU, }; /* Max number of base types.
*/ @@ -516,7 +524,6 @@ enum bpf_reg_type { */ PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ - PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ __BPF_REG_TYPE_MAX, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 162807e3b4a5..8b34563a832e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5057,6 +5057,8 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tag_value = __btf_name_by_offset(btf, t->name_off); if (strcmp(tag_value, "user") == 0) info->reg_type |= MEM_USER; + if (strcmp(tag_value, "percpu") == 0) + info->reg_type |= MEM_PERCPU; } /* skip modifiers */ @@ -5285,12 +5287,16 @@ error: return -EACCES; } - /* check __user tag */ + /* check type tag */ t = btf_type_by_id(btf, mtype->type); if (btf_type_is_type_tag(t)) { tag_value = __btf_name_by_offset(btf, t->name_off); + /* check __user tag */ if (strcmp(tag_value, "user") == 0) tmp_flag = MEM_USER; + /* check __percpu tag */ + if (strcmp(tag_value, "percpu") == 0) + tmp_flag = MEM_PERCPU; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7a6b58fea37d..ec3a7b6c9515 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -554,7 +554,6 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", @@ -562,8 +561,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, }; if (type & PTR_MAYBE_NULL) { - if (base_type(type) == PTR_TO_BTF_ID || - base_type(type) == PTR_TO_PERCPU_BTF_ID) + if (base_type(type) == PTR_TO_BTF_ID) strncpy(postfix, "or_null_", 16); else strncpy(postfix, "_or_null", 16); @@ -575,6 +573,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(prefix, "alloc_", 32); if (type & MEM_USER) strncpy(prefix, "user_", 32); + if (type & MEM_PERCPU) + strncpy(prefix, "percpu_", 32); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -697,8 +697,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, const char *sep = ""; verbose(env, "%s", reg_type_str(env, t)); - if (base_type(t) == PTR_TO_BTF_ID || - base_type(t) == PTR_TO_PERCPU_BTF_ID) + if (base_type(t) == PTR_TO_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); verbose(env, "("); /* @@ -2783,7 +2782,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BUF: - case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: @@ -4203,6 +4201,13 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + if (reg->type & MEM_PERCPU) { + verbose(env, + "R%d is ptr_%s access percpu memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (env->ops->btf_struct_access) { ret = env->ops->btf_struct_access(&env->log, reg->btf, t, off, size, atype, &btf_id, &flag); @@ -4809,7 +4814,7 @@ static int check_stack_range_initialized( } if (is_spilled_reg(&state->stack[spi]) && - state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) goto mark; if (is_spilled_reg(&state->stack[spi]) && @@ -5265,7 +5270,7 @@ static const struct bpf_reg_types alloc_mem_types = { 
.types = { PTR_TO_MEM | ME static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -9677,7 +9682,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: - case PTR_TO_PERCPU_BTF_ID: dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; @@ -11877,7 +11881,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, type = t->type; t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { - aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU; aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { -- cgit v1.2.3 From 736f16de75f9bb32d76f652cb66f04d1bc685057 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 4 Mar 2022 06:55:05 -0800 Subject: net: tap: track dropped skb via kfree_skb_reason() The TAP can be used as a vhost-net backend. E.g., the tap_handle_frame() is the interface to forward the skb from TAP to vhost-net/virtio-net. However, there are many "goto drop" in the TAP driver. Therefore, kfree_skb_reason() is invoked at each "goto drop" to help userspace ftrace/ebpf track the reason for the loss of packets. The below reasons are introduced: - SKB_DROP_REASON_SKB_CSUM - SKB_DROP_REASON_SKB_GSO_SEG - SKB_DROP_REASON_SKB_UCOPY_FAULT - SKB_DROP_REASON_DEV_HDR - SKB_DROP_REASON_FULL_RING Cc: Joao Martins Cc: Joe Jin Signed-off-by: Dongli Zhang Signed-off-by: David S.
Miller --- drivers/net/tap.c | 35 +++++++++++++++++++++++++---------- include/linux/skbuff.h | 13 +++++++++++++ include/trace/events/skb.h | 5 +++++ 3 files changed, 43 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index ba2ef5437e16..c3d42062559d 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -322,6 +322,7 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) struct tap_dev *tap; struct tap_queue *q; netdev_features_t features = TAP_FEATURES; + enum skb_drop_reason drop_reason; tap = tap_dev_get_rcu(dev); if (!tap) @@ -343,12 +344,16 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) struct sk_buff *segs = __skb_gso_segment(skb, features, false); struct sk_buff *next; - if (IS_ERR(segs)) + if (IS_ERR(segs)) { + drop_reason = SKB_DROP_REASON_SKB_GSO_SEG; goto drop; + } if (!segs) { - if (ptr_ring_produce(&q->ring, skb)) + if (ptr_ring_produce(&q->ring, skb)) { + drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; + } goto wake_up; } @@ -356,8 +361,9 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) skb_list_walk_safe(segs, skb, next) { skb_mark_not_on_list(skb); if (ptr_ring_produce(&q->ring, skb)) { - kfree_skb(skb); - kfree_skb_list(next); + drop_reason = SKB_DROP_REASON_FULL_RING; + kfree_skb_reason(skb, drop_reason); + kfree_skb_list_reason(next, drop_reason); break; } } @@ -369,10 +375,14 @@ rx_handler_result_t tap_handle_frame(struct sk_buff **pskb) */ if (skb->ip_summed == CHECKSUM_PARTIAL && !(features & NETIF_F_CSUM_MASK) && - skb_checksum_help(skb)) + skb_checksum_help(skb)) { + drop_reason = SKB_DROP_REASON_SKB_CSUM; goto drop; - if (ptr_ring_produce(&q->ring, skb)) + } + if (ptr_ring_produce(&q->ring, skb)) { + drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; + } } wake_up: @@ -383,7 +393,7 @@ drop: /* Count errors/drops only here, thus don't care about args. 
*/ if (tap->count_rx_dropped) tap->count_rx_dropped(tap); - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); return RX_HANDLER_CONSUMED; } EXPORT_SYMBOL_GPL(tap_handle_frame); @@ -632,6 +642,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, int depth; bool zerocopy = false; size_t linear; + enum skb_drop_reason drop_reason; if (q->flags & IFF_VNET_HDR) { vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz); @@ -696,8 +707,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, else err = skb_copy_datagram_from_iter(skb, 0, from, len); - if (err) + if (err) { + drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto err_kfree; + } skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); @@ -706,8 +719,10 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, if (vnet_hdr_len) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, tap_is_little_endian(q)); - if (err) + if (err) { + drop_reason = SKB_DROP_REASON_DEV_HDR; goto err_kfree; + } } skb_probe_transport_header(skb); @@ -738,7 +753,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, return total_len; err_kfree: - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); err: rcu_read_lock(); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2be263184d1e..67cfff4065b6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -412,6 +412,19 @@ enum skb_drop_reason { * this means that L3 protocol is * not supported */ + SKB_DROP_REASON_SKB_CSUM, /* sk_buff checksum computation + * error + */ + SKB_DROP_REASON_SKB_GSO_SEG, /* gso segmentation error */ + SKB_DROP_REASON_SKB_UCOPY_FAULT, /* failed to copy data from + * user space, e.g., via + * zerocopy_sg_from_iter() + * or skb_orphan_frags_rx() + */ + SKB_DROP_REASON_DEV_HDR, /* device driver specific + * header/metadata is invalid + */ + SKB_DROP_REASON_FULL_RING, /* ring buffer is full */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index c0769d943f8e..240e7e7591fc 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -51,6 +51,11 @@ EM(SKB_DROP_REASON_XDP, XDP) \ EM(SKB_DROP_REASON_TC_INGRESS, TC_INGRESS) \ EM(SKB_DROP_REASON_PTYPE_ABSENT, PTYPE_ABSENT) \ + EM(SKB_DROP_REASON_SKB_CSUM, SKB_CSUM) \ + EM(SKB_DROP_REASON_SKB_GSO_SEG, SKB_GSO_SEG) \ + EM(SKB_DROP_REASON_SKB_UCOPY_FAULT, SKB_UCOPY_FAULT) \ + EM(SKB_DROP_REASON_DEV_HDR, DEV_HDR) \ + EM(SKB_DROP_REASON_FULL_RING, FULL_RING) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 4b4f052e2d89c2eb7e13ee28ba9e85f8097aef3d Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 4 Mar 2022 06:55:07 -0800 Subject: net: tun: track dropped skb via kfree_skb_reason() The TUN can be used as a vhost-net backend. E.g., the tun_net_xmit() is the interface to forward the skb from TUN to vhost-net/virtio-net. However, there are many "goto drop" in the TUN driver. Therefore, kfree_skb_reason() is invoked at each "goto drop" to help userspace ftrace/ebpf track the reason for the loss of packets. The below reasons are introduced: - SKB_DROP_REASON_DEV_READY - SKB_DROP_REASON_NOMEM - SKB_DROP_REASON_HDR_TRUNC - SKB_DROP_REASON_TAP_FILTER - SKB_DROP_REASON_TAP_TXFILTER Cc: Joao Martins Cc: Joe Jin Signed-off-by: Dongli Zhang Signed-off-by: David S.
Miller --- drivers/net/tun.c | 37 ++++++++++++++++++++++++++++--------- include/linux/skbuff.h | 18 ++++++++++++++++++ include/trace/events/skb.h | 5 +++++ 3 files changed, 51 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 6e06c846fe82..bab92e489fba 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1058,6 +1058,7 @@ static unsigned int run_ebpf_filter(struct tun_struct *tun, static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); + enum skb_drop_reason drop_reason; int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; @@ -1067,8 +1068,10 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) tfile = rcu_dereference(tun->tfiles[txq]); /* Drop packet if interface is not attached */ - if (!tfile) + if (!tfile) { + drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; + } if (!rcu_dereference(tun->steering_prog)) tun_automq_xmit(tun, skb); @@ -1078,22 +1081,32 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) /* Drop if the filter does not like it. * This is a noop if the filter is disabled. * Filter can be enabled only for the TAP devices. */ - if (!check_filter(&tun->txflt, skb)) + if (!check_filter(&tun->txflt, skb)) { + drop_reason = SKB_DROP_REASON_TAP_TXFILTER; goto drop; + } if (tfile->socket.sk->sk_filter && - sk_filter(tfile->socket.sk, skb)) + sk_filter(tfile->socket.sk, skb)) { + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto drop; + } len = run_ebpf_filter(tun, skb, len); - if (len == 0) + if (len == 0) { + drop_reason = SKB_DROP_REASON_TAP_FILTER; goto drop; + } - if (pskb_trim(skb, len)) + if (pskb_trim(skb, len)) { + drop_reason = SKB_DROP_REASON_NOMEM; goto drop; + } - if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { + drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; goto drop; + } skb_tx_timestamp(skb); @@ -1104,8 +1117,10 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) nf_reset_ct(skb); - if (ptr_ring_produce(&tfile->tx_ring, skb)) + if (ptr_ring_produce(&tfile->tx_ring, skb)) { + drop_reason = SKB_DROP_REASON_FULL_RING; goto drop; + } /* NETIF_F_LLTX requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); @@ -1122,7 +1137,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) drop: atomic_long_inc(&dev->tx_dropped); skb_tx_error(skb); - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); return NET_XMIT_DROP; } @@ -1720,6 +1735,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, u32 rxhash = 0; int skb_xdp = 1; bool frags = tun_napi_frags_enabled(tfile); + enum skb_drop_reason drop_reason; if (!(tun->flags & IFF_NO_PI)) { if (len < sizeof(pi)) @@ -1823,9 +1839,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (err) { err = -EFAULT; + drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; drop: atomic_long_inc(&tun->dev->rx_dropped); - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); if (frags) { tfile->napi.skb = NULL; mutex_unlock(&tfile->napi_mutex); @@ -1872,6 +1889,7 @@ drop: case IFF_TAP: if (frags && !pskb_may_pull(skb, ETH_HLEN)) { err = -ENOMEM; + drop_reason = SKB_DROP_REASON_HDR_TRUNC; goto drop; } skb->protocol = eth_type_trans(skb, tun->dev); @@ -1925,6 +1943,7 @@ drop: if (unlikely(!(tun->dev->flags & IFF_UP))) { 
err = -EIO; rcu_read_unlock(); + drop_reason = SKB_DROP_REASON_DEV_READY; goto drop; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 67cfff4065b6..34f572271c0c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -424,7 +424,25 @@ enum skb_drop_reason { SKB_DROP_REASON_DEV_HDR, /* device driver specific * header/metadata is invalid */ + /* the device is not ready to xmit/recv due to any of its data + * structure that is not up/ready/initialized, e.g., the IFF_UP is + * not set, or driver specific tun->tfiles[txq] is not initialized + */ + SKB_DROP_REASON_DEV_READY, SKB_DROP_REASON_FULL_RING, /* ring buffer is full */ + SKB_DROP_REASON_NOMEM, /* error due to OOM */ + SKB_DROP_REASON_HDR_TRUNC, /* failed to trunc/extract the header + * from networking data, e.g., failed + * to pull the protocol header from + * frags via pskb_may_pull() + */ + SKB_DROP_REASON_TAP_FILTER, /* dropped by (ebpf) filter directly + * attached to tun/tap, e.g., via + * TUNSETFILTEREBPF + */ + SKB_DROP_REASON_TAP_TXFILTER, /* dropped by tx filter implemented + * at tun/tap, e.g., check_filter() + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 240e7e7591fc..e1670e1e4934 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -55,7 +55,12 @@ EM(SKB_DROP_REASON_SKB_GSO_SEG, SKB_GSO_SEG) \ EM(SKB_DROP_REASON_SKB_UCOPY_FAULT, SKB_UCOPY_FAULT) \ EM(SKB_DROP_REASON_DEV_HDR, DEV_HDR) \ + EM(SKB_DROP_REASON_DEV_READY, DEV_READY) \ EM(SKB_DROP_REASON_FULL_RING, FULL_RING) \ + EM(SKB_DROP_REASON_NOMEM, NOMEM) \ + EM(SKB_DROP_REASON_HDR_TRUNC, HDR_TRUNC) \ + EM(SKB_DROP_REASON_TAP_FILTER, TAP_FILTER) \ + EM(SKB_DROP_REASON_TAP_TXFILTER, TAP_TXFILTER) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From f72de02ebece2e962462bc0c1e9efd29eaa029b2 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Sat, 5 Mar 2022 12:21:25 +0100 Subject: ptp: Add generic PTP is_sync() function PHY drivers such as micrel or dp83640 need to analyze whether a given skb is a PTP sync message for one step functionality. In order to avoid code duplication introduce a generic function and move it to ptp classify. Signed-off-by: Kurt Kanzenbach Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 15 +++++++++++++++ net/core/ptp_classifier.c | 12 ++++++++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index 9afd34a2d36c..fefa7790dc46 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -126,6 +126,17 @@ static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, return msgtype; } +/** + * ptp_msg_is_sync - Evaluates whether the given skb is a PTP Sync message + * @skb: packet buffer + * @type: type of the packet (see ptp_classify_raw()) + * + * This function evaluates whether the given skb is a PTP Sync message. 
+ * + * Return: true if sync message, false otherwise + */ +bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type); + void __init ptp_classifier_init(void); #else static inline void ptp_classifier_init(void) @@ -148,5 +159,9 @@ static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, */ return PTP_MSGTYPE_SYNC; } +static inline bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type) +{ + return false; +} #endif #endif /* _PTP_CLASSIFY_H_ */ diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index dd4cf01d1e0a..598041b0499e 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -137,6 +137,18 @@ struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type) } EXPORT_SYMBOL_GPL(ptp_parse_header); +bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type) +{ + struct ptp_header *hdr; + + hdr = ptp_parse_header(skb, type); + if (!hdr) + return false; + + return ptp_get_msgtype(hdr, type) == PTP_MSGTYPE_SYNC; +} +EXPORT_SYMBOL_GPL(ptp_msg_is_sync); + void __init ptp_classifier_init(void) { static struct sock_filter ptp_filter[] __initdata = { -- cgit v1.2.3 From 2655926aea9beea62c9ba80c032485456fd848f0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sun, 6 Mar 2022 22:57:52 +0100 Subject: net: Remove netif_rx_any_context() and netif_rx_ni(). Remove netif_rx_any_context() and netif_rx_ni() because there are no more in-tree users. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 19a27ac361ef..29a850a8d460 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3718,16 +3718,6 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); -static inline int netif_rx_ni(struct sk_buff *skb) -{ - return netif_rx(skb); -} - -static inline int netif_rx_any_context(struct sk_buff *skb) -{ - return netif_rx(skb); -} - int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list_internal(struct list_head *head); -- cgit v1.2.3 From 64807c2321512f67959e51f09e302c145c336184 Mon Sep 17 00:00:00 2001 From: Arun Ramadoss Date: Mon, 7 Mar 2022 21:45:14 +0530 Subject: net: phy: exported the genphy_read_master_slave function The genphy_read_master_slave function reads the master/slave configuration, but only for gigabit PHYs. In order to use this function irrespective of speed, move the speed check into the genphy_read_status call.
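As a hedged sketch (not from this patch), a driver for a PHY that exposes master/slave selection through the standard CTRL1000/STAT1000 registers even below 1G could now call the exported helper from its own read_status callback; my_t1_read_status is a hypothetical name:

static int my_t1_read_status(struct phy_device *phydev)
{
        int err;

        /* genphy_read_status() now resets master_slave_get/state to
         * UNSUPPORTED and only reads them back for gigabit-capable PHYs */
        err = genphy_read_status(phydev);
        if (err < 0)
                return err;

        /* this PHY negotiates master/slave below 1G, so read it explicitly */
        return genphy_read_master_slave(phydev);
}

Calling the helper after genphy_read_status() matters here, since the latter overwrites the master/slave fields before its gigabit-only path.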
Signed-off-by: Arun Ramadoss Reviewed-by: Andrew Lunn Signed-off-by: Paolo Abeni --- drivers/net/phy/phy_device.c | 19 +++++++++---------- include/linux/phy.h | 1 + 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index ce0bb5951b81..8406ac739def 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2051,17 +2051,11 @@ static int genphy_setup_master_slave(struct phy_device *phydev) CTL1000_PREFER_MASTER), ctl); } -static int genphy_read_master_slave(struct phy_device *phydev) +int genphy_read_master_slave(struct phy_device *phydev) { int cfg, state; int val; - if (!phydev->is_gigabit_capable) { - phydev->master_slave_get = MASTER_SLAVE_CFG_UNSUPPORTED; - phydev->master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; - return 0; - } - phydev->master_slave_get = MASTER_SLAVE_CFG_UNKNOWN; phydev->master_slave_state = MASTER_SLAVE_STATE_UNKNOWN; @@ -2102,6 +2096,7 @@ static int genphy_read_master_slave(struct phy_device *phydev) return 0; } +EXPORT_SYMBOL(genphy_read_master_slave); /** * genphy_restart_aneg - Enable and Restart Autonegotiation @@ -2396,14 +2391,18 @@ int genphy_read_status(struct phy_device *phydev) if (phydev->autoneg == AUTONEG_ENABLE && old_link && phydev->link) return 0; + phydev->master_slave_get = MASTER_SLAVE_CFG_UNSUPPORTED; + phydev->master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; phydev->speed = SPEED_UNKNOWN; phydev->duplex = DUPLEX_UNKNOWN; phydev->pause = 0; phydev->asym_pause = 0; - err = genphy_read_master_slave(phydev); - if (err < 0) - return err; + if (phydev->is_gigabit_capable) { + err = genphy_read_master_slave(phydev); + if (err < 0) + return err; + } err = genphy_read_lpa(phydev); if (err < 0) diff --git a/include/linux/phy.h b/include/linux/phy.h index cd08cf1a8b0d..20beeaa7443b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1578,6 +1578,7 @@ int genphy_update_link(struct phy_device *phydev); int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status_fixed(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); +int genphy_read_master_slave(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); int genphy_loopback(struct phy_device *phydev, bool enable); -- cgit v1.2.3 From 1330b6ef3313fcec577d2b020c290dc8b9f11f1a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 7 Mar 2022 16:44:21 -0800 Subject: skb: make drop reason booleanable We have a number of cases where a function returns a drop/no-drop decision as a boolean. Now that we want to report the reason code as well, we have to pass extra output arguments. We can instead make the reason code evaluate correctly as a bool. I believe we're good to reorder the reasons as they are reported to user space as strings. Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/tcp.h | 21 +++++++++++---------- net/ipv4/tcp.c | 21 +++++++++------------ net/ipv4/tcp_ipv4.c | 12 +++++++----- net/ipv6/tcp_ipv6.c | 11 +++++++---- 5 files changed, 35 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 34f572271c0c..26538ceb4b01 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -314,6 +314,7 @@ struct sk_buff; * used to translate the reason to string.
*/ enum skb_drop_reason { + SKB_NOT_DROPPED_YET = 0, SKB_DROP_REASON_NOT_SPECIFIED, /* drop reason is not specified */ SKB_DROP_REASON_NO_SOCKET, /* socket not found */ SKB_DROP_REASON_PKT_TOO_SMALL, /* packet size is too small */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d486d7b6112d..ee8237b58e1d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1674,10 +1674,11 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } -bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - enum skb_drop_reason *reason, - const void *saddr, const void *daddr, - int family, int dif, int sdif); + +enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif); #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) @@ -1688,13 +1689,13 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, { return NULL; } -static inline bool tcp_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb, - enum skb_drop_reason *reason, - const void *saddr, const void *daddr, - int family, int dif, int sdif) + +static inline enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif) { - return false; + return SKB_NOT_DROPPED_YET; } #define tcp_twsk_md5_key(twsk) NULL #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 33f20134e3f1..b5f032958b2c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4434,10 +4434,10 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke EXPORT_SYMBOL(tcp_md5_hash_key); /* Called with rcu_read_lock() */ -bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - enum skb_drop_reason *reason, - const void *saddr, const void *daddr, - int family, int dif, int sdif) +enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif) { /* * This gets called for each TCP segment that arrives @@ -4464,18 +4464,16 @@ bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, /* We've parsed the options - do we have a hash?
*/ if (!hash_expected && !hash_location) - return false; + return SKB_NOT_DROPPED_YET; if (hash_expected && !hash_location) { - *reason = SKB_DROP_REASON_TCP_MD5NOTFOUND; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); - return true; + return SKB_DROP_REASON_TCP_MD5NOTFOUND; } if (!hash_expected && hash_location) { - *reason = SKB_DROP_REASON_TCP_MD5UNEXPECTED; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); - return true; + return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } /* check the signature */ @@ -4483,7 +4481,6 @@ bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { - *reason = SKB_DROP_REASON_TCP_MD5FAILURE; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); if (family == AF_INET) { net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", @@ -4497,9 +4494,9 @@ bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, saddr, ntohs(th->source), daddr, ntohs(th->dest), l3index); } - return true; + return SKB_DROP_REASON_TCP_MD5FAILURE; } - return false; + return SKB_NOT_DROPPED_YET; } EXPORT_SYMBOL(tcp_inbound_md5_hash); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 411357ad9757..81694a354110 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1965,9 +1965,10 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (unlikely(tcp_inbound_md5_hash(sk, skb, &drop_reason, - &iph->saddr, &iph->daddr, - AF_INET, dif, sdif))) { + drop_reason = tcp_inbound_md5_hash(sk, skb, + &iph->saddr, &iph->daddr, + AF_INET, dif, sdif); + if (unlikely(drop_reason)) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -2041,8 +2042,9 @@ process: goto discard_and_relse; } - if (tcp_inbound_md5_hash(sk, skb, &drop_reason, &iph->saddr, - &iph->daddr, AF_INET, dif, sdif)) + drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr, + &iph->daddr, AF_INET, dif, sdif); + if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cb2bb7d2e907..13678d3908fa 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1632,8 +1632,10 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (tcp_inbound_md5_hash(sk, skb, &drop_reason, &hdr->saddr, - &hdr->daddr, AF_INET6, dif, sdif)) { + drop_reason = tcp_inbound_md5_hash(sk, skb, + &hdr->saddr, &hdr->daddr, + AF_INET6, dif, sdif); + if (drop_reason) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1704,8 +1706,9 @@ process: goto discard_and_relse; } - if (tcp_inbound_md5_hash(sk, skb, &drop_reason, &hdr->saddr, - &hdr->daddr, AF_INET6, dif, sdif)) + drop_reason = tcp_inbound_md5_hash(sk, skb, &hdr->saddr, &hdr->daddr, + AF_INET6, dif, sdif); + if (drop_reason) goto discard_and_relse; if (tcp_filter(sk, skb)) { -- cgit v1.2.3 From 34f46ae0d4b38e83cfb26fb6f06b5b5efea47fdc Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 27 Jan 2022 15:22:21 +0200 Subject: net/mlx5: Add command failures data to debugfs Add new counters to the command interface debugfs to count command failures. The following counters are added: total_failed - number of times the command failed (any kind of failure). failed_mbox_status - number of times the command failed on a bad status returned by FW. In addition, add data about the last command failure to the command interface debugfs: last_failed_errno - errno returned by the last failed command. last_failed_mbox_status - last bad status returned by FW.
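As a hedged usage sketch (not part of the patch), the new counters can be read from userspace like any other debugfs file; the path below assumes the mlx5 cmdif debugfs layout (<debugfs>/mlx5/<device>/commands/<opcode>/), and the PCI address and opcode directory are made up:

#include <stdio.h>

int main(void)
{
        unsigned long long failed;
        FILE *f = fopen("/sys/kernel/debug/mlx5/0000:08:00.0/commands/CREATE_CQ/failed", "r");

        if (!f)
                return 1;
        if (fscanf(f, "%llu", &failed) == 1)
                printf("CREATE_CQ failures: %llu\n", failed);
        fclose(f);
        return 0;
}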
Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 44 ++++++++++++++++++----- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 7 ++++ include/linux/mlx5/driver.h | 9 +++++ 3 files changed, 51 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 823d5808d5a0..8933c00067e8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1877,16 +1877,38 @@ out_in: return err; } +static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, int err) +{ + struct mlx5_cmd_stats *stats; + + if (!err) + return; + + stats = &dev->cmd.stats[opcode]; + spin_lock_irq(&stats->lock); + stats->failed++; + if (err < 0) + stats->last_failed_errno = -err; + if (err == -EREMOTEIO) { + stats->failed_mbox_status++; + stats->last_failed_mbox_status = status; + } + spin_unlock_irq(&stats->lock); +} + /* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */ -static int cmd_status_err(int err, void *out) +static int cmd_status_err(struct mlx5_core_dev *dev, int err, u16 opcode, void *out) { - if (err) /* -EREMOTEIO is preserved */ - return err == -EREMOTEIO ? -EIO : err; + u8 status = MLX5_GET(mbox_out, out, status); - if (MLX5_GET(mbox_out, out, status) != MLX5_CMD_STAT_OK) - return -EREMOTEIO; + if (err == -EREMOTEIO) /* -EREMOTEIO is preserved */ + err = -EIO; - return 0; + if (!err && status != MLX5_CMD_STAT_OK) + err = -EREMOTEIO; + + cmd_status_log(dev, opcode, status, err); + return err; } /** @@ -1910,8 +1932,10 @@ static int cmd_status_err(int err, void *out) int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size) { int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, false); + u16 opcode = MLX5_GET(mbox_in, in, opcode); - return cmd_status_err(err, out); + err = cmd_status_err(dev, err, opcode, out); + return err; } EXPORT_SYMBOL(mlx5_cmd_do); @@ -1954,8 +1978,9 @@ int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size) { int err = cmd_exec(dev, in, in_size, out, out_size, NULL, NULL, true); + u16 opcode = MLX5_GET(mbox_in, in, opcode); - err = cmd_status_err(err, out); + err = cmd_status_err(dev, err, opcode, out); return mlx5_cmd_check(dev, err, in, out); } EXPORT_SYMBOL(mlx5_cmd_exec_polling); @@ -1991,7 +2016,7 @@ static void mlx5_cmd_exec_cb_handler(int status, void *_work) struct mlx5_async_ctx *ctx; ctx = work->ctx; - status = cmd_status_err(status, work->out); + status = cmd_status_err(ctx->dev, status, work->opcode, work->out); work->user_callback(status, work); if (atomic_dec_and_test(&ctx->num_inflight)) wake_up(&ctx->wait); @@ -2005,6 +2030,7 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, work->ctx = ctx; work->user_callback = callback; + work->opcode = MLX5_GET(mbox_in, in, opcode); work->out = out; if (WARN_ON(!atomic_inc_not_zero(&ctx->num_inflight))) return -EIO; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 10d195042ab5..18b04e977bb8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -180,6 +180,13 @@ void mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev) debugfs_create_file("average", 0400, stats->root, stats, &stats_fops); 
debugfs_create_u64("n", 0400, stats->root, &stats->n); + debugfs_create_u64("failed", 0400, stats->root, &stats->failed); + debugfs_create_u64("failed_mbox_status", 0400, stats->root, + &stats->failed_mbox_status); + debugfs_create_u32("last_failed_errno", 0400, stats->root, + &stats->last_failed_errno); + debugfs_create_u8("last_failed_mbox_status", 0400, stats->root, + &stats->last_failed_mbox_status); } } } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d3b1a6a1f8d2..f18c1e15a12c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -264,6 +264,14 @@ enum { struct mlx5_cmd_stats { u64 sum; u64 n; + /* number of times command failed */ + u64 failed; + /* number of times command failed on bad status returned by FW */ + u64 failed_mbox_status; + /* last command failed returned errno */ + u32 last_failed_errno; + /* last bad status returned by FW */ + u8 last_failed_mbox_status; struct dentry *root; /* protect command average calculations */ spinlock_t lock; @@ -955,6 +963,7 @@ typedef void (*mlx5_async_cbk_t)(int status, struct mlx5_async_work *context); struct mlx5_async_work { struct mlx5_async_ctx *ctx; mlx5_async_cbk_t user_callback; + u16 opcode; /* cmd opcode */ void *out; /* pointer to the cmd output buffer */ }; -- cgit v1.2.3 From d2cb8dda214fb75f946ba554a1ecd25da7004b2b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 26 Jan 2022 07:23:53 +0200 Subject: net/mlx5: Change release_all_pages cap bit location mlx5 FW has changed release_all_pages cap bit by one bit offset to reflect a fix in the FW flow for release_all_pages. The driver should use the new bit to ensure it calls release_all_pages only if the FW fix is there. Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ea65131835ab..69985e4d8dfe 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1419,8 +1419,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_130[0xa]; u8 log_max_ra_res_dc[0x6]; - u8 reserved_at_140[0x6]; + u8 reserved_at_140[0x5]; u8 release_all_pages[0x1]; + u8 must_not_use[0x1]; u8 reserved_at_147[0x2]; u8 roce_accl[0x1]; u8 log_max_ra_req_qp[0x6]; -- cgit v1.2.3 From 66771a1c729e816dc84e29ba555013b6e722fb22 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 18 Feb 2022 09:36:20 +0200 Subject: net/mlx5: Move debugfs entries to separate struct Move the debugfs entry pointers under priv to their own struct. Add get function for device debugfs root. 
Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/cong.c | 3 +-- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mlx5/mr.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 30 +++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 8 +++--- .../ethernet/mellanox/mlx5/core/steering/dr_dbg.c | 2 +- include/linux/mlx5/driver.h | 17 +++++++----- 8 files changed, 37 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c index 0b61df52332a..290ea8ac3838 100644 --- a/drivers/infiniband/hw/mlx5/cong.c +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -433,8 +433,7 @@ void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u32 port_num) dev->port[port_num].dbg_cc_params = dbg_cc_params; - dbg_cc_params->root = debugfs_create_dir("cc_params", - mdev->priv.dbg_root); + dbg_cc_params->root = debugfs_create_dir("cc_params", mlx5_debugfs_get_dev_root(mdev)); for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { dbg_cc_params->params[i].offset = i; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 85f526c861e9..32a0ea820573 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4178,7 +4178,7 @@ static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev) if (!mlx5_debugfs_root) return 0; - root = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root); + root = debugfs_create_dir("delay_drop", mlx5_debugfs_get_dev_root(dev->mdev)); dev->delay_drop.dir_debugfs = root; debugfs_create_atomic_t("num_timeout_events", 0400, root, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 06e4b8cea6bd..32cb7068f0ca 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -696,7 +696,7 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) if (!mlx5_debugfs_root || dev->is_rep) return; - cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root); + cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { ent = &cache->ent[i]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 8933c00067e8..3b61224e9a3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1543,7 +1543,7 @@ static void create_debugfs_files(struct mlx5_core_dev *dev) { struct mlx5_cmd_debug *dbg = &dev->cmd.dbg; - dbg->dbg_root = debugfs_create_dir("cmd", dev->priv.dbg_root); + dbg->dbg_root = debugfs_create_dir("cmd", mlx5_debugfs_get_dev_root(dev)); debugfs_create_file("in", 0400, dbg->dbg_root, dev, &dfops); debugfs_create_file("out", 0200, dbg->dbg_root, dev, &dfops); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 18b04e977bb8..883e44457367 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -99,26 +99,32 @@ void mlx5_unregister_debugfs(void) debugfs_remove(mlx5_debugfs_root); } +struct dentry *mlx5_debugfs_get_dev_root(struct mlx5_core_dev *dev) +{ + return dev->priv.dbg.dbg_root; +} +EXPORT_SYMBOL(mlx5_debugfs_get_dev_root); + void mlx5_qp_debugfs_init(struct mlx5_core_dev *dev) { - dev->priv.qp_debugfs = 
debugfs_create_dir("QPs", dev->priv.dbg_root); + dev->priv.dbg.qp_debugfs = debugfs_create_dir("QPs", dev->priv.dbg.dbg_root); } EXPORT_SYMBOL(mlx5_qp_debugfs_init); void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev) { - debugfs_remove_recursive(dev->priv.qp_debugfs); + debugfs_remove_recursive(dev->priv.dbg.qp_debugfs); } EXPORT_SYMBOL(mlx5_qp_debugfs_cleanup); void mlx5_eq_debugfs_init(struct mlx5_core_dev *dev) { - dev->priv.eq_debugfs = debugfs_create_dir("EQs", dev->priv.dbg_root); + dev->priv.dbg.eq_debugfs = debugfs_create_dir("EQs", dev->priv.dbg.dbg_root); } void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev) { - debugfs_remove_recursive(dev->priv.eq_debugfs); + debugfs_remove_recursive(dev->priv.dbg.eq_debugfs); } static ssize_t average_read(struct file *filp, char __user *buf, size_t count, @@ -168,8 +174,8 @@ void mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev) const char *namep; int i; - cmd = &dev->priv.cmdif_debugfs; - *cmd = debugfs_create_dir("commands", dev->priv.dbg_root); + cmd = &dev->priv.dbg.cmdif_debugfs; + *cmd = debugfs_create_dir("commands", dev->priv.dbg.dbg_root); for (i = 0; i < MLX5_CMD_OP_MAX; i++) { stats = &dev->cmd.stats[i]; @@ -193,17 +199,17 @@ void mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev) void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev) { - debugfs_remove_recursive(dev->priv.cmdif_debugfs); + debugfs_remove_recursive(dev->priv.dbg.cmdif_debugfs); } void mlx5_cq_debugfs_init(struct mlx5_core_dev *dev) { - dev->priv.cq_debugfs = debugfs_create_dir("CQs", dev->priv.dbg_root); + dev->priv.dbg.cq_debugfs = debugfs_create_dir("CQs", dev->priv.dbg.dbg_root); } void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev) { - debugfs_remove_recursive(dev->priv.cq_debugfs); + debugfs_remove_recursive(dev->priv.dbg.cq_debugfs); } static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, @@ -448,7 +454,7 @@ int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) if (!mlx5_debugfs_root) return 0; - err = add_res_tree(dev, MLX5_DBG_RSC_QP, dev->priv.qp_debugfs, + err = add_res_tree(dev, MLX5_DBG_RSC_QP, dev->priv.dbg.qp_debugfs, &qp->dbg, qp->qpn, qp_fields, ARRAY_SIZE(qp_fields), qp); if (err) @@ -475,7 +481,7 @@ int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq) if (!mlx5_debugfs_root) return 0; - err = add_res_tree(dev, MLX5_DBG_RSC_EQ, dev->priv.eq_debugfs, + err = add_res_tree(dev, MLX5_DBG_RSC_EQ, dev->priv.dbg.eq_debugfs, &eq->dbg, eq->eqn, eq_fields, ARRAY_SIZE(eq_fields), eq); if (err) @@ -500,7 +506,7 @@ int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq) if (!mlx5_debugfs_root) return 0; - err = add_res_tree(dev, MLX5_DBG_RSC_CQ, dev->priv.cq_debugfs, + err = add_res_tree(dev, MLX5_DBG_RSC_CQ, dev->priv.dbg.cq_debugfs, &cq->dbg, cq->cqn, cq_fields, ARRAY_SIZE(cq_fields), cq); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 98be7050aa8d..d8d36477b97f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1487,8 +1487,8 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) INIT_LIST_HEAD(&priv->pgdir_list); priv->numa_node = dev_to_node(mlx5_core_dma_dev(dev)); - priv->dbg_root = debugfs_create_dir(dev_name(dev->device), - mlx5_debugfs_root); + priv->dbg.dbg_root = debugfs_create_dir(dev_name(dev->device), + mlx5_debugfs_root); INIT_LIST_HEAD(&priv->traps); err = mlx5_tout_init(dev); @@ -1524,7 +1524,7 @@ 
err_pagealloc_init: err_health_init: mlx5_tout_cleanup(dev); err_timeout_init: - debugfs_remove(dev->priv.dbg_root); + debugfs_remove(dev->priv.dbg.dbg_root); mutex_destroy(&priv->pgdir_mutex); mutex_destroy(&priv->alloc_mutex); mutex_destroy(&priv->bfregs.wc_head.lock); @@ -1542,7 +1542,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev) mlx5_pagealloc_cleanup(dev); mlx5_health_cleanup(dev); mlx5_tout_cleanup(dev); - debugfs_remove_recursive(dev->priv.dbg_root); + debugfs_remove_recursive(dev->priv.dbg.dbg_root); mutex_destroy(&priv->pgdir_mutex); mutex_destroy(&priv->alloc_mutex); mutex_destroy(&priv->bfregs.wc_head.lock); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c index 2784cd59fefe..2e8b109fb34e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_dbg.c @@ -630,7 +630,7 @@ void mlx5dr_dbg_init_dump(struct mlx5dr_domain *dmn) } dmn->dump_info.steering_debugfs = - debugfs_create_dir("steering", dev->priv.dbg_root); + debugfs_create_dir("steering", mlx5_debugfs_get_dev_root(dev)); dmn->dump_info.fdb_debugfs = debugfs_create_dir("fdb", dmn->dump_info.steering_debugfs); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f18c1e15a12c..aa539d245a12 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -551,6 +551,14 @@ struct mlx5_adev { int idx; }; +struct mlx5_debugfs_entries { + struct dentry *dbg_root; + struct dentry *qp_debugfs; + struct dentry *eq_debugfs; + struct dentry *cq_debugfs; + struct dentry *cmdif_debugfs; +}; + struct mlx5_ft_pool; struct mlx5_priv { /* IRQ table valid only for real pci devices PF or VF */ @@ -570,12 +578,7 @@ struct mlx5_priv { struct mlx5_core_health health; struct list_head traps; - /* start: qp staff */ - struct dentry *qp_debugfs; - struct dentry *eq_debugfs; - struct dentry *cq_debugfs; - struct dentry *cmdif_debugfs; - /* end: qp staff */ + struct mlx5_debugfs_entries dbg; /* start: alloc staff */ /* protect buffer allocation according to numa node */ @@ -585,7 +588,6 @@ struct mlx5_priv { struct mutex pgdir_mutex; struct list_head pgdir_list; /* end: alloc staff */ - struct dentry *dbg_root; struct list_head ctx_list; spinlock_t ctx_lock; @@ -1038,6 +1040,7 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); +struct dentry *mlx5_debugfs_get_dev_root(struct mlx5_core_dev *dev); void mlx5_qp_debugfs_init(struct mlx5_core_dev *dev); void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, -- cgit v1.2.3 From 4e05cbf05c66ca6931ee4f2a5aff0d84f1e65f40 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 27 Jan 2022 07:03:33 +0200 Subject: net/mlx5: Add pages debugfs Add pages debugfs to expose the following counters for debuggability: fw_pages_total - How many pages were given to FW and not returned yet. vfs_pages - For SRIOV, how many pages were given to FW for virtual functions usage. host_pf_pages - For ECPF, how many pages were given to FW for external hosts physical functions usage. 
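As a hedged sketch (not part of the patch), the counters can be polled from userspace; the file names follow the debugfs_create_u32() calls below, while the debugfs mount point and PCI address are assumptions:

#include <stdio.h>

static const char *names[] = { "fw_pages_total", "fw_pages_vfs", "fw_pages_host_pf" };

int main(void)
{
        char path[128];
        unsigned int val;
        int i;

        for (i = 0; i < 3; i++) {
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/kernel/debug/mlx5/0000:08:00.0/pages/%s", names[i]);
                f = fopen(path, "r");
                if (f && fscanf(f, "%u", &val) == 1)
                        printf("%s: %u\n", names[i], val);
                if (f)
                        fclose(f);
        }
        return 0;
}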
Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 17 +++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 2 ++ include/linux/mlx5/driver.h | 9 ++++++--- 3 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 883e44457367..8673ba2df910 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -212,6 +212,23 @@ void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev) debugfs_remove_recursive(dev->priv.dbg.cq_debugfs); } +void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev) +{ + struct dentry *pages; + + dev->priv.dbg.pages_debugfs = debugfs_create_dir("pages", dev->priv.dbg.dbg_root); + pages = dev->priv.dbg.pages_debugfs; + + debugfs_create_u32("fw_pages_total", 0400, pages, &dev->priv.fw_pages); + debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.vfs_pages); + debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.host_pf_pages); +} + +void mlx5_pages_debugfs_cleanup(struct mlx5_core_dev *dev) +{ + debugfs_remove_recursive(dev->priv.dbg.pages_debugfs); +} + static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, int index, int *is_str) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index d29a305858ad..8855fe71d480 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -714,12 +714,14 @@ int mlx5_pagealloc_init(struct mlx5_core_dev *dev) return -ENOMEM; xa_init(&dev->priv.page_root_xa); + mlx5_pages_debugfs_init(dev); return 0; } void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev) { + mlx5_pages_debugfs_cleanup(dev); xa_destroy(&dev->priv.page_root_xa); destroy_workqueue(dev->priv.pg_wq); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index aa539d245a12..c5f93b5910ed 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -557,6 +557,7 @@ struct mlx5_debugfs_entries { struct dentry *eq_debugfs; struct dentry *cq_debugfs; struct dentry *cmdif_debugfs; + struct dentry *pages_debugfs; }; struct mlx5_ft_pool; @@ -569,11 +570,11 @@ struct mlx5_priv { struct mlx5_nb pg_nb; struct workqueue_struct *pg_wq; struct xarray page_root_xa; - int fw_pages; + u32 fw_pages; atomic_t reg_pages; struct list_head free_list; - int vfs_pages; - int host_pf_pages; + u32 vfs_pages; + u32 host_pf_pages; struct mlx5_core_health health; struct list_head traps; @@ -1026,6 +1027,8 @@ int mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); void mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); +void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_pages_debugfs_cleanup(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, s32 npages, bool ec_function); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); -- cgit v1.2.3 From 32071187e9fb18da62f5be569bd2ea0d7a981ee8 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 27 Jan 2022 07:51:14 +0200 Subject: net/mlx5: Add debugfs counters for page commands failures Add the following new debugfs counters for debug and verbosity: fw_pages_alloc_failed 
- number of pages FW requested but the driver failed to allocate. give_pages_dropped - number of pages given to FW where the give pages command failed in FW. reclaim_pages_discard - number of pages that were about to be reclaimed back when FW failed the command. Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/debugfs.c | 4 ++++ drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 14 +++++++++++--- include/linux/mlx5/driver.h | 3 +++ 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 8673ba2df910..d69bac93a83b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -222,6 +222,10 @@ void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev) debugfs_create_u32("fw_pages_total", 0400, pages, &dev->priv.fw_pages); debugfs_create_u32("fw_pages_vfs", 0400, pages, &dev->priv.vfs_pages); debugfs_create_u32("fw_pages_host_pf", 0400, pages, &dev->priv.host_pf_pages); + debugfs_create_u32("fw_pages_alloc_failed", 0400, pages, &dev->priv.fw_pages_alloc_failed); + debugfs_create_u32("fw_pages_give_dropped", 0400, pages, &dev->priv.give_pages_dropped); + debugfs_create_u32("fw_pages_reclaim_discard", 0400, pages, + &dev->priv.reclaim_pages_discard); } void mlx5_pages_debugfs_cleanup(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 8855fe71d480..e0543b860144 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -352,8 +352,10 @@ retry: if (err) { if (err == -ENOMEM) err = alloc_system_page(dev, function); - if (err) + if (err) { + dev->priv.fw_pages_alloc_failed += (npages - i); goto out_4k; + } goto retry; } @@ -372,14 +374,14 @@ retry: /* if triggered by FW and failed by FW ignore */ if (event) { err = 0; - goto out_4k; + goto out_dropped; } } if (err) { err = mlx5_cmd_check(dev, err, in, out); mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", func_id, npages, err); - goto out_4k; + goto out_dropped; } dev->priv.fw_pages += npages; @@ -394,6 +396,8 @@ retry: kvfree(in); return 0; +out_dropped: + dev->priv.give_pages_dropped += npages; out_4k: for (i--; i >= 0; i--) free_4k(dev, MLX5_GET64(manage_pages_in, in, pas[i]), function); @@ -516,6 +520,10 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, mlx5_core_dbg(dev, "func 0x%x, npages %d, outlen %d\n", func_id, npages, outlen); err = reclaim_pages_cmd(dev, in, sizeof(in), out, outlen); + if (err) { + npages = MLX5_GET(manage_pages_in, in, input_num_entries); + dev->priv.reclaim_pages_discard += npages; + } /* if triggered by FW event and failed by FW then ignore */ if (event && err == -EREMOTEIO) err = 0; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c5f93b5910ed..00a914b0716e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -575,6 +575,9 @@ struct mlx5_priv { struct list_head free_list; u32 vfs_pages; u32 host_pf_pages; + u32 fw_pages_alloc_failed; + u32 give_pages_dropped; + u32 reclaim_pages_discard; struct mlx5_core_health health; struct list_head traps; -- cgit v1.2.3 From 5c422bfad2fbab96381273e50c7084465199501d Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 24 Feb 2022 00:58:58 +0200
Subject: net/mlx5: DR, Add support for matching on Internet Header Length (IHL) Add support for matching on new field - Internet Header Length (IHL). Signed-off-by: Muhammad Sammar Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c | 11 +++++++++-- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 4 +++- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 6 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index 38971fe1dfe1..1668c2b2f60f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -47,6 +47,11 @@ static bool dr_mask_is_ttl_set(struct mlx5dr_match_spec *spec) return spec->ttl_hoplimit; } +static bool dr_mask_is_ipv4_ihl_set(struct mlx5dr_match_spec *spec) +{ + return spec->ipv4_ihl; +} + #define DR_MASK_IS_L2_DST(_spec, _misc, _inner_outer) (_spec.first_vid || \ (_spec).first_cfi || (_spec).first_prio || (_spec).cvlan_tag || \ (_spec).svlan_tag || (_spec).dmac_47_16 || (_spec).dmac_15_0 || \ @@ -507,7 +512,8 @@ static int dr_matcher_set_ste_builders(struct mlx5dr_matcher *matcher, mlx5dr_ste_build_eth_l3_ipv4_5_tuple(ste_ctx, &sb[idx++], &mask, inner, rx); - if (dr_mask_is_ttl_set(&mask.outer)) + if (dr_mask_is_ttl_set(&mask.outer) || + dr_mask_is_ipv4_ihl_set(&mask.outer)) mlx5dr_ste_build_eth_l3_ipv4_misc(ste_ctx, &sb[idx++], &mask, inner, rx); } @@ -614,7 +620,8 @@ static int dr_matcher_set_ste_builders(struct mlx5dr_matcher *matcher, mlx5dr_ste_build_eth_l3_ipv4_5_tuple(ste_ctx, &sb[idx++], &mask, inner, rx); - if (dr_mask_is_ttl_set(&mask.inner)) + if (dr_mask_is_ttl_set(&mask.inner) || + dr_mask_is_ipv4_ihl_set(&mask.inner)) mlx5dr_ste_build_eth_l3_ipv4_misc(ste_ctx, &sb[idx++], &mask, inner, rx); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index 187e29b409b6..c7094fb10a7f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -793,6 +793,7 @@ static void dr_ste_copy_mask_spec(char *mask, struct mlx5dr_match_spec *spec, bo spec->tcp_sport = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, tcp_sport, clr); spec->tcp_dport = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, tcp_dport, clr); + spec->ipv4_ihl = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, ipv4_ihl, clr); spec->ttl_hoplimit = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, ttl_hoplimit, clr); spec->udp_sport = IFC_GET_CLR(fte_match_set_lyr_2_4, mask, udp_sport, clr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c index 2d62950f7a29..80424d1e3bb7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c @@ -1152,6 +1152,7 @@ dr_ste_v0_build_eth_l3_ipv4_misc_tag(struct mlx5dr_match_param *value, struct mlx5dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; DR_STE_SET_TAG(eth_l3_ipv4_misc, tag, time_to_live, spec, ttl_hoplimit); + DR_STE_SET_TAG(eth_l3_ipv4_misc, tag, ihl, spec, ipv4_ihl); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c index 6ca06800f1d9..d248a428f872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c @@ -1331,6 +1331,7 @@ static int dr_ste_v1_build_eth_l3_ipv4_misc_tag(struct mlx5dr_match_param *value struct mlx5dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; DR_STE_SET_TAG(eth_l3_ipv4_misc_v1, tag, time_to_live, spec, ttl_hoplimit); + DR_STE_SET_TAG(eth_l3_ipv4_misc_v1, tag, ihl, spec, ipv4_ihl); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 55fcb751e24a..02590f665174 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -555,7 +555,9 @@ struct mlx5dr_match_spec { */ u32 tcp_dport:16; - u32 reserved_auto1:24; + u32 reserved_auto1:16; + u32 ipv4_ihl:4; + u32 reserved_auto2:4; u32 ttl_hoplimit:8; /* UDP source port.;tcp and udp sport/dport are mutually exclusive */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 69985e4d8dfe..1f0c35162b7b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -493,7 +493,10 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 tcp_sport[0x10]; u8 tcp_dport[0x10]; - u8 reserved_at_c0[0x18]; + u8 reserved_at_c0[0x10]; + u8 ipv4_ihl[0x4]; + u8 reserved_at_c4[0x4]; + u8 ttl_hoplimit[0x8]; u8 udp_sport[0x10]; -- cgit v1.2.3 From 6862c787c7e88df490675ed781dc9052dba88a56 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Wed, 23 Feb 2022 18:40:59 +0200 Subject: net/mlx5: DR, Add support for ConnectX-7 steering Add support for a new SW format version that is implemented by ConnectX-7. Except for several differences, the STEv2 is identical to STEv1, so for most callbacks the STEv2 context struct will call STEv1 functions. 
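A hedged, abridged sketch of that reuse (the real table lives in the new dr_ste_v2.c; the callback set shown here is illustrative, not exhaustive):

/* STEv2: mostly STEv1 callbacks, overriding only what differs on ConnectX-7 */
static struct mlx5dr_ste_ctx ste_ctx_v2 = {
        .build_eth_l2_src_dst_init      = &dr_ste_v1_build_eth_l2_src_dst_init,
        .build_eth_l3_ipv6_src_init     = &dr_ste_v1_build_eth_l3_ipv6_src_init,
        .build_eth_l3_ipv6_dst_init     = &dr_ste_v1_build_eth_l3_ipv6_dst_init,
        .set_actions_tx                 = &dr_ste_v1_set_actions_tx,
        .set_actions_rx                 = &dr_ste_v1_set_actions_rx,
        /* ... ConnectX-7 specific entries get v2 variants ... */
};

struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v2(void)
{
        return &ste_ctx_v2;
}

This is why the patch below un-statics the STEv1 builders and action setters and moves their declarations into the new dr_ste_v1.h.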
Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 1 + .../mellanox/mlx5/core/steering/dr_domain.c | 2 +- .../mellanox/mlx5/core/steering/dr_matcher.c | 8 +- .../ethernet/mellanox/mlx5/core/steering/dr_ste.c | 2 + .../ethernet/mellanox/mlx5/core/steering/dr_ste.h | 1 + .../mellanox/mlx5/core/steering/dr_ste_v1.c | 202 +++++++++--------- .../mellanox/mlx5/core/steering/dr_ste_v1.h | 94 +++++++++ .../mellanox/mlx5/core/steering/dr_ste_v2.c | 231 +++++++++++++++++++++ .../ethernet/mellanox/mlx5/core/steering/fs_dr.c | 2 +- .../ethernet/mellanox/mlx5/core/steering/mlx5dr.h | 2 +- include/linux/mlx5/mlx5_ifc.h | 1 + 11 files changed, 436 insertions(+), 110 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.h create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v2.c (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index a7170ab3af97..b94cca45eb78 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -103,6 +103,7 @@ mlx5_core-$(CONFIG_MLX5_SW_STEERING) += steering/dr_domain.o steering/dr_table.o steering/dr_icm_pool.o steering/dr_buddy.o \ steering/dr_ste.o steering/dr_send.o \ steering/dr_ste_v0.o steering/dr_ste_v1.o \ + steering/dr_ste_v2.o \ steering/dr_cmd.o steering/dr_fw.o \ steering/dr_action.o steering/fs_dr.o \ steering/dr_dbg.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c index 5fa7f9d6d8b9..fc6ae49b5ecc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c @@ -8,7 +8,7 @@ #define DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, dmn_type) \ ((dmn)->info.caps.dmn_type##_sw_owner || \ ((dmn)->info.caps.dmn_type##_sw_owner_v2 && \ - (dmn)->info.caps.sw_format_ver <= MLX5_STEERING_FORMAT_CONNECTX_6DX)) + (dmn)->info.caps.sw_format_ver <= MLX5_STEERING_FORMAT_CONNECTX_7)) static void dr_domain_init_csum_recalc_fts(struct mlx5dr_domain *dmn) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c index 1668c2b2f60f..a4b5b415df90 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_matcher.c @@ -108,7 +108,7 @@ dr_mask_is_vxlan_gpe_set(struct mlx5dr_match_misc3 *misc3) static bool dr_matcher_supp_vxlan_gpe(struct mlx5dr_cmd_caps *caps) { - return (caps->sw_format_ver == MLX5_STEERING_FORMAT_CONNECTX_6DX) || + return (caps->sw_format_ver >= MLX5_STEERING_FORMAT_CONNECTX_6DX) || (caps->flex_protocols & MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED); } @@ -149,7 +149,7 @@ static bool dr_mask_is_tnl_geneve_tlv_opt_exist_set(struct mlx5dr_match_misc *mi static bool dr_matcher_supp_tnl_geneve(struct mlx5dr_cmd_caps *caps) { - return (caps->sw_format_ver == MLX5_STEERING_FORMAT_CONNECTX_6DX) || + return (caps->sw_format_ver >= MLX5_STEERING_FORMAT_CONNECTX_6DX) || (caps->flex_protocols & MLX5_FLEX_PARSER_GENEVE_ENABLED); } @@ -266,13 +266,13 @@ static bool dr_mask_is_tnl_gtpu_any(struct mlx5dr_match_param *mask, static int dr_matcher_supp_icmp_v4(struct mlx5dr_cmd_caps *caps) { - return (caps->sw_format_ver == MLX5_STEERING_FORMAT_CONNECTX_6DX) || + return (caps->sw_format_ver 
>= MLX5_STEERING_FORMAT_CONNECTX_6DX) || (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED); } static int dr_matcher_supp_icmp_v6(struct mlx5dr_cmd_caps *caps) { - return (caps->sw_format_ver == MLX5_STEERING_FORMAT_CONNECTX_6DX) || + return (caps->sw_format_ver >= MLX5_STEERING_FORMAT_CONNECTX_6DX) || (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V6_ENABLED); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c index b25df6ae8ef3..518e949847a3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c @@ -1367,6 +1367,8 @@ struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx(u8 version) return mlx5dr_ste_get_ctx_v0(); else if (version == MLX5_STEERING_FORMAT_CONNECTX_6DX) return mlx5dr_ste_get_ctx_v1(); + else if (version == MLX5_STEERING_FORMAT_CONNECTX_7) + return mlx5dr_ste_get_ctx_v2(); return NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.h index 9d9b073df75b..17513baff9b0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.h @@ -201,5 +201,6 @@ struct mlx5dr_ste_ctx { struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v0(void); struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v1(void); +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v2(void); #endif /* _DR_STE_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c index 2f6d0d62874f..fcb962c6db2e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c @@ -3,7 +3,7 @@ #include #include "mlx5_ifc_dr_ste_v1.h" -#include "dr_ste.h" +#include "dr_ste_v1.h" #define DR_STE_CALC_DFNR_TYPE(lookup_type, inner) \ ((inner) ? 
DR_STE_V1_LU_TYPE_##lookup_type##_I : \ @@ -262,7 +262,7 @@ static void dr_ste_v1_set_entry_type(u8 *hw_ste_p, u8 entry_type) MLX5_SET(ste_match_bwc_v1, hw_ste_p, entry_format, entry_type); } -static void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr) +void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr) { u64 index = miss_addr >> 6; @@ -270,7 +270,7 @@ static void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr) MLX5_SET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6, index); } -static u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p) +u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p) { u64 index = ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) | @@ -279,12 +279,12 @@ static u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p) return index << 6; } -static void dr_ste_v1_set_byte_mask(u8 *hw_ste_p, u16 byte_mask) +void dr_ste_v1_set_byte_mask(u8 *hw_ste_p, u16 byte_mask) { MLX5_SET(ste_match_bwc_v1, hw_ste_p, byte_mask, byte_mask); } -static u16 dr_ste_v1_get_byte_mask(u8 *hw_ste_p) +u16 dr_ste_v1_get_byte_mask(u8 *hw_ste_p) { return MLX5_GET(ste_match_bwc_v1, hw_ste_p, byte_mask); } @@ -295,13 +295,13 @@ static void dr_ste_v1_set_lu_type(u8 *hw_ste_p, u16 lu_type) MLX5_SET(ste_match_bwc_v1, hw_ste_p, match_definer_ctx_idx, lu_type & 0xFF); } -static void dr_ste_v1_set_next_lu_type(u8 *hw_ste_p, u16 lu_type) +void dr_ste_v1_set_next_lu_type(u8 *hw_ste_p, u16 lu_type) { MLX5_SET(ste_match_bwc_v1, hw_ste_p, next_entry_format, lu_type >> 8); MLX5_SET(ste_match_bwc_v1, hw_ste_p, hash_definer_ctx_idx, lu_type & 0xFF); } -static u16 dr_ste_v1_get_next_lu_type(u8 *hw_ste_p) +u16 dr_ste_v1_get_next_lu_type(u8 *hw_ste_p) { u8 mode = MLX5_GET(ste_match_bwc_v1, hw_ste_p, next_entry_format); u8 index = MLX5_GET(ste_match_bwc_v1, hw_ste_p, hash_definer_ctx_idx); @@ -314,7 +314,7 @@ static void dr_ste_v1_set_hit_gvmi(u8 *hw_ste_p, u16 gvmi) MLX5_SET(ste_match_bwc_v1, hw_ste_p, next_table_base_63_48, gvmi); } -static void dr_ste_v1_set_hit_addr(u8 *hw_ste_p, u64 icm_addr, u32 ht_size) +void dr_ste_v1_set_hit_addr(u8 *hw_ste_p, u64 icm_addr, u32 ht_size) { u64 index = (icm_addr >> 5) | ht_size; @@ -322,8 +322,7 @@ static void dr_ste_v1_set_hit_addr(u8 *hw_ste_p, u64 icm_addr, u32 ht_size) MLX5_SET(ste_match_bwc_v1, hw_ste_p, next_table_base_31_5_size, index); } -static void dr_ste_v1_init(u8 *hw_ste_p, u16 lu_type, - bool is_rx, u16 gvmi) +void dr_ste_v1_init(u8 *hw_ste_p, u16 lu_type, bool is_rx, u16 gvmi) { dr_ste_v1_set_lu_type(hw_ste_p, lu_type); dr_ste_v1_set_next_lu_type(hw_ste_p, MLX5DR_STE_LU_TYPE_DONT_CARE); @@ -333,8 +332,7 @@ static void dr_ste_v1_init(u8 *hw_ste_p, u16 lu_type, MLX5_SET(ste_match_bwc_v1, hw_ste_p, miss_address_63_48, gvmi); } -static void dr_ste_v1_prepare_for_postsend(u8 *hw_ste_p, - u32 ste_size) +void dr_ste_v1_prepare_for_postsend(u8 *hw_ste_p, u32 ste_size) { u8 *tag = hw_ste_p + DR_STE_SIZE_CTRL; u8 *mask = tag + DR_STE_SIZE_TAG; @@ -511,12 +509,12 @@ static void dr_ste_v1_arr_init_next_match(u8 **last_ste, memset(action, 0, MLX5_FLD_SZ_BYTES(ste_mask_and_match_v1, action)); } -static void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, - u8 *action_type_set, - u32 actions_caps, - u8 *last_ste, - struct mlx5dr_ste_actions_attr *attr, - u32 *added_stes) +void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, + u8 *action_type_set, + u32 actions_caps, + u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes) { u8 *action = MLX5_ADDR_OF(ste_match_bwc_v1, last_ste, action); u8 action_sz = DR_STE_ACTION_DOUBLE_SZ; @@ -635,12 +633,12 @@ static void 
dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, dr_ste_v1_set_hit_addr(last_ste, attr->final_icm_addr, 1); } -static void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, - u8 *action_type_set, - u32 actions_caps, - u8 *last_ste, - struct mlx5dr_ste_actions_attr *attr, - u32 *added_stes) +void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, + u8 *action_type_set, + u32 actions_caps, + u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, + u32 *added_stes) { u8 *action = MLX5_ADDR_OF(ste_match_bwc_v1, last_ste, action); u8 action_sz = DR_STE_ACTION_DOUBLE_SZ; @@ -808,11 +806,11 @@ static void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, dr_ste_v1_set_hit_addr(last_ste, attr->final_icm_addr, 1); } -static void dr_ste_v1_set_action_set(u8 *d_action, - u8 hw_field, - u8 shifter, - u8 length, - u32 data) +void dr_ste_v1_set_action_set(u8 *d_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data) { shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET; MLX5_SET(ste_double_action_set_v1, d_action, action_id, DR_STE_V1_ACTION_ID_SET); @@ -822,11 +820,11 @@ static void dr_ste_v1_set_action_set(u8 *d_action, MLX5_SET(ste_double_action_set_v1, d_action, inline_data, data); } -static void dr_ste_v1_set_action_add(u8 *d_action, - u8 hw_field, - u8 shifter, - u8 length, - u32 data) +void dr_ste_v1_set_action_add(u8 *d_action, + u8 hw_field, + u8 shifter, + u8 length, + u32 data) { shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET; MLX5_SET(ste_double_action_add_v1, d_action, action_id, DR_STE_V1_ACTION_ID_ADD); @@ -836,12 +834,12 @@ static void dr_ste_v1_set_action_add(u8 *d_action, MLX5_SET(ste_double_action_add_v1, d_action, add_value, data); } -static void dr_ste_v1_set_action_copy(u8 *d_action, - u8 dst_hw_field, - u8 dst_shifter, - u8 dst_len, - u8 src_hw_field, - u8 src_shifter) +void dr_ste_v1_set_action_copy(u8 *d_action, + u8 dst_hw_field, + u8 dst_shifter, + u8 dst_len, + u8 src_hw_field, + u8 src_shifter) { dst_shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET; src_shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET; @@ -856,11 +854,11 @@ static void dr_ste_v1_set_action_copy(u8 *d_action, #define DR_STE_DECAP_L3_ACTION_NUM 8 #define DR_STE_L2_HDR_MAX_SZ 20 -static int dr_ste_v1_set_action_decap_l3_list(void *data, - u32 data_sz, - u8 *hw_action, - u32 hw_action_sz, - u16 *used_hw_action_num) +int dr_ste_v1_set_action_decap_l3_list(void *data, + u32 data_sz, + u8 *hw_action, + u32 hw_action_sz, + u16 *used_hw_action_num) { u8 padded_data[DR_STE_L2_HDR_MAX_SZ] = {}; void *data_ptr = padded_data; @@ -985,8 +983,8 @@ static int dr_ste_v1_build_eth_l2_src_dst_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_eth_l2_src_dst_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_eth_l2_src_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_l2_src_dst_bit_mask(mask, sb->inner, sb->bit_mask); @@ -1009,8 +1007,8 @@ static int dr_ste_v1_build_eth_l3_ipv6_dst_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_eth_l3_ipv6_dst_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_eth_l3_ipv6_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_l3_ipv6_dst_tag(mask, sb, sb->bit_mask); @@ -1033,8 +1031,8 @@ static int dr_ste_v1_build_eth_l3_ipv6_src_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_eth_l3_ipv6_src_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param 
*mask) +void dr_ste_v1_build_eth_l3_ipv6_src_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_l3_ipv6_src_tag(mask, sb, sb->bit_mask); @@ -1068,8 +1066,8 @@ static int dr_ste_v1_build_eth_l3_ipv4_5_tuple_tag(struct mlx5dr_match_param *va return 0; } -static void dr_ste_v1_build_eth_l3_ipv4_5_tuple_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_eth_l3_ipv4_5_tuple_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_l3_ipv4_5_tuple_tag(mask, sb, sb->bit_mask); @@ -1209,8 +1207,8 @@ static int dr_ste_v1_build_eth_l2_src_tag(struct mlx5dr_match_param *value, return dr_ste_v1_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); } -static void dr_ste_v1_build_eth_l2_src_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_eth_l2_src_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_l2_src_bit_mask(mask, sb->inner, sb->bit_mask); @@ -1242,8 +1240,8 @@ static int dr_ste_v1_build_eth_l2_dst_tag(struct mlx5dr_match_param *value, return dr_ste_v1_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); } -static void dr_ste_v1_build_eth_l2_dst_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_eth_l2_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_l2_dst_bit_mask(mask, sb->inner, sb->bit_mask); @@ -1322,8 +1320,8 @@ static int dr_ste_v1_build_eth_l2_tnl_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_eth_l2_tnl_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_eth_l2_tnl_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_l2_tnl_bit_mask(mask, sb->inner, sb->bit_mask); @@ -1344,8 +1342,8 @@ static int dr_ste_v1_build_eth_l3_ipv4_misc_tag(struct mlx5dr_match_param *value return 0; } -static void dr_ste_v1_build_eth_l3_ipv4_misc_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_eth_l3_ipv4_misc_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_l3_ipv4_misc_tag(mask, sb, sb->bit_mask); @@ -1384,8 +1382,8 @@ static int dr_ste_v1_build_eth_ipv6_l3_l4_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_eth_ipv6_l3_l4_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_eth_ipv6_l3_l4_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_ipv6_l3_l4_tag(mask, sb, sb->bit_mask); @@ -1408,8 +1406,8 @@ static int dr_ste_v1_build_mpls_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_mpls_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_mpls_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_mpls_tag(mask, sb, sb->bit_mask); @@ -1435,8 +1433,8 @@ static int dr_ste_v1_build_tnl_gre_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_tnl_gre_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_tnl_gre_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_tnl_gre_tag(mask, sb, sb->bit_mask); @@ -1480,8 +1478,8 @@ static int dr_ste_v1_build_tnl_mpls_tag(struct mlx5dr_match_param *value, return 0; } -static void 
dr_ste_v1_build_tnl_mpls_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_tnl_mpls_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_tnl_mpls_tag(mask, sb, sb->bit_mask); @@ -1515,8 +1513,8 @@ static int dr_ste_v1_build_tnl_mpls_over_udp_tag(struct mlx5dr_match_param *valu return 0; } -static void dr_ste_v1_build_tnl_mpls_over_udp_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_tnl_mpls_over_udp_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_tnl_mpls_over_udp_tag(mask, sb, sb->bit_mask); @@ -1556,8 +1554,8 @@ static int dr_ste_v1_build_tnl_mpls_over_gre_tag(struct mlx5dr_match_param *valu return 0; } -static void dr_ste_v1_build_tnl_mpls_over_gre_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_tnl_mpls_over_gre_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_tnl_mpls_over_gre_tag(mask, sb, sb->bit_mask); @@ -1603,8 +1601,8 @@ static int dr_ste_v1_build_icmp_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_icmp_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_icmp_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_icmp_tag(mask, sb, sb->bit_mask); @@ -1625,8 +1623,8 @@ static int dr_ste_v1_build_general_purpose_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_general_purpose_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_general_purpose_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_general_purpose_tag(mask, sb, sb->bit_mask); @@ -1652,8 +1650,8 @@ static int dr_ste_v1_build_eth_l4_misc_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_eth_l4_misc_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_eth_l4_misc_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_eth_l4_misc_tag(mask, sb, sb->bit_mask); @@ -1682,9 +1680,8 @@ dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_tag(struct mlx5dr_match_param *value, return 0; } -static void -dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_tag(mask, sb, sb->bit_mask); @@ -1712,9 +1709,8 @@ dr_ste_v1_build_flex_parser_tnl_geneve_tag(struct mlx5dr_match_param *value, return 0; } -static void -dr_ste_v1_build_flex_parser_tnl_geneve_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_flex_parser_tnl_geneve_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_flex_parser_tnl_geneve_tag(mask, sb, sb->bit_mask); @@ -1735,8 +1731,8 @@ static int dr_ste_v1_build_tnl_header_0_1_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_tnl_header_0_1_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_tnl_header_0_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER; dr_ste_v1_build_tnl_header_0_1_tag(mask, sb, sb->bit_mask); @@ -1758,8 +1754,8 @@ static int 
dr_ste_v1_build_register_0_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_register_0_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_register_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_register_0_tag(mask, sb, sb->bit_mask); @@ -1782,8 +1778,8 @@ static int dr_ste_v1_build_register_1_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_register_1_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_register_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_register_1_tag(mask, sb, sb->bit_mask); @@ -1846,8 +1842,8 @@ static int dr_ste_v1_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_src_gvmi_qpn_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_src_gvmi_qpn_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask); @@ -1901,8 +1897,8 @@ static int dr_ste_v1_build_felx_parser_tag(struct mlx5dr_match_param *value, return 0; } -static void dr_ste_v1_build_flex_parser_0_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_flex_parser_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_0; dr_ste_v1_build_felx_parser_tag(mask, sb, sb->bit_mask); @@ -1910,8 +1906,8 @@ static void dr_ste_v1_build_flex_parser_0_init(struct mlx5dr_ste_build *sb, sb->ste_build_tag_func = &dr_ste_v1_build_felx_parser_tag; } -static void dr_ste_v1_build_flex_parser_1_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_flex_parser_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_1; dr_ste_v1_build_felx_parser_tag(mask, sb, sb->bit_mask); @@ -1935,7 +1931,7 @@ dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_tag(struct mlx5dr_match_param *va return 0; } -static void +void dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_init(struct mlx5dr_ste_build *sb, struct mlx5dr_match_param *mask) { @@ -1968,7 +1964,7 @@ dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_tag(struct mlx5dr_match_par return 0; } -static void +void dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_init(struct mlx5dr_ste_build *sb, struct mlx5dr_match_param *mask) { @@ -1991,8 +1987,8 @@ static int dr_ste_v1_build_flex_parser_tnl_gtpu_tag(struct mlx5dr_match_param *v return 0; } -static void dr_ste_v1_build_flex_parser_tnl_gtpu_init(struct mlx5dr_ste_build *sb, - struct mlx5dr_match_param *mask) +void dr_ste_v1_build_flex_parser_tnl_gtpu_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask) { dr_ste_v1_build_flex_parser_tnl_gtpu_tag(mask, sb, sb->bit_mask); @@ -2017,7 +2013,7 @@ dr_ste_v1_build_tnl_gtpu_flex_parser_0_tag(struct mlx5dr_match_param *value, return 0; } -static void +void dr_ste_v1_build_tnl_gtpu_flex_parser_0_init(struct mlx5dr_ste_build *sb, struct mlx5dr_match_param *mask) { @@ -2044,7 +2040,7 @@ dr_ste_v1_build_tnl_gtpu_flex_parser_1_tag(struct mlx5dr_match_param *value, return 0; } -static void +void dr_ste_v1_build_tnl_gtpu_flex_parser_1_init(struct mlx5dr_ste_build *sb, struct mlx5dr_match_param *mask) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.h 
b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.h new file mode 100644 index 000000000000..8a1d49790c6e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef _DR_STE_V1_ +#define _DR_STE_V1_ + +#include "dr_types.h" +#include "dr_ste.h" + +void dr_ste_v1_set_miss_addr(u8 *hw_ste_p, u64 miss_addr); +u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p); +void dr_ste_v1_set_byte_mask(u8 *hw_ste_p, u16 byte_mask); +u16 dr_ste_v1_get_byte_mask(u8 *hw_ste_p); +void dr_ste_v1_set_next_lu_type(u8 *hw_ste_p, u16 lu_type); +u16 dr_ste_v1_get_next_lu_type(u8 *hw_ste_p); +void dr_ste_v1_set_hit_addr(u8 *hw_ste_p, u64 icm_addr, u32 ht_size); +void dr_ste_v1_init(u8 *hw_ste_p, u16 lu_type, bool is_rx, u16 gvmi); +void dr_ste_v1_prepare_for_postsend(u8 *hw_ste_p, u32 ste_size); +void dr_ste_v1_set_actions_tx(struct mlx5dr_domain *dmn, u8 *action_type_set, + u32 actions_caps, u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, u32 *added_stes); +void dr_ste_v1_set_actions_rx(struct mlx5dr_domain *dmn, u8 *action_type_set, + u32 actions_caps, u8 *last_ste, + struct mlx5dr_ste_actions_attr *attr, u32 *added_stes); +void dr_ste_v1_set_action_set(u8 *d_action, u8 hw_field, u8 shifter, + u8 length, u32 data); +void dr_ste_v1_set_action_add(u8 *d_action, u8 hw_field, u8 shifter, + u8 length, u32 data); +void dr_ste_v1_set_action_copy(u8 *d_action, u8 dst_hw_field, u8 dst_shifter, + u8 dst_len, u8 src_hw_field, u8 src_shifter); +int dr_ste_v1_set_action_decap_l3_list(void *data, u32 data_sz, u8 *hw_action, + u32 hw_action_sz, u16 *used_hw_action_num); +void dr_ste_v1_build_eth_l2_src_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_eth_l3_ipv6_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_eth_l3_ipv6_src_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_eth_l3_ipv4_5_tuple_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_eth_l2_src_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_eth_l2_dst_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_eth_l2_tnl_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_eth_l3_ipv4_misc_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_eth_ipv6_l3_l4_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_mpls_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_tnl_gre_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_tnl_mpls_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_tnl_mpls_over_udp_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_tnl_mpls_over_gre_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_icmp_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_general_purpose_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_eth_l4_misc_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void 
dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_flex_parser_tnl_geneve_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_tnl_header_0_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_register_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_register_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_src_gvmi_qpn_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_flex_parser_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_flex_parser_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_flex_parser_tnl_gtpu_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_tnl_gtpu_flex_parser_0_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); +void dr_ste_v1_build_tnl_gtpu_flex_parser_1_init(struct mlx5dr_ste_build *sb, + struct mlx5dr_match_param *mask); + +#endif /* _DR_STE_V1_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v2.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v2.c new file mode 100644 index 000000000000..c60fddd125d2 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v2.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
*/ + +#include "dr_ste_v1.h" + +enum { + DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_0 = 0x00, + DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1 = 0x01, + DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_2 = 0x02, + DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_0 = 0x08, + DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_1 = 0x09, + DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0 = 0x0e, + DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0 = 0x18, + DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_1 = 0x19, + DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_0 = 0x40, + DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_1 = 0x41, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_0 = 0x44, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_1 = 0x45, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_2 = 0x46, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_3 = 0x47, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_0 = 0x4c, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_1 = 0x4d, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_2 = 0x4e, + DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_3 = 0x4f, + DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_0 = 0x5e, + DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_1 = 0x5f, + DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_0 = 0x6f, + DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_1 = 0x70, + DR_STE_V2_ACTION_MDFY_FLD_METADATA_2_CQE = 0x7b, + DR_STE_V2_ACTION_MDFY_FLD_GNRL_PURPOSE = 0x7c, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_0 = 0x90, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_1 = 0x91, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_0 = 0x92, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_1 = 0x93, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_0 = 0x94, + DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_1 = 0x95, +}; + +static const struct mlx5dr_ste_action_modify_field dr_ste_v2_action_modify_field_arr[] = { + [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_SRC_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 18, .end = 23, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_1, .start = 16, .end = 24, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { + .hw_field 
= DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_SRC_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV6_DST_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_IPV4_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_A] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_GNRL_PURPOSE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_B] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_METADATA_2_CQE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_1] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_0_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_2] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_3] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_1_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_4] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_METADATA_REG_C_5] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_REGISTER_2_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_TCP_MISC_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_L2_OUT_2, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_EMD_31_0] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_EMD_47_32] = { + .hw_field = DR_STE_V2_ACTION_MDFY_FLD_CFG_HDR_0_0, .start = 0, .end = 15, + }, +}; + +static struct mlx5dr_ste_ctx ste_ctx_v2 = { + /* Builders */ + .build_eth_l2_src_dst_init = &dr_ste_v1_build_eth_l2_src_dst_init, + .build_eth_l3_ipv6_src_init = &dr_ste_v1_build_eth_l3_ipv6_src_init, + .build_eth_l3_ipv6_dst_init = 
&dr_ste_v1_build_eth_l3_ipv6_dst_init, + .build_eth_l3_ipv4_5_tuple_init = &dr_ste_v1_build_eth_l3_ipv4_5_tuple_init, + .build_eth_l2_src_init = &dr_ste_v1_build_eth_l2_src_init, + .build_eth_l2_dst_init = &dr_ste_v1_build_eth_l2_dst_init, + .build_eth_l2_tnl_init = &dr_ste_v1_build_eth_l2_tnl_init, + .build_eth_l3_ipv4_misc_init = &dr_ste_v1_build_eth_l3_ipv4_misc_init, + .build_eth_ipv6_l3_l4_init = &dr_ste_v1_build_eth_ipv6_l3_l4_init, + .build_mpls_init = &dr_ste_v1_build_mpls_init, + .build_tnl_gre_init = &dr_ste_v1_build_tnl_gre_init, + .build_tnl_mpls_init = &dr_ste_v1_build_tnl_mpls_init, + .build_tnl_mpls_over_udp_init = &dr_ste_v1_build_tnl_mpls_over_udp_init, + .build_tnl_mpls_over_gre_init = &dr_ste_v1_build_tnl_mpls_over_gre_init, + .build_icmp_init = &dr_ste_v1_build_icmp_init, + .build_general_purpose_init = &dr_ste_v1_build_general_purpose_init, + .build_eth_l4_misc_init = &dr_ste_v1_build_eth_l4_misc_init, + .build_tnl_vxlan_gpe_init = &dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_init, + .build_tnl_geneve_init = &dr_ste_v1_build_flex_parser_tnl_geneve_init, + .build_tnl_geneve_tlv_opt_init = &dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_init, + .build_tnl_geneve_tlv_opt_exist_init = + &dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_init, + .build_register_0_init = &dr_ste_v1_build_register_0_init, + .build_register_1_init = &dr_ste_v1_build_register_1_init, + .build_src_gvmi_qpn_init = &dr_ste_v1_build_src_gvmi_qpn_init, + .build_flex_parser_0_init = &dr_ste_v1_build_flex_parser_0_init, + .build_flex_parser_1_init = &dr_ste_v1_build_flex_parser_1_init, + .build_tnl_gtpu_init = &dr_ste_v1_build_flex_parser_tnl_gtpu_init, + .build_tnl_header_0_1_init = &dr_ste_v1_build_tnl_header_0_1_init, + .build_tnl_gtpu_flex_parser_0_init = &dr_ste_v1_build_tnl_gtpu_flex_parser_0_init, + .build_tnl_gtpu_flex_parser_1_init = &dr_ste_v1_build_tnl_gtpu_flex_parser_1_init, + + /* Getters and Setters */ + .ste_init = &dr_ste_v1_init, + .set_next_lu_type = &dr_ste_v1_set_next_lu_type, + .get_next_lu_type = &dr_ste_v1_get_next_lu_type, + .set_miss_addr = &dr_ste_v1_set_miss_addr, + .get_miss_addr = &dr_ste_v1_get_miss_addr, + .set_hit_addr = &dr_ste_v1_set_hit_addr, + .set_byte_mask = &dr_ste_v1_set_byte_mask, + .get_byte_mask = &dr_ste_v1_get_byte_mask, + + /* Actions */ + .actions_caps = DR_STE_CTX_ACTION_CAP_TX_POP | + DR_STE_CTX_ACTION_CAP_RX_PUSH | + DR_STE_CTX_ACTION_CAP_RX_ENCAP, + .set_actions_rx = &dr_ste_v1_set_actions_rx, + .set_actions_tx = &dr_ste_v1_set_actions_tx, + .modify_field_arr_sz = ARRAY_SIZE(dr_ste_v2_action_modify_field_arr), + .modify_field_arr = dr_ste_v2_action_modify_field_arr, + .set_action_set = &dr_ste_v1_set_action_set, + .set_action_add = &dr_ste_v1_set_action_add, + .set_action_copy = &dr_ste_v1_set_action_copy, + .set_action_decap_l3_list = &dr_ste_v1_set_action_decap_l3_list, + + /* Send */ + .prepare_for_postsend = &dr_ste_v1_prepare_for_postsend, +}; + +struct mlx5dr_ste_ctx *mlx5dr_ste_get_ctx_v2(void) +{ + return &ste_ctx_v2; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 57ffcab7294c..045b0cf90063 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -758,7 +758,7 @@ static u32 mlx5_cmd_dr_get_capabilities(struct mlx5_flow_root_namespace *ns, enum fs_flow_table_type ft_type) { if (ft_type != FS_FT_FDB || - MLX5_CAP_GEN(ns->dev, steering_format_version) != 
MLX5_STEERING_FORMAT_CONNECTX_6DX) + MLX5_CAP_GEN(ns->dev, steering_format_version) == MLX5_STEERING_FORMAT_CONNECTX_5) return 0; return MLX5_FLOW_STEERING_CAP_VLAN_PUSH_ON_RX | MLX5_FLOW_STEERING_CAP_VLAN_POP_ON_TX; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h index dfa223415fe2..03efbdf3fec3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/mlx5dr.h @@ -136,7 +136,7 @@ mlx5dr_is_supported(struct mlx5_core_dev *dev) (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner) || (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, sw_owner_v2) && (MLX5_CAP_GEN(dev, steering_format_version) <= - MLX5_STEERING_FORMAT_CONNECTX_6DX))); + MLX5_STEERING_FORMAT_CONNECTX_7))); } /* buddy functions & structure */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1f0c35162b7b..bcf60ede6fcc 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1346,6 +1346,7 @@ enum mlx5_fc_bulk_alloc_bitmask { enum { MLX5_STEERING_FORMAT_CONNECTX_5 = 0, MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, + MLX5_STEERING_FORMAT_CONNECTX_7 = 2, }; struct mlx5_ifc_cmd_hca_cap_bits { -- cgit v1.2.3 From 013a3e7c79ac51e6cdd84e3580863a562c7597c7 Mon Sep 17 00:00:00 2001 From: Min Li Date: Tue, 8 Mar 2022 09:10:51 -0500 Subject: ptp: idt82p33: use rsmu driver to access i2c/spi bus The rsmu (Renesas Synchronization Management Unit) driver is located in drivers/mfd and is responsible for creating multiple devices, including the idt82p33 PHC, which then uses the exposed regmap and mutex handle to access the i2c/spi bus. Signed-off-by: Min Li Acked-by: Richard Cochran Link: https://lore.kernel.org/r/1646748651-16811-1-git-send-email-min.li.xe@renesas.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_idt82p33.c | 344 ++++++++++----------------------------- drivers/ptp/ptp_idt82p33.h | 151 +++++------------ include/linux/mfd/idt82p33_reg.h | 3 + 3 files changed, 129 insertions(+), 369 deletions(-) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_idt82p33.c b/drivers/ptp/ptp_idt82p33.c index c1c959f7e52b..97c1be44e323 100644 --- a/drivers/ptp/ptp_idt82p33.c +++ b/drivers/ptp/ptp_idt82p33.c @@ -6,13 +6,17 @@ #define pr_fmt(fmt) "IDT_82p33xxx: " fmt #include -#include +#include #include #include #include +#include #include #include #include +#include +#include +#include #include "ptp_private.h" #include "ptp_idt82p33.h" @@ -24,15 +28,25 @@ MODULE_LICENSE("GPL"); MODULE_FIRMWARE(FW_FILENAME); /* Module Parameters */ -static u32 sync_tod_timeout = SYNC_TOD_TIMEOUT_SEC; -module_param(sync_tod_timeout, uint, 0); -MODULE_PARM_DESC(sync_tod_timeout, -"duration in second to keep SYNC_TOD on (set to 0 to keep it always on)"); - static u32 phase_snap_threshold = SNAP_THRESHOLD_NS; module_param(phase_snap_threshold, uint, 0); MODULE_PARM_DESC(phase_snap_threshold, -"threshold (150000ns by default) below which adjtime would ignore"); +"threshold (10000ns by default) below which adjtime would use double dco"); + +static char *firmware; +module_param(firmware, charp, 0); + +static inline int idt82p33_read(struct idt82p33 *idt82p33, u16 regaddr, + u8 *buf, u16 count) +{ + return regmap_bulk_read(idt82p33->regmap, regaddr, buf, count); +} + +static inline int idt82p33_write(struct idt82p33 *idt82p33, u16 regaddr, + u8 *buf, u16 count) +{ + return regmap_bulk_write(idt82p33->regmap, regaddr, buf, count); +} static void idt82p33_byte_array_to_timespec(struct timespec64 *ts, u8
buf[TOD_BYTE_COUNT]) @@ -78,110 +92,6 @@ static void idt82p33_timespec_to_byte_array(struct timespec64 const *ts, } } -static int idt82p33_xfer_read(struct idt82p33 *idt82p33, - unsigned char regaddr, - unsigned char *buf, - unsigned int count) -{ - struct i2c_client *client = idt82p33->client; - struct i2c_msg msg[2]; - int cnt; - - msg[0].addr = client->addr; - msg[0].flags = 0; - msg[0].len = 1; - msg[0].buf = ®addr; - - msg[1].addr = client->addr; - msg[1].flags = I2C_M_RD; - msg[1].len = count; - msg[1].buf = buf; - - cnt = i2c_transfer(client->adapter, msg, 2); - if (cnt < 0) { - dev_err(&client->dev, "i2c_transfer returned %d\n", cnt); - return cnt; - } else if (cnt != 2) { - dev_err(&client->dev, - "i2c_transfer sent only %d of %d messages\n", cnt, 2); - return -EIO; - } - return 0; -} - -static int idt82p33_xfer_write(struct idt82p33 *idt82p33, - u8 regaddr, - u8 *buf, - u16 count) -{ - struct i2c_client *client = idt82p33->client; - /* we add 1 byte for device register */ - u8 msg[IDT82P33_MAX_WRITE_COUNT + 1]; - int err; - - if (count > IDT82P33_MAX_WRITE_COUNT) - return -EINVAL; - - msg[0] = regaddr; - memcpy(&msg[1], buf, count); - - err = i2c_master_send(client, msg, count + 1); - if (err < 0) { - dev_err(&client->dev, "i2c_master_send returned %d\n", err); - return err; - } - - return 0; -} - -static int idt82p33_page_offset(struct idt82p33 *idt82p33, unsigned char val) -{ - int err; - - if (idt82p33->page_offset == val) - return 0; - - err = idt82p33_xfer_write(idt82p33, PAGE_ADDR, &val, sizeof(val)); - if (err) - dev_err(&idt82p33->client->dev, - "failed to set page offset %d\n", val); - else - idt82p33->page_offset = val; - - return err; -} - -static int idt82p33_rdwr(struct idt82p33 *idt82p33, unsigned int regaddr, - unsigned char *buf, unsigned int count, bool write) -{ - u8 offset, page; - int err; - - page = _PAGE(regaddr); - offset = _OFFSET(regaddr); - - err = idt82p33_page_offset(idt82p33, page); - if (err) - return err; - - if (write) - return idt82p33_xfer_write(idt82p33, offset, buf, count); - - return idt82p33_xfer_read(idt82p33, offset, buf, count); -} - -static int idt82p33_read(struct idt82p33 *idt82p33, unsigned int regaddr, - unsigned char *buf, unsigned int count) -{ - return idt82p33_rdwr(idt82p33, regaddr, buf, count, false); -} - -static int idt82p33_write(struct idt82p33 *idt82p33, unsigned int regaddr, - unsigned char *buf, unsigned int count) -{ - return idt82p33_rdwr(idt82p33, regaddr, buf, count, true); -} - static int idt82p33_dpll_set_mode(struct idt82p33_channel *channel, enum pll_mode mode) { @@ -206,7 +116,7 @@ static int idt82p33_dpll_set_mode(struct idt82p33_channel *channel, if (err) return err; - channel->pll_mode = dpll_mode; + channel->pll_mode = mode; return 0; } @@ -467,7 +377,7 @@ static int idt82p33_measure_tod_write_overhead(struct idt82p33_channel *channel) err = idt82p33_measure_settime_gettime_gap_overhead(channel, &gap_ns); if (err) { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Failed in %s with err %d!\n", __func__, err); return err; } @@ -499,8 +409,8 @@ static int idt82p33_check_and_set_masks(struct idt82p33 *idt82p33, if (page == PLLMASK_ADDR_HI && offset == PLLMASK_ADDR_LO) { if ((val & 0xfc) || !(val & 0x3)) { - dev_err(&idt82p33->client->dev, - "Invalid PLL mask 0x%hhx\n", val); + dev_err(idt82p33->dev, + "Invalid PLL mask 0x%x\n", val); err = -EINVAL; } else { idt82p33->pll_mask = val; @@ -520,14 +430,14 @@ static void idt82p33_display_masks(struct idt82p33 *idt82p33) { u8 mask, i; - 
dev_info(&idt82p33->client->dev, + dev_info(idt82p33->dev, "pllmask = 0x%02x\n", idt82p33->pll_mask); for (i = 0; i < MAX_PHC_PLL; i++) { mask = 1 << i; if (mask & idt82p33->pll_mask) - dev_info(&idt82p33->client->dev, + dev_info(idt82p33->dev, "PLL%d output_mask = 0x%04x\n", i, idt82p33->channel[i].output_mask); } @@ -539,11 +449,6 @@ static int idt82p33_sync_tod(struct idt82p33_channel *channel, bool enable) u8 sync_cnfg; int err; - /* Turn it off after sync_tod_timeout seconds */ - if (enable && sync_tod_timeout) - ptp_schedule_worker(channel->ptp_clock, - sync_tod_timeout * HZ); - err = idt82p33_read(idt82p33, channel->dpll_sync_cnfg, &sync_cnfg, sizeof(sync_cnfg)); if (err) @@ -557,22 +462,6 @@ static int idt82p33_sync_tod(struct idt82p33_channel *channel, bool enable) &sync_cnfg, sizeof(sync_cnfg)); } -static long idt82p33_sync_tod_work_handler(struct ptp_clock_info *ptp) -{ - struct idt82p33_channel *channel = - container_of(ptp, struct idt82p33_channel, caps); - struct idt82p33 *idt82p33 = channel->idt82p33; - - mutex_lock(&idt82p33->reg_lock); - - (void)idt82p33_sync_tod(channel, false); - - mutex_unlock(&idt82p33->reg_lock); - - /* Return a negative value here to not reschedule */ - return -1; -} - static int idt82p33_output_enable(struct idt82p33_channel *channel, bool enable, unsigned int outn) { @@ -634,18 +523,11 @@ static int idt82p33_enable_tod(struct idt82p33_channel *channel) struct idt82p33 *idt82p33 = channel->idt82p33; struct timespec64 ts = {0, 0}; int err; - u8 val; - - val = 0; - err = idt82p33_write(idt82p33, channel->dpll_input_mode_cnfg, - &val, sizeof(val)); - if (err) - return err; err = idt82p33_measure_tod_write_overhead(channel); if (err) { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Failed in %s with err %d!\n", __func__, err); return err; } @@ -673,16 +555,14 @@ static void idt82p33_ptp_clock_unregister_all(struct idt82p33 *idt82p33) } static int idt82p33_enable(struct ptp_clock_info *ptp, - struct ptp_clock_request *rq, int on) + struct ptp_clock_request *rq, int on) { struct idt82p33_channel *channel = container_of(ptp, struct idt82p33_channel, caps); struct idt82p33 *idt82p33 = channel->idt82p33; - int err; - - err = -EOPNOTSUPP; + int err = -EOPNOTSUPP; - mutex_lock(&idt82p33->reg_lock); + mutex_lock(idt82p33->lock); if (rq->type == PTP_CLK_REQ_PEROUT) { if (!on) @@ -690,15 +570,18 @@ static int idt82p33_enable(struct ptp_clock_info *ptp, &rq->perout); /* Only accept a 1-PPS aligned to the second. 
*/ else if (rq->perout.start.nsec || rq->perout.period.sec != 1 || - rq->perout.period.nsec) { + rq->perout.period.nsec) err = -ERANGE; - } else + else err = idt82p33_perout_enable(channel, true, &rq->perout); } - mutex_unlock(&idt82p33->reg_lock); + mutex_unlock(idt82p33->lock); + if (err) + dev_err(idt82p33->dev, + "Failed in %s with err %d!\n", __func__, err); return err; } @@ -727,11 +610,11 @@ static int idt82p33_adjwritephase(struct ptp_clock_info *ptp, s32 offset_ns) val[3] = (offset_regval >> 24) & 0x1F; val[3] |= PH_OFFSET_EN; - mutex_lock(&idt82p33->reg_lock); + mutex_lock(idt82p33->lock); err = idt82p33_dpll_set_mode(channel, PLL_MODE_WPH); if (err) { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Failed in %s with err %d!\n", __func__, err); goto out; } @@ -740,7 +623,7 @@ static int idt82p33_adjwritephase(struct ptp_clock_info *ptp, s32 offset_ns) sizeof(val)); out: - mutex_unlock(&idt82p33->reg_lock); + mutex_unlock(idt82p33->lock); return err; } @@ -751,12 +634,12 @@ static int idt82p33_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) struct idt82p33 *idt82p33 = channel->idt82p33; int err; - mutex_lock(&idt82p33->reg_lock); + mutex_lock(idt82p33->lock); err = _idt82p33_adjfine(channel, scaled_ppm); + mutex_unlock(idt82p33->lock); if (err) - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Failed in %s with err %d!\n", __func__, err); - mutex_unlock(&idt82p33->reg_lock); return err; } @@ -768,29 +651,20 @@ static int idt82p33_adjtime(struct ptp_clock_info *ptp, s64 delta_ns) struct idt82p33 *idt82p33 = channel->idt82p33; int err; - mutex_lock(&idt82p33->reg_lock); + mutex_lock(idt82p33->lock); if (abs(delta_ns) < phase_snap_threshold) { - mutex_unlock(&idt82p33->reg_lock); + mutex_unlock(idt82p33->lock); return 0; } err = _idt82p33_adjtime(channel, delta_ns); - if (err) { - mutex_unlock(&idt82p33->reg_lock); - dev_err(&idt82p33->client->dev, - "Adjtime failed in %s with err %d!\n", __func__, err); - return err; - } + mutex_unlock(idt82p33->lock); - err = idt82p33_sync_tod(channel, true); if (err) - dev_err(&idt82p33->client->dev, - "Sync_tod failed in %s with err %d!\n", __func__, err); - - mutex_unlock(&idt82p33->reg_lock); - + dev_err(idt82p33->dev, + "Failed in %s with err %d!\n", __func__, err); return err; } @@ -801,31 +675,31 @@ static int idt82p33_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) struct idt82p33 *idt82p33 = channel->idt82p33; int err; - mutex_lock(&idt82p33->reg_lock); + mutex_lock(idt82p33->lock); err = _idt82p33_gettime(channel, ts); + mutex_unlock(idt82p33->lock); + if (err) - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Failed in %s with err %d!\n", __func__, err); - mutex_unlock(&idt82p33->reg_lock); - return err; } static int idt82p33_settime(struct ptp_clock_info *ptp, - const struct timespec64 *ts) + const struct timespec64 *ts) { struct idt82p33_channel *channel = container_of(ptp, struct idt82p33_channel, caps); struct idt82p33 *idt82p33 = channel->idt82p33; int err; - mutex_lock(&idt82p33->reg_lock); + mutex_lock(idt82p33->lock); err = _idt82p33_settime(channel, ts); + mutex_unlock(idt82p33->lock); + if (err) - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Failed in %s with err %d!\n", __func__, err); - mutex_unlock(&idt82p33->reg_lock); - return err; } @@ -864,7 +738,7 @@ static int idt82p33_channel_init(struct idt82p33_channel *channel, int index) static void idt82p33_caps_init(struct ptp_clock_info *caps) { caps->owner = THIS_MODULE; - caps->max_adj = 92000; + caps->max_adj = 
DCO_MAX_PPB; caps->n_per_out = 11; caps->adjphase = idt82p33_adjwritephase; caps->adjfine = idt82p33_adjfine; @@ -872,7 +746,6 @@ static void idt82p33_caps_init(struct ptp_clock_info *caps) caps->gettime64 = idt82p33_gettime; caps->settime64 = idt82p33_settime; caps->enable = idt82p33_enable; - caps->do_aux_work = idt82p33_sync_tod_work_handler; } static int idt82p33_enable_channel(struct idt82p33 *idt82p33, u32 index) @@ -887,7 +760,7 @@ static int idt82p33_enable_channel(struct idt82p33 *idt82p33, u32 index) err = idt82p33_channel_init(channel, index); if (err) { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Channel_init failed in %s with err %d!\n", __func__, err); return err; @@ -912,7 +785,7 @@ static int idt82p33_enable_channel(struct idt82p33 *idt82p33, u32 index) err = idt82p33_dpll_set_mode(channel, PLL_MODE_DCO); if (err) { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Dpll_set_mode failed in %s with err %d!\n", __func__, err); return err; @@ -920,13 +793,13 @@ static int idt82p33_enable_channel(struct idt82p33 *idt82p33, u32 index) err = idt82p33_enable_tod(channel); if (err) { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Enable_tod failed in %s with err %d!\n", __func__, err); return err; } - dev_info(&idt82p33->client->dev, "PLL%d registered as ptp%d\n", + dev_info(idt82p33->dev, "PLL%d registered as ptp%d\n", index, channel->ptp_clock->index); return 0; @@ -940,25 +813,24 @@ static int idt82p33_load_firmware(struct idt82p33 *idt82p33) int err; s32 len; - dev_dbg(&idt82p33->client->dev, - "requesting firmware '%s'\n", FW_FILENAME); + dev_dbg(idt82p33->dev, "requesting firmware '%s'\n", FW_FILENAME); - err = request_firmware(&fw, FW_FILENAME, &idt82p33->client->dev); + err = request_firmware(&fw, FW_FILENAME, idt82p33->dev); if (err) { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Failed in %s with err %d!\n", __func__, err); return err; } - dev_dbg(&idt82p33->client->dev, "firmware size %zu bytes\n", fw->size); + dev_dbg(idt82p33->dev, "firmware size %zu bytes\n", fw->size); rec = (struct idt82p33_fwrc *) fw->data; for (len = fw->size; len > 0; len -= sizeof(*rec)) { if (rec->reserved) { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "bad firmware, reserved field non-zero\n"); err = -EINVAL; } else { @@ -973,16 +845,11 @@ static int idt82p33_load_firmware(struct idt82p33 *idt82p33) } if (err == 0) { - /* maximum 8 pages */ - if (page >= PAGE_NUM) - continue; - /* Page size 128, last 4 bytes of page skipped */ - if (((loaddr > 0x7b) && (loaddr <= 0x7f)) - || loaddr > 0xfb) + if (loaddr > 0x7b) continue; - err = idt82p33_write(idt82p33, _ADDR(page, loaddr), + err = idt82p33_write(idt82p33, REG_ADDR(page, loaddr), &val, sizeof(val)); } @@ -997,36 +864,34 @@ out: } -static int idt82p33_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int idt82p33_probe(struct platform_device *pdev) { + struct rsmu_ddata *ddata = dev_get_drvdata(pdev->dev.parent); struct idt82p33 *idt82p33; int err; u8 i; - (void)id; - - idt82p33 = devm_kzalloc(&client->dev, + idt82p33 = devm_kzalloc(&pdev->dev, sizeof(struct idt82p33), GFP_KERNEL); if (!idt82p33) return -ENOMEM; - mutex_init(&idt82p33->reg_lock); - - idt82p33->client = client; - idt82p33->page_offset = 0xff; + idt82p33->dev = &pdev->dev; + idt82p33->mfd = pdev->dev.parent; + idt82p33->lock = &ddata->lock; + idt82p33->regmap = ddata->regmap; idt82p33->tod_write_overhead_ns = 0; idt82p33->calculate_overhead_flag = 0; idt82p33->pll_mask = DEFAULT_PLL_MASK; 
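	/*
	 * Background for the ddata hand-off above (a sketch inferred from
	 * this patch's usage, not code carried by the patch itself): the
	 * parent rsmu MFD driver in drivers/mfd is assumed to publish a
	 * drvdata layout roughly like
	 *
	 *	struct rsmu_ddata {
	 *		struct device *dev;
	 *		struct regmap *regmap;
	 *		struct mutex lock;
	 *	};
	 *
	 * so this PHC child borrows ddata->regmap for bus-agnostic register
	 * access and &ddata->lock to serialize i2c/spi transactions instead
	 * of owning the transfer code itself.
	 */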
idt82p33->channel[0].output_mask = DEFAULT_OUTPUT_MASK_PLL0; idt82p33->channel[1].output_mask = DEFAULT_OUTPUT_MASK_PLL1; - mutex_lock(&idt82p33->reg_lock); + mutex_lock(idt82p33->lock); err = idt82p33_load_firmware(idt82p33); if (err) - dev_warn(&idt82p33->client->dev, + dev_warn(idt82p33->dev, "loading firmware failed with %d\n", err); if (idt82p33->pll_mask) { @@ -1034,7 +899,7 @@ static int idt82p33_probe(struct i2c_client *client, if (idt82p33->pll_mask & (1 << i)) { err = idt82p33_enable_channel(idt82p33, i); if (err) { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "Failed in %s with err %d!\n", __func__, err); break; @@ -1042,69 +907,38 @@ static int idt82p33_probe(struct i2c_client *client, } } } else { - dev_err(&idt82p33->client->dev, + dev_err(idt82p33->dev, "no PLLs flagged as PHCs, nothing to do\n"); err = -ENODEV; } - mutex_unlock(&idt82p33->reg_lock); + mutex_unlock(idt82p33->lock); if (err) { idt82p33_ptp_clock_unregister_all(idt82p33); return err; } - i2c_set_clientdata(client, idt82p33); + platform_set_drvdata(pdev, idt82p33); return 0; } -static int idt82p33_remove(struct i2c_client *client) +static int idt82p33_remove(struct platform_device *pdev) { - struct idt82p33 *idt82p33 = i2c_get_clientdata(client); + struct idt82p33 *idt82p33 = platform_get_drvdata(pdev); idt82p33_ptp_clock_unregister_all(idt82p33); - mutex_destroy(&idt82p33->reg_lock); return 0; } -#ifdef CONFIG_OF -static const struct of_device_id idt82p33_dt_id[] = { - { .compatible = "idt,82p33810" }, - { .compatible = "idt,82p33813" }, - { .compatible = "idt,82p33814" }, - { .compatible = "idt,82p33831" }, - { .compatible = "idt,82p33910" }, - { .compatible = "idt,82p33913" }, - { .compatible = "idt,82p33914" }, - { .compatible = "idt,82p33931" }, - {}, -}; -MODULE_DEVICE_TABLE(of, idt82p33_dt_id); -#endif - -static const struct i2c_device_id idt82p33_i2c_id[] = { - { "idt82p33810", }, - { "idt82p33813", }, - { "idt82p33814", }, - { "idt82p33831", }, - { "idt82p33910", }, - { "idt82p33913", }, - { "idt82p33914", }, - { "idt82p33931", }, - {}, -}; -MODULE_DEVICE_TABLE(i2c, idt82p33_i2c_id); - -static struct i2c_driver idt82p33_driver = { +static struct platform_driver idt82p33_driver = { .driver = { - .of_match_table = of_match_ptr(idt82p33_dt_id), - .name = "idt82p33", + .name = "82p33x1x-phc", }, - .probe = idt82p33_probe, - .remove = idt82p33_remove, - .id_table = idt82p33_i2c_id, + .probe = idt82p33_probe, + .remove = idt82p33_remove, }; -module_i2c_driver(idt82p33_driver); +module_platform_driver(idt82p33_driver); diff --git a/drivers/ptp/ptp_idt82p33.h b/drivers/ptp/ptp_idt82p33.h index 1c7a0f0872e8..0ea1c35c0f9f 100644 --- a/drivers/ptp/ptp_idt82p33.h +++ b/drivers/ptp/ptp_idt82p33.h @@ -8,94 +8,19 @@ #define PTP_IDT82P33_H #include -#include +#include +#include - -/* Register Map - AN888_SMUforIEEE_SynchEther_82P33xxx_RevH.pdf */ -#define PAGE_NUM (8) -#define _ADDR(page, offset) (((page) << 0x7) | ((offset) & 0x7f)) -#define _PAGE(addr) (((addr) >> 0x7) & 0x7) -#define _OFFSET(addr) ((addr) & 0x7f) - -#define DPLL1_TOD_CNFG 0x134 -#define DPLL2_TOD_CNFG 0x1B4 - -#define DPLL1_TOD_STS 0x10B -#define DPLL2_TOD_STS 0x18B - -#define DPLL1_TOD_TRIGGER 0x115 -#define DPLL2_TOD_TRIGGER 0x195 - -#define DPLL1_OPERATING_MODE_CNFG 0x120 -#define DPLL2_OPERATING_MODE_CNFG 0x1A0 - -#define DPLL1_HOLDOVER_FREQ_CNFG 0x12C -#define DPLL2_HOLDOVER_FREQ_CNFG 0x1AC - -#define DPLL1_PHASE_OFFSET_CNFG 0x143 -#define DPLL2_PHASE_OFFSET_CNFG 0x1C3 - -#define DPLL1_SYNC_EDGE_CNFG 0X140 -#define 
DPLL2_SYNC_EDGE_CNFG 0X1C0 - -#define DPLL1_INPUT_MODE_CNFG 0X116 -#define DPLL2_INPUT_MODE_CNFG 0X196 - -#define OUT_MUX_CNFG(outn) _ADDR(0x6, (0xC * (outn))) - -#define PAGE_ADDR 0x7F -/* Register Map end */ - -/* Register definitions - AN888_SMUforIEEE_SynchEther_82P33xxx_RevH.pdf*/ -#define TOD_TRIGGER(wr_trig, rd_trig) ((wr_trig & 0xf) << 4 | (rd_trig & 0xf)) -#define SYNC_TOD BIT(1) -#define PH_OFFSET_EN BIT(7) -#define SQUELCH_ENABLE BIT(5) - -/* Bit definitions for the DPLL_MODE register */ -#define PLL_MODE_SHIFT (0) -#define PLL_MODE_MASK (0x1F) - -#define PEROUT_ENABLE_OUTPUT_MASK (0xdeadbeef) - -enum pll_mode { - PLL_MODE_MIN = 0, - PLL_MODE_AUTOMATIC = PLL_MODE_MIN, - PLL_MODE_FORCE_FREERUN = 1, - PLL_MODE_FORCE_HOLDOVER = 2, - PLL_MODE_FORCE_LOCKED = 4, - PLL_MODE_FORCE_PRE_LOCKED2 = 5, - PLL_MODE_FORCE_PRE_LOCKED = 6, - PLL_MODE_FORCE_LOST_PHASE = 7, - PLL_MODE_DCO = 10, - PLL_MODE_WPH = 18, - PLL_MODE_MAX = PLL_MODE_WPH, -}; - -enum hw_tod_trig_sel { - HW_TOD_TRIG_SEL_MIN = 0, - HW_TOD_TRIG_SEL_NO_WRITE = HW_TOD_TRIG_SEL_MIN, - HW_TOD_TRIG_SEL_SYNC_SEL = 1, - HW_TOD_TRIG_SEL_IN12 = 2, - HW_TOD_TRIG_SEL_IN13 = 3, - HW_TOD_TRIG_SEL_IN14 = 4, - HW_TOD_TRIG_SEL_TOD_PPS = 5, - HW_TOD_TRIG_SEL_TIMER_INTERVAL = 6, - HW_TOD_TRIG_SEL_MSB_PHASE_OFFSET_CNFG = 7, - HW_TOD_TRIG_SEL_MSB_HOLDOVER_FREQ_CNFG = 8, - HW_TOD_WR_TRIG_SEL_MSB_TOD_CNFG = 9, - HW_TOD_RD_TRIG_SEL_LSB_TOD_STS = HW_TOD_WR_TRIG_SEL_MSB_TOD_CNFG, - WR_TRIG_SEL_MAX = HW_TOD_WR_TRIG_SEL_MSB_TOD_CNFG, -}; - -/* Register bit definitions end */ #define FW_FILENAME "idt82p33xxx.bin" -#define MAX_PHC_PLL (2) -#define TOD_BYTE_COUNT (10) -#define MAX_MEASURMENT_COUNT (5) -#define SNAP_THRESHOLD_NS (150000) -#define SYNC_TOD_TIMEOUT_SEC (5) -#define IDT82P33_MAX_WRITE_COUNT (512) +#define MAX_PHC_PLL (2) +#define TOD_BYTE_COUNT (10) +#define DCO_MAX_PPB (92000) +#define MAX_MEASURMENT_COUNT (5) +#define SNAP_THRESHOLD_NS (10000) +#define IMMEDIATE_SNAP_THRESHOLD_NS (50000) +#define DDCO_THRESHOLD_NS (5) +#define IDT82P33_MAX_WRITE_COUNT (512) +#define PEROUT_ENABLE_OUTPUT_MASK (0xdeadbeef) #define PLLMASK_ADDR_HI 0xFF #define PLLMASK_ADDR_LO 0xA5 @@ -116,15 +41,25 @@ enum hw_tod_trig_sel { #define DEFAULT_OUTPUT_MASK_PLL0 (0xc0) #define DEFAULT_OUTPUT_MASK_PLL1 DEFAULT_OUTPUT_MASK_PLL0 +/** + * @brief Maximum absolute value for write phase offset in femtoseconds + */ +#define WRITE_PHASE_OFFSET_LIMIT (20000052084ll) + +/** @brief Phase offset resolution + * + * DPLL phase offset = 10^15 fs / ( System Clock * 2^13) + * = 10^15 fs / ( 1638400000 * 2^23) + * = 74.5058059692382 fs + */ +#define IDT_T0DPLL_PHASE_RESOL 74506 + /* PTP Hardware Clock interface */ struct idt82p33_channel { struct ptp_clock_info caps; struct ptp_clock *ptp_clock; - struct idt82p33 *idt82p33; - enum pll_mode pll_mode; - /* task to turn off SYNC_TOD bit after pps sync */ - struct delayed_work sync_tod_work; - bool sync_tod_on; + struct idt82p33 *idt82p33; + enum pll_mode pll_mode; s32 current_freq_ppb; u8 output_mask; u16 dpll_tod_cnfg; @@ -138,15 +73,17 @@ struct idt82p33_channel { }; struct idt82p33 { - struct idt82p33_channel channel[MAX_PHC_PLL]; - struct i2c_client *client; - u8 page_offset; - u8 pll_mask; - ktime_t start_time; - int calculate_overhead_flag; - s64 tod_write_overhead_ns; - /* Protects I2C read/modify/write registers from concurrent access */ - struct mutex reg_lock; + struct idt82p33_channel channel[MAX_PHC_PLL]; + struct device *dev; + u8 pll_mask; + /* Mutex to protect operations from being interrupted */ + struct mutex *lock; + 
struct regmap *regmap; + struct device *mfd; + /* Overhead calculation for adjtime */ + ktime_t start_time; + int calculate_overhead_flag; + s64 tod_write_overhead_ns; }; /* firmware interface */ @@ -157,18 +94,4 @@ struct idt82p33_fwrc { u8 reserved; } __packed; -/** - * @brief Maximum absolute value for write phase offset in femtoseconds - */ -#define WRITE_PHASE_OFFSET_LIMIT (20000052084ll) - -/** @brief Phase offset resolution - * - * DPLL phase offset = 10^15 fs / ( System Clock * 2^13) - * = 10^15 fs / ( 1638400000 * 2^23) - * = 74.5058059692382 fs - */ -#define IDT_T0DPLL_PHASE_RESOL 74506 - - #endif /* PTP_IDT82P33_H */ diff --git a/include/linux/mfd/idt82p33_reg.h b/include/linux/mfd/idt82p33_reg.h index 129a6c078221..1db532feeb91 100644 --- a/include/linux/mfd/idt82p33_reg.h +++ b/include/linux/mfd/idt82p33_reg.h @@ -7,6 +7,8 @@ #ifndef HAVE_IDT82P33_REG #define HAVE_IDT82P33_REG +#define REG_ADDR(page, offset) (((page) << 0x7) | ((offset) & 0x7f)) + /* Register address */ #define DPLL1_TOD_CNFG 0x134 #define DPLL2_TOD_CNFG 0x1B4 @@ -41,6 +43,7 @@ #define REG_SOFT_RESET 0X381 #define OUT_MUX_CNFG(outn) REG_ADDR(0x6, (0xC * (outn))) +#define TOD_TRIGGER(wr_trig, rd_trig) ((wr_trig & 0xf) << 4 | (rd_trig & 0xf)) /* Register bit definitions */ #define SYNC_TOD BIT(1) -- cgit v1.2.3 From 3b5d4ddf8fe1f60082513f94bae586ac80188a03 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 9 Mar 2022 01:04:50 -0800 Subject: bpf: net: Remove TC_AT_INGRESS_OFFSET and SKB_MONO_DELIVERY_TIME_OFFSET macro This patch removes the TC_AT_INGRESS_OFFSET and SKB_MONO_DELIVERY_TIME_OFFSET macros. Instead, PKT_VLAN_PRESENT_OFFSET is used because all of them are at the same offset. A comment is added to make it clear that changing the position of tc_at_ingress or mono_delivery_time will require adjusting the defined macros. The earlier discussion can be found here: https://lore.kernel.org/bpf/419d994e-ff61-7c11-0ec7-11fefcb0186e@iogearbox.net/ Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220309090450.3710955-1-kafai@fb.com --- include/linux/skbuff.h | 10 +++++----- net/core/filter.c | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2be263184d1e..7b0cb10e70cb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -960,10 +960,10 @@ struct sk_buff { __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 dst_pending_confirm:1; - __u8 mono_delivery_time:1; + __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; - __u8 tc_at_ingress:1; + __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ #endif #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; @@ -1062,7 +1062,9 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) -/* if you move pkt_vlan_present around you also must adapt these constants */ +/* if you move pkt_vlan_present, tc_at_ingress, or mono_delivery_time + * around, you also must adapt these constants.
+ */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_VLAN_PRESENT_BIT 7 #define TC_AT_INGRESS_MASK (1 << 0) @@ -1073,8 +1075,6 @@ struct sk_buff { #define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif #define PKT_VLAN_PRESENT_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) -#define TC_AT_INGRESS_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) -#define SKB_MONO_DELIVERY_TIME_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) #ifdef __KERNEL__ /* diff --git a/net/core/filter.c b/net/core/filter.c index 88767f7da150..738a294a3c82 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8896,7 +8896,7 @@ static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si, __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); + PKT_VLAN_PRESENT_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, SKB_MONO_DELIVERY_TIME_MASK); *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); @@ -8912,7 +8912,7 @@ static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si, *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1); #ifdef CONFIG_NET_CLS_ACT - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); /* At ingress, value_reg = 0 */ @@ -8959,14 +8959,14 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, if (!prog->delivery_time_access) { __u8 tmp_reg = BPF_REG_AX; - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 5); /* @ingress, read __sk_buff->tstamp as the (rcv) timestamp, * so check the skb->mono_delivery_time. */ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); + PKT_VLAN_PRESENT_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, SKB_MONO_DELIVERY_TIME_MASK); *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); @@ -8992,18 +8992,18 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, if (!prog->delivery_time_access) { __u8 tmp_reg = BPF_REG_AX; - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, TC_AT_INGRESS_OFFSET); + *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 3); /* Writing __sk_buff->tstamp at ingress as the (rcv) timestamp. * Clear the skb->mono_delivery_time. */ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); + PKT_VLAN_PRESENT_OFFSET); *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, - SKB_MONO_DELIVERY_TIME_OFFSET); + PKT_VLAN_PRESENT_OFFSET); } #endif -- cgit v1.2.3 From 9bb984f28d5bcb917d35d930fcfb89f90f9449fd Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 9 Mar 2022 01:05:09 -0800 Subject: bpf: Remove BPF_SKB_DELIVERY_TIME_NONE and rename s/delivery_time_/tstamp_/ This patch is to simplify the uapi bpf.h regarding to the tstamp type and use a similar way as the kernel to describe the value stored in __sk_buff->tstamp. 
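As a hedged illustration (not part of this patch; the program name and return handling are assumptions), a tc program written against the renamed uapi might look like:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int keep_mono_tstamp(struct __sk_buff *skb)
{
	/* Reading tstamp_type sets prog->tstamp_type_access in the
	 * verifier, so skb->tstamp is subsequently read/written as-is.
	 */
	if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO && skb->tstamp)
		/* re-assert the mono delivery time before a bpf_redirect_*() */
		bpf_skb_set_tstamp(skb, skb->tstamp,
				   BPF_SKB_TSTAMP_DELIVERY_MONO);
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";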
My earlier thought was to avoid describing the semantics and clock base for the rcv timestamp until there was more clarity on the use case, hence the __sk_buff->delivery_time_type naming instead of __sk_buff->tstamp_type. On further thought, the UNSPEC naming can be reused. This patch first removes BPF_SKB_DELIVERY_TIME_NONE and also renames BPF_SKB_DELIVERY_TIME_UNSPEC to BPF_SKB_TSTAMP_UNSPEC and BPF_SKB_DELIVERY_TIME_MONO to BPF_SKB_TSTAMP_DELIVERY_MONO. The semantics of BPF_SKB_TSTAMP_DELIVERY_MONO are unchanged: __sk_buff->tstamp holds the delivery time in the mono clock base. BPF_SKB_TSTAMP_UNSPEC means __sk_buff->tstamp has the (rcv) tstamp at ingress and the delivery time at egress. At egress, the clock base can be found from skb->sk->sk_clockid. __sk_buff->tstamp == 0 naturally means NONE, so NONE is not needed. With BPF_SKB_TSTAMP_UNSPEC for the rcv tstamp at ingress, __sk_buff->delivery_time_type is also renamed to __sk_buff->tstamp_type, which was also suggested in the earlier discussion: https://lore.kernel.org/bpf/b181acbe-caf8-502d-4b7b-7d96b9fc5d55@iogearbox.net/ The above then makes __sk_buff->tstamp and __sk_buff->tstamp_type match their kernel counterparts, skb->tstamp and skb->mono_delivery_time. The internal kernel function bpf_skb_convert_dtime_type_read() is renamed to bpf_skb_convert_tstamp_type_read(), and it can be simplified with BPF_SKB_DELIVERY_TIME_NONE gone. A BPF_ALU32_IMM(BPF_AND) insn is also saved by using BPF_JMP32_IMM(BPF_JSET). The bpf helper bpf_skb_set_delivery_time() is renamed to bpf_skb_set_tstamp(); the arg name is changed from dtime to tstamp as well. It only allows setting tstamp 0 for BPF_SKB_TSTAMP_UNSPEC; this could be relaxed later if there is a use case for changing a mono delivery time to non-mono. prog->delivery_time_access is also renamed to prog->tstamp_type_access. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220309090509.3712315-1-kafai@fb.com --- include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 40 ++++++++++--------- net/core/filter.c | 88 ++++++++++++++++-------------------- tools/include/uapi/linux/bpf.h | 40 ++++++++++--------- 4 files changed, 77 insertions(+), 93 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9bf26307247f..05ed9bd31b45 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -573,7 +573,7 @@ struct bpf_prog { enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ - delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */ + tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc23020b638d..d288a0a9f797 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5090,23 +5090,22 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out.
* - * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) * Description - * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also - * change the __sk_buff->delivery_time_type to *dtime_type*. + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. * - * When setting a delivery time (non zero *dtime*) to - * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* - * is supported. It is the only delivery_time_type that will be - * kept after bpf_redirect_*(). - * - * If there is no need to change the __sk_buff->delivery_time_type, - * the delivery time can be directly written to __sk_buff->tstamp + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp * instead. * - * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE - * can be used to clear any delivery time stored in - * __sk_buff->tstamp. + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. * * Only IPv4 and IPv6 skb->protocol are supported. * @@ -5119,7 +5118,7 @@ union bpf_attr { * Return * 0 on success. * **-EINVAL** for invalid input - * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol + * **-EOPNOTSUPP** for unsupported protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,7 +5313,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ - FN(skb_set_delivery_time), \ + FN(skb_set_tstamp), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5505,9 +5504,12 @@ union { \ } __attribute__((aligned(8))) enum { - BPF_SKB_DELIVERY_TIME_NONE, - BPF_SKB_DELIVERY_TIME_UNSPEC, - BPF_SKB_DELIVERY_TIME_MONO, + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ }; /* user accessible mirror of in-kernel sk_buff. @@ -5550,7 +5552,7 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u8 delivery_time_type; + __u8 tstamp_type; __u32 :24; /* Padding, future use. 
*/ __u64 hwtstamp; }; diff --git a/net/core/filter.c b/net/core/filter.c index f914e4b13b18..03655f2074ae 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7388,36 +7388,36 @@ static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_skb_set_delivery_time, struct sk_buff *, skb, - u64, dtime, u32, dtime_type) +BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, + u64, tstamp, u32, tstamp_type) { /* skb_clear_delivery_time() is done for inet protocol */ if (skb->protocol != htons(ETH_P_IP) && skb->protocol != htons(ETH_P_IPV6)) return -EOPNOTSUPP; - switch (dtime_type) { - case BPF_SKB_DELIVERY_TIME_MONO: - if (!dtime) + switch (tstamp_type) { + case BPF_SKB_TSTAMP_DELIVERY_MONO: + if (!tstamp) return -EINVAL; - skb->tstamp = dtime; + skb->tstamp = tstamp; skb->mono_delivery_time = 1; break; - case BPF_SKB_DELIVERY_TIME_NONE: - if (dtime) + case BPF_SKB_TSTAMP_UNSPEC: + if (tstamp) return -EINVAL; skb->tstamp = 0; skb->mono_delivery_time = 0; break; default: - return -EOPNOTSUPP; + return -EINVAL; } return 0; } -static const struct bpf_func_proto bpf_skb_set_delivery_time_proto = { - .func = bpf_skb_set_delivery_time, +static const struct bpf_func_proto bpf_skb_set_tstamp_proto = { + .func = bpf_skb_set_tstamp, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -7786,8 +7786,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_gen_syncookie_proto; case BPF_FUNC_sk_assign: return &bpf_sk_assign_proto; - case BPF_FUNC_skb_set_delivery_time: - return &bpf_skb_set_delivery_time_proto; + case BPF_FUNC_skb_set_tstamp: + return &bpf_skb_set_tstamp_proto; #endif default: return bpf_sk_base_func_proto(func_id); @@ -8127,9 +8127,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; - case offsetof(struct __sk_buff, delivery_time_type): + case offsetof(struct __sk_buff, tstamp_type): return false; - case offsetofend(struct __sk_buff, delivery_time_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: + case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1: /* Explicitly prohibit access to padding in __sk_buff. */ return false; default: @@ -8484,14 +8484,14 @@ static bool tc_cls_act_is_valid_access(int off, int size, break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; - case offsetof(struct __sk_buff, delivery_time_type): + case offsetof(struct __sk_buff, tstamp_type): /* The convert_ctx_access() on reading and writing * __sk_buff->tstamp depends on whether the bpf prog - * has used __sk_buff->delivery_time_type or not. - * Thus, we need to set prog->delivery_time_access + * has used __sk_buff->tstamp_type or not. + * Thus, we need to set prog->tstamp_type_access * earlier during is_valid_access() here. 
*/ - ((struct bpf_prog *)prog)->delivery_time_access = 1; + ((struct bpf_prog *)prog)->tstamp_type_access = 1; return size == sizeof(__u8); } @@ -8888,42 +8888,22 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static struct bpf_insn *bpf_convert_dtime_type_read(const struct bpf_insn *si, - struct bpf_insn *insn) +static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, + struct bpf_insn *insn) { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; + /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); - /* value_reg = BPF_SKB_DELIVERY_TIME_MONO */ - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_MONO); - *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 10 : 5); - - *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, skb_reg, - offsetof(struct sk_buff, tstamp)); - *insn++ = BPF_JMP_IMM(BPF_JNE, tmp_reg, 0, 2); - /* value_reg = BPF_SKB_DELIVERY_TIME_NONE */ - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_NONE); - *insn++ = BPF_JMP_A(IS_ENABLED(CONFIG_NET_CLS_ACT) ? 6 : 1); - -#ifdef CONFIG_NET_CLS_ACT - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, TC_AT_INGRESS_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JEQ, tmp_reg, 0, 2); - /* At ingress, value_reg = 0 */ - *insn++ = BPF_MOV32_IMM(value_reg, 0); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, + SKB_MONO_DELIVERY_TIME_MASK, 2); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); *insn++ = BPF_JMP_A(1); -#endif - - /* value_reg = BPF_SKB_DELIVERYT_TIME_UNSPEC */ - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_DELIVERY_TIME_UNSPEC); + *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); - /* 15 insns with CONFIG_NET_CLS_ACT */ return insn; } @@ -8956,11 +8936,11 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 skb_reg = si->src_reg; #ifdef CONFIG_NET_CLS_ACT - /* If the delivery_time_type is read, + /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. - * Thus, read skb->tstamp as is if delivery_time_access is true. + * Thus, read skb->tstamp as is if tstamp_type_access is true. */ - if (!prog->delivery_time_access) { + if (!prog->tstamp_type_access) { /* AX is needed because src_reg and dst_reg could be the same */ __u8 tmp_reg = BPF_REG_AX; @@ -8990,13 +8970,13 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, __u8 skb_reg = si->dst_reg; #ifdef CONFIG_NET_CLS_ACT - /* If the delivery_time_type is read, + /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. - * Thus, write skb->tstamp as is if delivery_time_access is true. + * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the * mono_delivery_time bit also. 
*/ - if (!prog->delivery_time_access) { + if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, PKT_VLAN_PRESENT_OFFSET); @@ -9329,8 +9309,8 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, insn = bpf_convert_tstamp_read(prog, si, insn); break; - case offsetof(struct __sk_buff, delivery_time_type): - insn = bpf_convert_dtime_type_read(si, insn); + case offsetof(struct __sk_buff, tstamp_type): + insn = bpf_convert_tstamp_type_read(si, insn); break; case offsetof(struct __sk_buff, gso_segs): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bc23020b638d..d288a0a9f797 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5090,23 +5090,22 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. * - * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) * Description - * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also - * change the __sk_buff->delivery_time_type to *dtime_type*. + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. * - * When setting a delivery time (non zero *dtime*) to - * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* - * is supported. It is the only delivery_time_type that will be - * kept after bpf_redirect_*(). - * - * If there is no need to change the __sk_buff->delivery_time_type, - * the delivery time can be directly written to __sk_buff->tstamp + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp * instead. * - * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE - * can be used to clear any delivery time stored in - * __sk_buff->tstamp. + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. * * Only IPv4 and IPv6 skb->protocol are supported. * @@ -5119,7 +5118,7 @@ union bpf_attr { * Return * 0 on success. * **-EINVAL** for invalid input - * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol + * **-EOPNOTSUPP** for unsupported protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,7 +5313,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ - FN(skb_set_delivery_time), \ + FN(skb_set_tstamp), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5505,9 +5504,12 @@ union { \ } __attribute__((aligned(8))) enum { - BPF_SKB_DELIVERY_TIME_NONE, - BPF_SKB_DELIVERY_TIME_UNSPEC, - BPF_SKB_DELIVERY_TIME_MONO, + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ }; /* user accessible mirror of in-kernel sk_buff. @@ -5550,7 +5552,7 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u8 delivery_time_type; + __u8 tstamp_type; __u32 :24; /* Padding, future use. 
*/ __u64 hwtstamp; }; -- cgit v1.2.3 From 6789ab9668d920d7be636cb994d85be9a816f9d5 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 10 Mar 2022 13:16:55 -0800 Subject: compiler_types: Refactor the use of btf_type_tag attribute. Previous patches have introduced the compiler attribute btf_type_tag for __user and __percpu. The availability of this attribute depends on some CONFIGs and compiler support. This patch refactors the use of btf_type_tag by introducing BTF_TYPE_TAG, which hides all the dependencies. No functional change. Suggested-by: Andrii Nakryiko Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220310211655.3173786-1-haoluo@google.com --- include/linux/compiler_types.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index b9a8ae9440c7..1bc760ba400c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -4,6 +4,13 @@ #ifndef __ASSEMBLY__ +#if defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ + __has_attribute(btf_type_tag) +# define BTF_TYPE_TAG(value) __attribute__((btf_type_tag(#value))) +#else +# define BTF_TYPE_TAG(value) /* nothing */ +#endif + #ifdef __CHECKER__ /* address spaces */ # define __kernel __attribute__((address_space(0))) @@ -31,19 +38,11 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __kernel # ifdef STRUCTLEAK_PLUGIN # define __user __attribute__((user)) -# elif defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ - __has_attribute(btf_type_tag) -# define __user __attribute__((btf_type_tag("user"))) # else -# define __user +# define __user BTF_TYPE_TAG(user) # endif # define __iomem -# if defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ - __has_attribute(btf_type_tag) -# define __percpu __attribute__((btf_type_tag("percpu"))) -# else -# define __percpu -# endif +# define __percpu BTF_TYPE_TAG(percpu) # define __rcu # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 -- cgit v1.2.3 From 271907ee2f29cd1078fd219f0778fd824fb1971c Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 17 Jan 2022 15:14:44 +0200 Subject: net/mlx5: Query the maximum MCIA register read size from firmware The MCIA register supports either 12 or 32 dwords, use the correct value by querying the capability from the MCAM register. Signed-off-by: Gal Pressman Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 8 +++++++- include/linux/mlx5/mlx5_ifc.h | 4 +++- include/linux/mlx5/port.h | 1 - 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 289b29a23418..418ab777f6e8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -365,6 +365,12 @@ static void mlx5_sfp_eeprom_params_set(u16 *i2c_addr, int *page_num, u16 *offset *offset -= MLX5_EEPROM_PAGE_LENGTH; } +static int mlx5_mcia_max_bytes(struct mlx5_core_dev *dev) +{ + /* mcia supports either 12 dwords or 32 dwords */ + return (MLX5_CAP_MCAM_FEATURE(dev, mcia_32dwords) ? 
32 : 12) * sizeof(u32); +} + static int mlx5_query_mcia(struct mlx5_core_dev *dev, struct mlx5_module_eeprom_query_params *params, u8 *data) { @@ -374,7 +380,7 @@ static int mlx5_query_mcia(struct mlx5_core_dev *dev, void *ptr; u16 size; - size = min_t(int, params->size, MLX5_EEPROM_MAX_BYTES); + size = min_t(int, params->size, mlx5_mcia_max_bytes(dev)); MLX5_SET(mcia_reg, in, l, 0); MLX5_SET(mcia_reg, in, size, size); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 318fae4b3560..745107ff681d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9691,7 +9691,9 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x6a]; + u8 reserved_at_0[0x5d]; + u8 mcia_32dwords[0x1]; + u8 reserved_at_5e[0xc]; u8 reset_state[0x1]; u8 ptpcyc2realtime_modify[0x1]; u8 reserved_at_6c[0x2]; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 77ea4f9c5265..402413b3e914 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -56,7 +56,6 @@ enum mlx5_an_status { MLX5_AN_LINK_DOWN = 4, }; -#define MLX5_EEPROM_MAX_BYTES 32 #define MLX5_EEPROM_IDENTIFIER_BYTE_MASK 0x000000ff #define MLX5_I2C_ADDR_LOW 0x50 #define MLX5_I2C_ADDR_HIGH 0x51 -- cgit v1.2.3 From fcb610a86c53dfcfbb2aa62e704481112752f367 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 17 Jan 2022 15:53:06 +0200 Subject: net/mlx5: Parse module mapping using mlx5_ifc The assumption that the first byte in the module mapping dword is the module number shouldn't be hard-coded in the driver, but come from mlx5_ifc structs. While at it, fix the incorrect width for the 'rx_lane' and 'tx_lane' fields. Signed-off-by: Gal Pressman Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 6 +++--- include/linux/mlx5/mlx5_ifc.h | 8 ++++---- include/linux/mlx5/port.h | 1 - 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 418ab777f6e8..493cacb4610b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -275,7 +275,6 @@ static int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num) { u32 in[MLX5_ST_SZ_DW(pmlp_reg)] = {0}; u32 out[MLX5_ST_SZ_DW(pmlp_reg)]; - int module_mapping; int err; MLX5_SET(pmlp_reg, in, local_port, 1); @@ -284,8 +283,9 @@ static int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num) if (err) return err; - module_mapping = MLX5_GET(pmlp_reg, out, lane0_module_mapping); - *module_num = module_mapping & MLX5_EEPROM_IDENTIFIER_BYTE_MASK; + *module_num = MLX5_GET(lane_2_module_mapping, + MLX5_ADDR_OF(pmlp_reg, out, lane0_module_mapping), + module); return 0; } diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 745107ff681d..91b7f730ed91 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9888,10 +9888,10 @@ struct mlx5_ifc_pcmr_reg_bits { }; struct mlx5_ifc_lane_2_module_mapping_bits { - u8 reserved_at_0[0x6]; - u8 rx_lane[0x2]; - u8 reserved_at_8[0x6]; - u8 tx_lane[0x2]; + u8 reserved_at_0[0x4]; + u8 rx_lane[0x4]; + u8 reserved_at_8[0x4]; + u8 tx_lane[0x4]; u8 reserved_at_10[0x8]; u8 module[0x8]; }; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 402413b3e914..28a928b0684b 100644 --- a/include/linux/mlx5/port.h +++ 
b/include/linux/mlx5/port.h @@ -56,7 +56,6 @@ enum mlx5_an_status { MLX5_AN_LINK_DOWN = 4, }; -#define MLX5_EEPROM_IDENTIFIER_BYTE_MASK 0x000000ff #define MLX5_I2C_ADDR_LOW 0x50 #define MLX5_I2C_ADDR_HIGH 0x51 #define MLX5_EEPROM_PAGE_LENGTH 256 -- cgit v1.2.3 From d3826a95222c44a527f76b011bb5af8c924632e9 Mon Sep 17 00:00:00 2001 From: Dirk van der Merwe Date: Fri, 11 Mar 2022 11:43:06 +0100 Subject: nfp: add support for NFP3800/NFP3803 PCIe devices Enable binding the nfp driver to NFP3800 and NFP3803 devices. The PCIE_SRAM offset is different for the NFP3800 device, which also only supports a single explicit group. Changes to Dirk's work: * 48-bit dma addressing is not ready yet. Keep 40-bit dma addressing for NFP3800. Signed-off-by: Dirk van der Merwe Signed-off-by: Jakub Kicinski Signed-off-by: Fei Qin Signed-off-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_main.c | 4 ++++ drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c | 4 ++++ drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h | 4 ---- drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.c | 19 +++++++++++++++++++ drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.h | 2 ++ include/linux/pci_ids.h | 2 ++ 6 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c index dd135ac8b1a3..8693f9905fbe 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c @@ -33,6 +33,10 @@ static const char nfp_driver_name[] = "nfp"; static const struct pci_device_id nfp_pci_device_ids[] = { + { PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP3800, + PCI_VENDOR_ID_NETRONOME, PCI_ANY_ID, + PCI_ANY_ID, 0, NFP_DEV_NFP3800, + }, { PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP4000, PCI_VENDOR_ID_NETRONOME, PCI_ANY_ID, PCI_ANY_ID, 0, NFP_DEV_NFP6000, diff --git a/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c b/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c index db4301f8cd85..9ef226c6706e 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_netvf_main.c @@ -38,6 +38,10 @@ struct nfp_net_vf { static const char nfp_net_driver_name[] = "nfp_netvf"; static const struct pci_device_id nfp_netvf_pci_device_ids[] = { + { PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP3800_VF, + PCI_VENDOR_ID_NETRONOME, PCI_ANY_ID, + PCI_ANY_ID, 0, NFP_DEV_NFP3800_VF, + }, { PCI_VENDOR_ID_NETRONOME, PCI_DEVICE_ID_NETRONOME_NFP6000_VF, PCI_VENDOR_ID_NETRONOME, PCI_ANY_ID, PCI_ANY_ID, 0, NFP_DEV_NFP6000_VF, diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h index 2dd0f5842873..3d379e937184 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h @@ -32,10 +32,6 @@ #define PCI_64BIT_BAR_COUNT 3 -/* NFP hardware vendor/device ids. 
- */ -#define PCI_DEVICE_ID_NETRONOME_NFP3800 0x3800 - #define NFP_CPP_NUM_TARGETS 16 /* Max size of area it should be safe to request */ #define NFP_CPP_SAFE_AREA_SIZE SZ_2M diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.c index 0c1ef01f90eb..28384d6d1c6f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.c @@ -8,6 +8,25 @@ #include "nfp_dev.h" const struct nfp_dev_info nfp_dev_info[NFP_DEV_CNT] = { + [NFP_DEV_NFP3800] = { + .dma_mask = DMA_BIT_MASK(40), + .qc_idx_mask = GENMASK(8, 0), + .qc_addr_offset = 0x400000, + .min_qc_size = 512, + .max_qc_size = SZ_64K, + + .chip_names = "NFP3800", + .pcie_cfg_expbar_offset = 0x0a00, + .pcie_expl_offset = 0xd000, + .qc_area_sz = 0x100000, + }, + [NFP_DEV_NFP3800_VF] = { + .dma_mask = DMA_BIT_MASK(40), + .qc_idx_mask = GENMASK(8, 0), + .qc_addr_offset = 0, + .min_qc_size = 512, + .max_qc_size = SZ_64K, + }, [NFP_DEV_NFP6000] = { .dma_mask = DMA_BIT_MASK(40), .qc_idx_mask = GENMASK(7, 0), diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.h index deadd9b97f9f..d4189869cf7b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.h +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_dev.h @@ -7,6 +7,8 @@ #include enum nfp_dev_id { + NFP_DEV_NFP3800, + NFP_DEV_NFP3800_VF, NFP_DEV_NFP6000, NFP_DEV_NFP6000_VF, NFP_DEV_CNT, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c7e6f2043c7d..5462c29f0538 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2531,9 +2531,11 @@ #define PCI_VENDOR_ID_HUAWEI 0x19e5 #define PCI_VENDOR_ID_NETRONOME 0x19ee +#define PCI_DEVICE_ID_NETRONOME_NFP3800 0x3800 #define PCI_DEVICE_ID_NETRONOME_NFP4000 0x4000 #define PCI_DEVICE_ID_NETRONOME_NFP5000 0x5000 #define PCI_DEVICE_ID_NETRONOME_NFP6000 0x6000 +#define PCI_DEVICE_ID_NETRONOME_NFP3800_VF 0x3803 #define PCI_DEVICE_ID_NETRONOME_NFP6000_VF 0x6003 #define PCI_VENDOR_ID_QMI 0x1a32 -- cgit v1.2.3 From 625788b5844511cf4c30cffa7fa0bc3a69cebc82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Mar 2022 21:14:20 -0800 Subject: net: add per-cpu storage and net->core_stats Before adding yet another possibly contended atomic_long_t, it is time to add per-cpu storage for existing ones: dev->tx_dropped, dev->rx_dropped, and dev->rx_nohandler Because many devices do not have to increment such counters, allocate the per-cpu storage on demand, so that dev_get_stats() does not have to spend considerable time folding zero counters. Note that some drivers have abused these counters which were supposed to be only used by core networking stack. 
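For drivers that were touching the old fields, the conversion is mechanical; a hedged before/after sketch using the helpers added below:

	/* before: bumping a core-only atomic counter from a driver */
	atomic_long_inc(&dev->tx_dropped);

	/* after: per-cpu storage, allocated on first use */
	dev_core_stats_tx_dropped_inc(dev);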
v4: should use per_cpu_ptr() in dev_get_stats() (Jakub) v3: added a READ_ONCE() in netdev_core_stats_alloc() (Paolo) v2: add a missing include (reported by kernel test robot ) Change in netdev_core_stats_alloc() (Jakub) Signed-off-by: Eric Dumazet Cc: jeffreyji Reviewed-by: Brian Vazquez Reviewed-by: Jakub Kicinski Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220311051420.2608812-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 +- drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 4 +- .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c | 2 +- drivers/net/ipvlan/ipvlan_core.c | 2 +- drivers/net/macvlan.c | 2 +- drivers/net/net_failover.c | 2 +- drivers/net/tun.c | 16 +++---- drivers/net/vxlan/vxlan_core.c | 2 +- include/linux/netdevice.h | 46 +++++++++++++++---- include/net/bonding.h | 2 +- net/core/dev.c | 51 ++++++++++++++++++---- net/core/gro_cells.c | 2 +- net/hsr/hsr_device.c | 2 +- net/xfrm/xfrm_device.c | 2 +- 15 files changed, 101 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 55e0ba2a163d..15eddca7b4b6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5120,7 +5120,7 @@ static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb, if (xmit_suc) return NETDEV_TX_OK; - atomic_long_inc(&bond_dev->tx_dropped); + dev_core_stats_tx_dropped_inc(bond_dev); return NET_XMIT_DROP; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2de02950086f..92a1a43b3bee 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -370,7 +370,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) i = skb_get_queue_mapping(skb); if (unlikely(i >= bp->tx_nr_rings)) { dev_kfree_skb_any(skb); - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); return NETDEV_TX_OK; } @@ -646,7 +646,7 @@ tx_kick_pending: if (txr->kick_pending) bnxt_txr_db_kick(bp, txr, txr->tx_prod); txr->tx_buf_ring[txr->tx_prod].skb = NULL; - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); return NETDEV_TX_OK; } diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c index d7a27c244d48..54faf0f2d1d8 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c @@ -887,8 +887,8 @@ static void hns_get_ethtool_stats(struct net_device *netdev, p[21] = net_stats->rx_compressed; p[22] = net_stats->tx_compressed; - p[23] = netdev->rx_dropped.counter; - p[24] = netdev->tx_dropped.counter; + p[23] = 0; /* was netdev->rx_dropped.counter */ + p[24] = 0; /* was netdev->tx_dropped.counter */ p[25] = priv->tx_timeout_count; diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c index bfbd7847f946..a313242a762e 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c @@ -207,7 +207,7 @@ rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb) dev = skb->dev; port = rmnet_get_port_rcu(dev); if (unlikely(!port)) { - atomic_long_inc(&skb->dev->rx_nohandler); + dev_core_stats_rx_nohandler_inc(skb->dev); kfree_skb(skb); goto done; } diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c 
index c613900c3811..6ffb27419e64 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -555,7 +555,7 @@ static void ipvlan_multicast_enqueue(struct ipvl_port *port, schedule_work(&port->wq); } else { spin_unlock(&port->backlog.lock); - atomic_long_inc(&skb->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb(skb); } } diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 33753a2fde29..4b77819e9328 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -371,7 +371,7 @@ static void macvlan_broadcast_enqueue(struct macvlan_port *port, free_nskb: kfree_skb(nskb); err: - atomic_long_inc(&skb->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(skb->dev); } static void macvlan_flush_sources(struct macvlan_port *port, diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 86ec5aae4289..21a0435c02de 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -89,7 +89,7 @@ static int net_failover_close(struct net_device *dev) static netdev_tx_t net_failover_drop_xmit(struct sk_buff *skb, struct net_device *dev) { - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); return NETDEV_TX_OK; } diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2b9a22669a12..276a0e42ca8e 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1135,7 +1135,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; drop: - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); skb_tx_error(skb); kfree_skb_reason(skb, drop_reason); rcu_read_unlock(); @@ -1291,7 +1291,7 @@ resample: void *frame = tun_xdp_to_ptr(xdp); if (__ptr_ring_produce(&tfile->tx_ring, frame)) { - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); break; } nxmit++; @@ -1626,7 +1626,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, trace_xdp_exception(tun->dev, xdp_prog, act); fallthrough; case XDP_DROP: - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); break; } @@ -1797,7 +1797,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, */ skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp); if (IS_ERR(skb)) { - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); return PTR_ERR(skb); } if (!skb) @@ -1826,7 +1826,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EAGAIN) - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); if (frags) mutex_unlock(&tfile->napi_mutex); return PTR_ERR(skb); @@ -1841,7 +1841,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, err = -EFAULT; drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT; drop: - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); kfree_skb_reason(skb, drop_reason); if (frags) { tfile->napi.skb = NULL; @@ -1876,7 +1876,7 @@ drop: pi.proto = htons(ETH_P_IPV6); break; default: - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); kfree_skb(skb); return -EINVAL; } @@ -1956,7 +1956,7 @@ drop: skb_headlen(skb)); if (unlikely(headlen > skb_headlen(skb))) { - atomic_long_inc(&tun->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(tun->dev); napi_free_frags(&tfile->napi); rcu_read_unlock(); mutex_unlock(&tfile->napi_mutex); diff --git 
a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 3872f76ea1d3..de97ff98d36e 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1760,7 +1760,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) if (unlikely(!(vxlan->dev->flags & IFF_UP))) { rcu_read_unlock(); - atomic_long_inc(&vxlan->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(vxlan->dev); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_DROPS, 0); goto drop; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index acd3cf69b61f..0d994710b335 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -194,6 +195,14 @@ struct net_device_stats { unsigned long tx_compressed; }; +/* per-cpu stats, allocated on demand. + * Try to fit them in a single cache line, for dev_get_stats() sake. + */ +struct net_device_core_stats { + local_t rx_dropped; + local_t tx_dropped; + local_t rx_nohandler; +} __aligned(4 * sizeof(local_t)); #include #include @@ -1735,12 +1744,8 @@ enum netdev_ml_priv_type { * @stats: Statistics struct, which was left as a legacy, use * rtnl_link_stats64 instead * - * @rx_dropped: Dropped packets by core network, - * do not use this in drivers - * @tx_dropped: Dropped packets by core network, + * @core_stats: core networking counters, * do not use this in drivers - * @rx_nohandler: nohandler dropped packets by core network on - * inactive devices, do not use this in drivers * @carrier_up_count: Number of times the carrier has been up * @carrier_down_count: Number of times the carrier has been down * @@ -2023,9 +2028,7 @@ struct net_device { struct net_device_stats stats; /* not used by modern drivers */ - atomic_long_t rx_dropped; - atomic_long_t tx_dropped; - atomic_long_t rx_nohandler; + struct net_device_core_stats __percpu *core_stats; /* Stats to monitor link on/off, flapping */ atomic_t carrier_up_count; @@ -3839,13 +3842,38 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev, return false; } +struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev); + +static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev) +{ + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); + + if (likely(p)) + return this_cpu_ptr(p); + + return netdev_core_stats_alloc(dev); +} + +#define DEV_CORE_STATS_INC(FIELD) \ +static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ +{ \ + struct net_device_core_stats *p = dev_core_stats(dev); \ + \ + if (p) \ + local_inc(&p->FIELD); \ +} +DEV_CORE_STATS_INC(rx_dropped) +DEV_CORE_STATS_INC(tx_dropped) +DEV_CORE_STATS_INC(rx_nohandler) + static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, const bool check_mtu) { if (skb_orphan_frags(skb, GFP_ATOMIC) || unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) { - atomic_long_inc(&dev->rx_dropped); + dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); return NET_RX_DROP; } diff --git a/include/net/bonding.h b/include/net/bonding.h index d0dfe727e0b1..b14f4c0b4e9e 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -770,7 +770,7 @@ extern const struct sysfs_ops slave_sysfs_ops; static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); 
dev_kfree_skb_any(skb); return NET_XMIT_DROP; } diff --git a/net/core/dev.c b/net/core/dev.c index 7ed27c178a1f..8d25ec5b3af7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3633,7 +3633,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device out_kfree_skb: kfree_skb(skb); out_null: - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); return NULL; } @@ -4184,7 +4184,7 @@ recursion_alert: rc = -ENETDOWN; rcu_read_unlock_bh(); - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return rc; out: @@ -4236,7 +4236,7 @@ int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) local_bh_enable(); return ret; drop: - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); kfree_skb_list(skb); return NET_XMIT_DROP; } @@ -4602,7 +4602,7 @@ drop: sd->dropped++; rps_unlock_irq_restore(sd, &flags); - atomic_long_inc(&skb->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); return NET_RX_DROP; } @@ -5357,10 +5357,10 @@ check_vlan_id: } else { drop: if (!deliver_exact) { - atomic_long_inc(&skb->dev->rx_dropped); + dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, SKB_DROP_REASON_PTYPE_ABSENT); } else { - atomic_long_inc(&skb->dev->rx_nohandler); + dev_core_stats_rx_nohandler_inc(skb->dev); kfree_skb(skb); } /* Jamal, now you will not able to escape explaining @@ -10280,6 +10280,25 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, } EXPORT_SYMBOL(netdev_stats_to_stats64); +struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev) +{ + struct net_device_core_stats __percpu *p; + + p = alloc_percpu_gfp(struct net_device_core_stats, + GFP_ATOMIC | __GFP_NOWARN); + + if (p && cmpxchg(&dev->core_stats, NULL, p)) + free_percpu(p); + + /* This READ_ONCE() pairs with the cmpxchg() above */ + p = READ_ONCE(dev->core_stats); + if (!p) + return NULL; + + return this_cpu_ptr(p); +} +EXPORT_SYMBOL(netdev_core_stats_alloc); + /** * dev_get_stats - get network device statistics * @dev: device to get statistics from @@ -10294,6 +10313,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage) { const struct net_device_ops *ops = dev->netdev_ops; + const struct net_device_core_stats __percpu *p; if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); @@ -10303,9 +10323,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } else { netdev_stats_to_stats64(storage, &dev->stats); } - storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped); - storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped); - storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler); + + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + p = READ_ONCE(dev->core_stats); + if (p) { + const struct net_device_core_stats *core_stats; + int i; + + for_each_possible_cpu(i) { + core_stats = per_cpu_ptr(p, i); + storage->rx_dropped += local_read(&core_stats->rx_dropped); + storage->tx_dropped += local_read(&core_stats->tx_dropped); + storage->rx_nohandler += local_read(&core_stats->rx_nohandler); + } + } return storage; } EXPORT_SYMBOL(dev_get_stats); @@ -10567,6 +10598,8 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; #endif + free_percpu(dev->core_stats); + dev->core_stats = NULL; free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; diff --git 
a/net/core/gro_cells.c b/net/core/gro_cells.c index 8462f926ab45..541c7a72a28a 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -28,7 +28,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { drop: - atomic_long_inc(&dev->rx_dropped); + dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); res = NET_RX_DROP; goto unlock; diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 7f250216433d..6ffef47e9be5 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -221,7 +221,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_len(skb); hsr_forward_skb(skb, master); } else { - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } return NETDEV_TX_OK; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 39bce5d764de..3e3448ada1bb 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -143,7 +143,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur segs = skb_gso_segment(skb, esp_features); if (IS_ERR(segs)) { kfree_skb(skb); - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); return NULL; } else { consume_skb(skb); -- cgit v1.2.3 From 1f4a5983d623d6dbda4cc7587a2d9d798e0d4035 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Fri, 11 Mar 2022 17:04:03 +0800 Subject: net: macvlan: add net device refcount tracker Add net device refcount tracker to macvlan. Signed-off-by: Ziyang Xuan Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 4 ++-- include/linux/if_macvlan.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index b339581e36aa..069e8824c264 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -912,7 +912,7 @@ static int macvlan_init(struct net_device *dev) port->count += 1; /* Get macvlan's reference to lowerdev */ - dev_hold(lowerdev); + dev_hold_track(lowerdev, &vlan->dev_tracker, GFP_KERNEL); return 0; } @@ -1181,7 +1181,7 @@ static void macvlan_dev_free(struct net_device *dev) struct macvlan_dev *vlan = netdev_priv(dev); /* Get rid of the macvlan's reference to lowerdev */ - dev_put(vlan->lowerdev); + dev_put_track(vlan->lowerdev, &vlan->dev_tracker); } void macvlan_common_setup(struct net_device *dev) diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 10c94a3936ca..b42294739063 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -21,6 +21,7 @@ struct macvlan_dev { struct hlist_node hlist; struct macvlan_port *port; struct net_device *lowerdev; + netdevice_tracker dev_tracker; void *accel_priv; struct vlan_pcpu_stats __percpu *pcpu_stats; -- cgit v1.2.3 From fbd9a2ceba5c74bbfa19cf257ae4b4b2c820860d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 11 Mar 2022 16:03:42 +0100 Subject: net: Add lockdep asserts to ____napi_schedule(). ____napi_schedule() needs to be invoked with disabled interrupts due to __raise_softirq_irqoff (in order not to corrupt the per-CPU list). ____napi_schedule() needs also to be invoked from an interrupt context so that the raised-softirq is processed while the interrupt context is left. Add lockdep asserts for both conditions. While this is the second time the irq/softirq check is needed, provide a generic lockdep_assert_softirq_will_run() which is used by both caller. 
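As a sketch of the intended use (the caller here is hypothetical; the asserts are the ones added by this patch):

static void my_kick_rx(void)	/* hypothetical caller */
{
	/* Raising a softirq is only safe where it will be processed on
	 * exit from the current context and the per-CPU list is stable.
	 */
	lockdep_assert_softirq_will_run();
	lockdep_assert_irqs_disabled();
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}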
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/lockdep.h | 7 +++++++ net/core/dev.c | 5 ++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 467b94257105..0cc65d216701 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -329,6 +329,12 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_assert_none_held_once() \ lockdep_assert_once(!current->lockdep_depth) +/* + * Ensure that softirq is handled within the callchain and not delayed and + * handled by chance. + */ +#define lockdep_assert_softirq_will_run() \ + lockdep_assert_once(hardirq_count() | softirq_count()) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) @@ -414,6 +420,7 @@ extern int lockdep_is_held(const void *); #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_assert_none_held_once() do { } while (0) +#define lockdep_assert_softirq_will_run() do { } while (0) #define lockdep_recursing(tsk) (0) diff --git a/net/core/dev.c b/net/core/dev.c index 8d25ec5b3af7..75bab5b0dbae 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4265,6 +4265,9 @@ static inline void ____napi_schedule(struct softnet_data *sd, { struct task_struct *thread; + lockdep_assert_softirq_will_run(); + lockdep_assert_irqs_disabled(); + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in * napi_enable()/dev_set_threaded(). @@ -4872,7 +4875,7 @@ int __netif_rx(struct sk_buff *skb) { int ret; - lockdep_assert_once(hardirq_count() | softirq_count()); + lockdep_assert_softirq_will_run(); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); -- cgit v1.2.3 From fc93db153b0187a9047f00bfd5cce75108530593 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 12 Mar 2022 13:45:05 -0800 Subject: net: disable preemption in dev_core_stats_XXX_inc() helpers syzbot was kind enough to remind us that dev->{tx_dropped|rx_dropped} could be increased in process context. 
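The splat below shows the preemptible path; the fix brackets the per-cpu access with preempt_disable()/preempt_enable(), along the lines of this sketch of the updated helper body:

	struct net_device_core_stats *p;

	preempt_disable();	/* this_cpu_ptr() needs a stable CPU */
	p = dev_core_stats(dev);
	if (p)
		local_inc(&p->FIELD);	/* FIELD is the macro argument */
	preempt_enable();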
BUG: using smp_processor_id() in preemptible [00000000] code: syz-executor413/3593 caller is netdev_core_stats_alloc+0x98/0x110 net/core/dev.c:10298 CPU: 1 PID: 3593 Comm: syz-executor413 Not tainted 5.17.0-rc7-syzkaller-02426-g97aeb877de7f #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 check_preemption_disabled+0x16b/0x170 lib/smp_processor_id.c:49 netdev_core_stats_alloc+0x98/0x110 net/core/dev.c:10298 dev_core_stats include/linux/netdevice.h:3855 [inline] dev_core_stats_rx_dropped_inc include/linux/netdevice.h:3866 [inline] tun_get_user+0x3455/0x3ab0 drivers/net/tun.c:1800 tun_chr_write_iter+0xe1/0x200 drivers/net/tun.c:2015 call_write_iter include/linux/fs.h:2074 [inline] new_sync_write+0x431/0x660 fs/read_write.c:503 vfs_write+0x7cd/0xae0 fs/read_write.c:590 ksys_write+0x12d/0x250 fs/read_write.c:643 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f2cf4f887e3 Code: 5d 41 5c 41 5d 41 5e e9 9b fd ff ff 66 2e 0f 1f 84 00 00 00 00 00 90 64 8b 04 25 18 00 00 00 85 c0 75 14 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 55 c3 0f 1f 40 00 48 83 ec 28 48 89 54 24 18 RSP: 002b:00007ffd50dd5fd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007ffd50dd6000 RCX: 00007f2cf4f887e3 RDX: 000000000000002a RSI: 0000000000000000 RDI: 00000000000000c8 RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffd50dd5ff0 R14: 00007ffd50dd5fe8 R15: 00007ffd50dd5fe4 Fixes: 625788b58445 ("net: add per-cpu storage and net->core_stats") Signed-off-by: Eric Dumazet Cc: jeffreyji Cc: Brian Vazquez Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220312214505.3294762-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0d994710b335..8cbe96ce0a2c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3858,10 +3858,14 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de #define DEV_CORE_STATS_INC(FIELD) \ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ { \ - struct net_device_core_stats *p = dev_core_stats(dev); \ + struct net_device_core_stats *p; \ + \ + preempt_disable(); \ + p = dev_core_stats(dev); \ \ if (p) \ local_inc(&p->FIELD); \ + preempt_enable(); \ } DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) -- cgit v1.2.3 From 938d3480b92fa5e454b7734294f12a7b75126f09 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 4 Mar 2022 16:11:42 +0800 Subject: bpf, sockmap: Fix memleak in sk_psock_queue_msg If tcp_bpf_sendmsg is running during a tear down operation we may enqueue data on the ingress msg queue while tear down is trying to free it. sk1 (redirect sk2) sk2 ------------------- --------------- tcp_bpf_sendmsg() tcp_bpf_send_verdict() tcp_bpf_sendmsg_redir() bpf_tcp_ingress() sock_map_close() lock_sock() lock_sock() ... 
blocking sk_psock_stop sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); release_sock(sk); lock_sock() sk_mem_charge() get_page() sk_psock_queue_msg() sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED); drop_sk_msg() release_sock() In drop_sk_msg(), the msg has charged memory from sk via sk_mem_charge and has sg pages that need to be put. To fix this, we use sk_msg_free() and then kfree() the msg. This issue can cause the following warnings: WARNING: CPU: 0 PID: 9202 at net/core/stream.c:205 sk_stream_kill_queues+0xc8/0xe0 Call Trace: inet_csk_destroy_sock+0x55/0x110 tcp_rcv_state_process+0xe5f/0xe90 ? sk_filter_trim_cap+0x10d/0x230 ? tcp_v4_do_rcv+0x161/0x250 tcp_v4_do_rcv+0x161/0x250 tcp_v4_rcv+0xc3a/0xce0 ip_protocol_deliver_rcu+0x3d/0x230 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0xfd/0x110 ? ip_protocol_deliver_rcu+0x230/0x230 ip_rcv+0xd6/0x100 ? ip_local_deliver+0x110/0x110 __netif_receive_skb_one_core+0x85/0xa0 process_backlog+0xa4/0x160 __napi_poll+0x29/0x1b0 net_rx_action+0x287/0x300 __do_softirq+0xff/0x2fc do_softirq+0x79/0x90 WARNING: CPU: 0 PID: 531 at net/ipv4/af_inet.c:154 inet_sock_destruct+0x175/0x1b0 Call Trace: __sk_destruct+0x24/0x1f0 sk_psock_destroy+0x19b/0x1c0 process_one_work+0x1b3/0x3c0 ? process_one_work+0x3c0/0x3c0 worker_thread+0x30/0x350 ? process_one_work+0x3c0/0x3c0 kthread+0xe6/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 Fixes: 9635720b7c88 ("bpf, sockmap: Fix memleak on ingress msg enqueue") Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220304081145.2037182-2-wangyufen@huawei.com --- include/linux/skmsg.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index fdb5375f0562..c5a2d6f50f25 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -304,21 +304,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg) -{ - if (msg->skb) - sock_drop(psock->sk, msg->skb); - kfree(msg); -} - static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); else - drop_sk_msg(psock, msg); + else { + sk_msg_free(psock->sk, msg); + kfree(msg); + } spin_unlock_bh(&psock->ingress_lock); } -- cgit v1.2.3 From d2a3b7c5becc3992f8e7d2b9bf5eacceeedb9a48 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 9 Mar 2022 20:33:20 +0800 Subject: bpf: Fix net.core.bpf_jit_harden race It is the bpf_jit_harden counterpart to commit 60b58afc96c9 ("bpf: fix net.core.bpf_jit_enable race"). bpf_jit_harden is tested twice for each subprog if there are subprogs in the bpf program, and constant blinding may increase the length of the program, so when running "./test_progs -t subprogs" and toggling bpf_jit_harden between 0 and 2, jit_subprogs may fail because constant blinding increases the length of subprog instructions during the extra pass. So cache the value of bpf_jit_blinding_enabled() during program allocation, and use the cached value during constant blinding, subprog JITing and args tracking of tail calls.
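In outline, the cached bit decouples all later passes from the writable sysctl (lines adapted from the diff below):

	/* bpf_prog_alloc_no_stats(): sample the sysctl exactly once */
	fp->blinding_requested = bpf_jit_blinding_enabled(fp);

	/* bpf_jit_blind_constants() and jit_subprogs() then reuse the
	 * cached value instead of re-reading bpf_jit_harden
	 */
	if (!prog->blinding_requested || prog->blinded)
		return prog;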
Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220309123321.2400262-4-houtao1@huawei.com --- include/linux/filter.h | 1 + kernel/bpf/core.c | 3 ++- kernel/bpf/verifier.c | 5 +++-- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 05ed9bd31b45..ed0c0ff42ad5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -566,6 +566,7 @@ struct bpf_prog { gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ + blinding_requested:1, /* needs constant blinding */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ab630f773ec1..1324f9523e7c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -105,6 +105,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); + fp->blinding_requested = bpf_jit_blinding_enabled(fp); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -1382,7 +1383,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog) || prog->blinded) + if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0db6cd8dcb35..cf92f9c01556 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13023,6 +13023,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab; func[i]->aux->linfo = prog->aux->linfo; @@ -13146,6 +13147,7 @@ out_free: out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; + prog->blinding_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (!bpf_pseudo_call(insn)) continue; @@ -13239,7 +13241,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; enum bpf_attach_type eatype = prog->expected_attach_type; - bool expect_blinding = bpf_jit_blinding_enabled(prog); enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; @@ -13403,7 +13404,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->bpf_capable && !expect_blinding && + if (env->bpf_capable && !prog->blinding_requested && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && -- cgit v1.2.3 From 4206fe40b2c01fb5988980cff6ea958b16578026 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 27 Apr 2021 16:30:32 +0300 Subject: net/mlx5: Remove unused exported contiguous coherent buffer allocation API All WQ types moved to using the fragmented allocation API for coherent memory. Contiguous API is not used anymore. Remove it, reduce the number of exported functions. 
Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/alloc.c | 47 ------------------------- include/linux/mlx5/driver.h | 3 -- 2 files changed, 50 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c index d5408f6ce5a7..1762c5c36042 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -71,53 +71,6 @@ static void *mlx5_dma_zalloc_coherent_node(struct mlx5_core_dev *dev, return cpu_handle; } -static int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, - struct mlx5_frag_buf *buf, int node) -{ - dma_addr_t t; - - buf->size = size; - buf->npages = 1; - buf->page_shift = (u8)get_order(size) + PAGE_SHIFT; - - buf->frags = kzalloc(sizeof(*buf->frags), GFP_KERNEL); - if (!buf->frags) - return -ENOMEM; - - buf->frags->buf = mlx5_dma_zalloc_coherent_node(dev, size, - &t, node); - if (!buf->frags->buf) - goto err_out; - - buf->frags->map = t; - - while (t & ((1 << buf->page_shift) - 1)) { - --buf->page_shift; - buf->npages *= 2; - } - - return 0; -err_out: - kfree(buf->frags); - return -ENOMEM; -} - -int mlx5_buf_alloc(struct mlx5_core_dev *dev, - int size, struct mlx5_frag_buf *buf) -{ - return mlx5_buf_alloc_node(dev, size, buf, dev->priv.numa_node); -} -EXPORT_SYMBOL(mlx5_buf_alloc); - -void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf) -{ - dma_free_coherent(mlx5_core_dma_dev(dev), buf->size, buf->frags->buf, - buf->frags->map); - - kfree(buf->frags); -} -EXPORT_SYMBOL_GPL(mlx5_buf_free); - int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, struct mlx5_frag_buf *buf, int node) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 00a914b0716e..a386aec1eb65 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1009,9 +1009,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health); void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev); -int mlx5_buf_alloc(struct mlx5_core_dev *dev, - int size, struct mlx5_frag_buf *buf); -void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf); int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, struct mlx5_frag_buf *buf, int node); void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf); -- cgit v1.2.3 From 770c9a3a01af178a90368a78c75eb91707c7233c Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 20 May 2021 15:34:57 +0300 Subject: net/mlx5: Remove unused fill page array API function mlx5_fill_page_array API function is not used. Remove it, reduce the number of exported functions. 
Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/alloc.c | 13 ------------- include/linux/mlx5/driver.h | 1 - 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c index 1762c5c36042..e52b0bac09da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c @@ -239,19 +239,6 @@ void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db) } EXPORT_SYMBOL_GPL(mlx5_db_free); -void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas) -{ - u64 addr; - int i; - - for (i = 0; i < buf->npages; i++) { - addr = buf->frags->map + (i << buf->page_shift); - - pas[i] = cpu_to_be64(addr); - } -} -EXPORT_SYMBOL_GPL(mlx5_fill_page_array); - void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm) { int i; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a386aec1eb65..96cd740d94a3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1036,7 +1036,6 @@ int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev); void mlx5_register_debugfs(void); void mlx5_unregister_debugfs(void); -void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn); -- cgit v1.2.3 From cceac97afa090284b3ceecd93ea6b7b527116767 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:49 +0100 Subject: net: bridge: mst: Add helper to map an MSTI to a VID set br_mst_get_info answers the question: "On this bridge, which VIDs are mapped to the given MSTI?" This is useful in switchdev drivers, which might have to fan-out operations, relating to an MSTI, per VLAN. An example: When a port's MST state changes from forwarding to blocking, a driver may choose to flush the dynamic FDB entries on that port to get faster reconvergence of the network, but this should only be done in the VLANs that are managed by the MSTI in question. 
Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 7 +++++++ net/bridge/br_mst.c | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 3aae023a9353..1cf0cc46d90d 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -119,6 +119,7 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); +int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids); #else static inline bool br_vlan_enabled(const struct net_device *dev) { @@ -151,6 +152,12 @@ static inline int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, { return -EINVAL; } + +static inline int br_mst_get_info(const struct net_device *dev, u16 msti, + unsigned long *vids) +{ + return -EINVAL; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 00935a19afcc..00b36e629224 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -13,6 +13,32 @@ DEFINE_STATIC_KEY_FALSE(br_mst_used); +int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) +{ + const struct net_bridge_vlan_group *vg; + const struct net_bridge_vlan *v; + const struct net_bridge *br; + + ASSERT_RTNL(); + + if (!netif_is_bridge_master(dev)) + return -EINVAL; + + br = netdev_priv(dev); + if (!br_opt_get(br, BROPT_MST_ENABLED)) + return -EINVAL; + + vg = br_vlan_group(br); + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (v->msti == msti) + __set_bit(v->vid, vids); + } + + return 0; +} +EXPORT_SYMBOL_GPL(br_mst_get_info); + static void br_mst_vlan_set_state(struct net_bridge_port *p, struct net_bridge_vlan *v, u8 state) { -- cgit v1.2.3 From 48d57b2e5f439246c4e12ed7705a5e3241294b03 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:50 +0100 Subject: net: bridge: mst: Add helper to check if MST is enabled This is useful for switchdev drivers that might want to refuse to join a bridge where MST is enabled, if the hardware can't support it. 
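For example (a hedged sketch; the foo_* driver types are illustrative), a bridge-join handler might bail out early:

	static int foo_port_bridge_join(struct foo_port *port,
					struct net_device *br_dev,
					struct netlink_ext_ack *extack)
	{
		if (br_mst_enabled(br_dev) && !port->chip->mst_capable) {
			NL_SET_ERR_MSG_MOD(extack,
					   "hardware cannot offload MST");
			return -EOPNOTSUPP;
		}

		return foo_port_bridge_join_common(port, br_dev, extack);
	}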
Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 6 ++++++ net/bridge/br_mst.c | 9 +++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 1cf0cc46d90d..4efd5540279a 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -119,6 +119,7 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); +bool br_mst_enabled(const struct net_device *dev); int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids); #else static inline bool br_vlan_enabled(const struct net_device *dev) @@ -153,6 +154,11 @@ static inline int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, return -EINVAL; } +static inline bool br_mst_enabled(const struct net_device *dev) +{ + return false; +} + static inline int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) { diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 00b36e629224..830a5746479f 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -13,6 +13,15 @@ DEFINE_STATIC_KEY_FALSE(br_mst_used); +bool br_mst_enabled(const struct net_device *dev) +{ + if (!netif_is_bridge_master(dev)) + return false; + + return br_opt_get(netdev_priv(dev), BROPT_MST_ENABLED); +} +EXPORT_SYMBOL_GPL(br_mst_enabled); + int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) { const struct net_bridge_vlan_group *vg; -- cgit v1.2.3 From f54fd0e163068ced4d0d27db64cd3cbda95b74e4 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:51 +0100 Subject: net: bridge: mst: Add helper to query a port's MST state This is useful for switchdev drivers who are offloading MST states into hardware. As an example, a driver may wish to flush the FDB for a port when it transitions from forwarding to blocking - which means that the previous state must be discoverable. 
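A hedged sketch of that flush-on-transition logic (the foo_* names are illustrative; note that the dev argument is the bridge *port* netdevice, matching br_port_get_check_rtnl() in the helper):

	static int foo_port_set_mst_state(struct foo_port *port,
					  u16 msti, u8 new_state)
	{
		u8 cur_state;
		int err;

		err = br_mst_get_state(port->netdev, msti, &cur_state);
		if (err)
			return err;

		/* forwarding -> blocking: age out dynamic entries faster */
		if (cur_state == BR_STATE_FORWARDING &&
		    new_state == BR_STATE_BLOCKING)
			foo_fast_age_msti(port, msti);	/* stand-in */

		return foo_hw_write_mst_state(port, msti, new_state);
	}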
Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 6 ++++++ net/bridge/br_mst.c | 25 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 4efd5540279a..d62ef428e3aa 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -121,6 +121,7 @@ int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); bool br_mst_enabled(const struct net_device *dev); int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids); +int br_mst_get_state(const struct net_device *dev, u16 msti, u8 *state); #else static inline bool br_vlan_enabled(const struct net_device *dev) { @@ -164,6 +165,11 @@ static inline int br_mst_get_info(const struct net_device *dev, u16 msti, { return -EINVAL; } +static inline int br_mst_get_state(const struct net_device *dev, u16 msti, + u8 *state) +{ + return -EINVAL; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 830a5746479f..ee680adcee17 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -48,6 +48,31 @@ int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) } EXPORT_SYMBOL_GPL(br_mst_get_info); +int br_mst_get_state(const struct net_device *dev, u16 msti, u8 *state) +{ + const struct net_bridge_port *p = NULL; + const struct net_bridge_vlan_group *vg; + const struct net_bridge_vlan *v; + + ASSERT_RTNL(); + + p = br_port_get_check_rtnl(dev); + if (!p || !br_opt_get(p->br, BROPT_MST_ENABLED)) + return -EINVAL; + + vg = nbp_vlan_group(p); + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (v->brvlan->msti == msti) { + *state = v->state; + return 0; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL_GPL(br_mst_get_state); + static void br_mst_vlan_set_state(struct net_bridge_port *p, struct net_bridge_vlan *v, u8 state) { -- cgit v1.2.3 From 4f554e955614f19425cee86de4669351741a6280 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 15 Mar 2022 23:00:26 +0900 Subject: ftrace: Add ftrace_set_filter_ips function Adding ftrace_set_filter_ips function to be able to set filter on multiple ip addresses at once. With the kprobe multi attach interface we have cases where we need to initialize ftrace_ops object with thousands of functions, so having single function diving into ftrace_hash_move_and_update_ops with ftrace_lock is faster. The functions ips are passed as unsigned long array with count. 
Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735282673.1084943.18310504594134769804.stgit@devnote2 --- include/linux/ftrace.h | 3 +++ kernel/trace/ftrace.c | 58 ++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9999e29187de..60847cbce0da 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -512,6 +512,8 @@ struct dyn_ftrace { int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset); +int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips, + unsigned int cnt, int remove, int reset); int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, @@ -802,6 +804,7 @@ static inline unsigned long ftrace_location(unsigned long ip) #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) #define ftrace_set_early_filter(ops, buf, enable) do { } while (0) #define ftrace_set_filter_ip(ops, ip, remove, reset) ({ -ENODEV; }) +#define ftrace_set_filter_ips(ops, ips, cnt, remove, reset) ({ -ENODEV; }) #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_free_filter(ops) do { } while (0) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a4b462b6f944..93e992962ada 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4958,7 +4958,7 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static int -ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) { struct ftrace_func_entry *entry; @@ -4976,9 +4976,30 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } +static int +ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips, + unsigned int cnt, int remove) +{ + unsigned int i; + int err; + + for (i = 0; i < cnt; i++) { + err = __ftrace_match_addr(hash, ips[i], remove); + if (err) { + /* + * This expects the @hash is a temporary hash and if this + * fails the caller must free the @hash. 
+ */ + return err; + } + } + return 0; +} + static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, - unsigned long ip, int remove, int reset, int enable) + unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -5008,8 +5029,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, ret = -EINVAL; goto out_regex_unlock; } - if (ip) { - ret = ftrace_match_addr(hash, ip, remove); + if (ips) { + ret = ftrace_match_addr(hash, ips, cnt, remove); if (ret < 0) goto out_regex_unlock; } @@ -5026,10 +5047,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } static int -ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, - int reset, int enable) +ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt, + int remove, int reset, int enable) { - return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable); } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS @@ -5634,10 +5655,29 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) { ftrace_ops_init(ops); - return ftrace_set_addr(ops, ip, remove, reset, 1); + return ftrace_set_addr(ops, &ip, 1, remove, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); +/** + * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses + * @ops - the ops to set the filter with + * @ips - the array of addresses to add to or remove from the filter. + * @cnt - the number of addresses in @ips + * @remove - non zero to remove ips from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If the @ips array or any ip within it is NULL, the filter update fails. + */ +int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips, + unsigned int cnt, int remove, int reset) +{ + ftrace_ops_init(ops); + return ftrace_set_addr(ops, ips, cnt, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ips); + /** * ftrace_ops_set_global_filter - setup ops to use global filters * @ops - the ops which will use the global filters @@ -5659,7 +5699,7 @@ static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) { - return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); + return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable); } /** -- cgit v1.2.3 From cad9931f64dc7f5dbdec12cae9f30063360f9855 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 15 Mar 2022 23:00:38 +0900 Subject: fprobe: Add ftrace based probe APIs The fprobe is a wrapper API for the ftrace function tracer. Unlike kprobes, this probe only supports the function entry, but it can probe multiple functions with one fprobe. The usage is similar: the user sets their callback to fprobe::entry_handler and calls register_fprobe*() with the probed functions. There are 3 registration interfaces, - register_fprobe() takes filtering patterns of the function names. - register_fprobe_ips() takes an array of ftrace-location addresses. - register_fprobe_syms() takes an array of function names. The registered fprobes can be unregistered with unregister_fprobe(). e.g. struct fprobe fp = { .entry_handler = user_handler }; const char *targets[] = { "func1", "func2", "func3"}; ... ret = register_fprobe_syms(&fp, targets, ARRAY_SIZE(targets)); ...
unregister_fprobe(&fp); Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735283857.1084943.1154436951479395551.stgit@devnote2 --- include/linux/fprobe.h | 87 ++++++++++++++++++++ kernel/trace/Kconfig | 12 +++ kernel/trace/Makefile | 1 + kernel/trace/fprobe.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 311 insertions(+) create mode 100644 include/linux/fprobe.h create mode 100644 kernel/trace/fprobe.c (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h new file mode 100644 index 000000000000..2ba099aff041 --- /dev/null +++ b/include/linux/fprobe.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Simple ftrace probe wrapper */ +#ifndef _LINUX_FPROBE_H +#define _LINUX_FPROBE_H + +#include +#include + +/** + * struct fprobe - ftrace based probe. + * @ops: The ftrace_ops. + * @nmissed: The counter for missing events. + * @flags: The status flag. + * @entry_handler: The callback function for function entry. + */ +struct fprobe { +#ifdef CONFIG_FUNCTION_TRACER + /* + * If CONFIG_FUNCTION_TRACER is not set, CONFIG_FPROBE is disabled too. + * But user of fprobe may keep embedding the struct fprobe on their own + * code. To avoid build error, this will keep the fprobe data structure + * defined here, but remove ftrace_ops data structure. + */ + struct ftrace_ops ops; +#endif + unsigned long nmissed; + unsigned int flags; + void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs); +}; + +#define FPROBE_FL_DISABLED 1 + +static inline bool fprobe_disabled(struct fprobe *fp) +{ + return (fp) ? fp->flags & FPROBE_FL_DISABLED : false; +} + +#ifdef CONFIG_FPROBE +int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter); +int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); +int register_fprobe_syms(struct fprobe *fp, const char **syms, int num); +int unregister_fprobe(struct fprobe *fp); +#else +static inline int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) +{ + return -EOPNOTSUPP; +} +static inline int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) +{ + return -EOPNOTSUPP; +} +static inline int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) +{ + return -EOPNOTSUPP; +} +static inline int unregister_fprobe(struct fprobe *fp) +{ + return -EOPNOTSUPP; +} +#endif + +/** + * disable_fprobe() - Disable fprobe + * @fp: The fprobe to be disabled. + * + * This will soft-disable @fp. Note that this doesn't remove the ftrace + * hooks from the function entry. + */ +static inline void disable_fprobe(struct fprobe *fp) +{ + if (fp) + fp->flags |= FPROBE_FL_DISABLED; +} + +/** + * enable_fprobe() - Enable fprobe + * @fp: The fprobe to be enabled. + * + * This will soft-enable @fp. 
+ */ +static inline void enable_fprobe(struct fprobe *fp) +{ + if (fp) + fp->flags &= ~FPROBE_FL_DISABLED; +} + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5eb5e7fd624..7ce31abc542b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -236,6 +236,18 @@ config DYNAMIC_FTRACE_WITH_ARGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS +config FPROBE + bool "Kernel Function Probe (fprobe)" + depends on FUNCTION_TRACER + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + This option enables kernel function probe (fprobe) based on ftrace, + which is similar to kprobes, but probes only for kernel function + entries and it can probe multiple functions by one fprobe. + + If unsure, say N. + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index bedc5caceec7..79255f9de9a4 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -97,6 +97,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o +obj-$(CONFIG_FPROBE) += fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c new file mode 100644 index 000000000000..7e8ceee339a0 --- /dev/null +++ b/kernel/trace/fprobe.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fprobe - Simple ftrace probe wrapper for function entry. + */ +#define pr_fmt(fmt) "fprobe: " fmt + +#include +#include +#include +#include +#include +#include + +static void fprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe *fp; + int bit; + + fp = container_of(ops, struct fprobe, ops); + if (fprobe_disabled(fp)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) { + fp->nmissed++; + return; + } + + if (fp->entry_handler) + fp->entry_handler(fp, ip, ftrace_get_regs(fregs)); + + ftrace_test_recursion_unlock(bit); +} +NOKPROBE_SYMBOL(fprobe_handler); + +/* Convert ftrace location address from symbols */ +static unsigned long *get_ftrace_locations(const char **syms, int num) +{ + unsigned long addr, size; + unsigned long *addrs; + int i; + + /* Convert symbols to symbol address */ + addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < num; i++) { + addr = kallsyms_lookup_name(syms[i]); + if (!addr) /* Maybe wrong symbol */ + goto error; + + /* Convert symbol address to ftrace location. */ + if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size) + goto error; + + addr = ftrace_location_range(addr, addr + size - 1); + if (!addr) /* No dynamic ftrace there. */ + goto error; + + addrs[i] = addr; + } + + return addrs; + +error: + kfree(addrs); + + return ERR_PTR(-ENOENT); +} + +static void fprobe_init(struct fprobe *fp) +{ + fp->nmissed = 0; + fp->ops.func = fprobe_handler; + fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; +} + +/** + * register_fprobe() - Register fprobe to ftrace by pattern. + * @fp: A fprobe data structure to be registered. + * @filter: A wildcard pattern of probed symbols. + * @notfilter: A wildcard pattern of NOT probed symbols. + * + * Register @fp to ftrace for enabling the probe on the symbols matched to @filter. + * If @notfilter is not NULL, the symbols matched the @notfilter are not probed. 
+ * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) +{ + unsigned char *str; + int ret, len; + + if (!fp || !filter) + return -EINVAL; + + fprobe_init(fp); + + len = strlen(filter); + str = kstrdup(filter, GFP_KERNEL); + ret = ftrace_set_filter(&fp->ops, str, len, 0); + kfree(str); + if (ret) + return ret; + + if (notfilter) { + len = strlen(notfilter); + str = kstrdup(notfilter, GFP_KERNEL); + ret = ftrace_set_notrace(&fp->ops, str, len, 0); + kfree(str); + if (ret) + goto out; + } + + ret = register_ftrace_function(&fp->ops); +out: + if (ret) + ftrace_free_filter(&fp->ops); + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe); + +/** + * register_fprobe_ips() - Register fprobe to ftrace by address. + * @fp: A fprobe data structure to be registered. + * @addrs: An array of target ftrace location addresses. + * @num: The number of entries of @addrs. + * + * Register @fp to ftrace for enabling the probe on the addresses given by @addrs. + * The entries of @addrs must be ftrace location addresses, which may be + * the symbol address + an arch-dependent offset. + * If you are unsure what this means, please use the other registration functions. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) +{ + int ret; + + if (!fp || !addrs || num <= 0) + return -EINVAL; + + fprobe_init(fp); + + ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + if (!ret) + ret = register_ftrace_function(&fp->ops); + + if (ret) + ftrace_free_filter(&fp->ops); + + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_ips); + +/** + * register_fprobe_syms() - Register fprobe to ftrace by symbols. + * @fp: A fprobe data structure to be registered. + * @syms: An array of target symbols. + * @num: The number of entries of @syms. + * + * Register @fp to the symbols given by @syms array. This will be useful if + * you are sure the symbols exist in the kernel. + * + * Return 0 if @fp is registered successfully, -errno if not. + */ +int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) +{ + unsigned long *addrs; + int ret; + + if (!fp || !syms || num <= 0) + return -EINVAL; + + addrs = get_ftrace_locations(syms, num); + if (IS_ERR(addrs)) + return PTR_ERR(addrs); + + ret = register_fprobe_ips(fp, addrs, num); + + kfree(addrs); + + return ret; +} +EXPORT_SYMBOL_GPL(register_fprobe_syms); + +/** + * unregister_fprobe() - Unregister fprobe from ftrace + * @fp: A fprobe data structure to be unregistered. + * + * Unregister fprobe (and remove ftrace hooks from the function entries). + * + * Return 0 if @fp is unregistered successfully, -errno if not. + */ +int unregister_fprobe(struct fprobe *fp) +{ + int ret; + + if (!fp || fp->ops.func != fprobe_handler) + return -EINVAL; + + ret = unregister_ftrace_function(&fp->ops); + + if (!ret) + ftrace_free_filter(&fp->ops); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_fprobe); -- cgit v1.2.3 From 54ecbe6f1ed5138c895bdff55608cf502755b20e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 15 Mar 2022 23:00:50 +0900 Subject: rethook: Add a generic return hook Add a return hook framework which hooks the function return. Most of the logic came from the kretprobe, but this is independent of kretprobe. Note that this is expected to be used with other function-entry hooking features, like ftrace, fprobe, and kprobes. Eventually this will replace the kretprobe (e.g.
kprobe + rethook = kretprobe), but at this moment, this is just an additional hook. Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735285066.1084943.9259661137330166643.stgit@devnote2 --- include/linux/rethook.h | 100 +++++++++++++++ include/linux/sched.h | 3 + kernel/exit.c | 2 + kernel/fork.c | 3 + kernel/trace/Kconfig | 11 ++ kernel/trace/Makefile | 1 + kernel/trace/rethook.c | 317 ++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 437 insertions(+) create mode 100644 include/linux/rethook.h create mode 100644 kernel/trace/rethook.c (limited to 'include/linux') diff --git a/include/linux/rethook.h b/include/linux/rethook.h new file mode 100644 index 000000000000..c8ac1e5afcd1 --- /dev/null +++ b/include/linux/rethook.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Return hooking with list-based shadow stack. + */ +#ifndef _LINUX_RETHOOK_H +#define _LINUX_RETHOOK_H + +#include +#include +#include +#include +#include +#include + +struct rethook_node; + +typedef void (*rethook_handler_t) (struct rethook_node *, void *, struct pt_regs *); + +/** + * struct rethook - The rethook management data structure. + * @data: The user-defined data storage. + * @handler: The user-defined return hook handler. + * @pool: The pool of struct rethook_node. + * @ref: The reference counter. + * @rcu: The rcu_head for deferred freeing. + * + * Don't embed to another data structure, because this is a self-destructive + * data structure when all rethook_node are freed. + */ +struct rethook { + void *data; + rethook_handler_t handler; + struct freelist_head pool; + refcount_t ref; + struct rcu_head rcu; +}; + +/** + * struct rethook_node - The rethook shadow-stack entry node. + * @freelist: The freelist, linked to struct rethook::pool. + * @rcu: The rcu_head for deferred freeing. + * @llist: The llist, linked to a struct task_struct::rethooks. + * @rethook: The pointer to the struct rethook. + * @ret_addr: The storage for the real return address. + * @frame: The storage for the frame pointer. + * + * You can embed this to your extended data structure to store any data + * on each entry of the shadow stack. + */ +struct rethook_node { + union { + struct freelist_node freelist; + struct rcu_head rcu; + }; + struct llist_node llist; + struct rethook *rethook; + unsigned long ret_addr; + unsigned long frame; +}; + +struct rethook *rethook_alloc(void *data, rethook_handler_t handler); +void rethook_free(struct rethook *rh); +void rethook_add_node(struct rethook *rh, struct rethook_node *node); +struct rethook_node *rethook_try_get(struct rethook *rh); +void rethook_recycle(struct rethook_node *node); +void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount); +unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame, + struct llist_node **cur); + +/* Arch dependent code must implement arch_* and trampoline code */ +void arch_rethook_prepare(struct rethook_node *node, struct pt_regs *regs, bool mcount); +void arch_rethook_trampoline(void); + +/** + * is_rethook_trampoline() - Check whether the address is rethook trampoline + * @addr: The address to be checked + * + * Return true if the @addr is the rethook trampoline address. 
+ */ +static inline bool is_rethook_trampoline(unsigned long addr) +{ + return addr == (unsigned long)dereference_symbol_descriptor(arch_rethook_trampoline); +} + +/* If the architecture needs to fixup the return address, implement it. */ +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr); + +/* Generic trampoline handler, arch code must prepare asm stub */ +unsigned long rethook_trampoline_handler(struct pt_regs *regs, + unsigned long frame); + +#ifdef CONFIG_RETHOOK +void rethook_flush_task(struct task_struct *tk); +#else +#define rethook_flush_task(tsk) do { } while (0) +#endif + +#endif + diff --git a/include/linux/sched.h b/include/linux/sched.h index 75ba8aa60248..7034f53404e3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1481,6 +1481,9 @@ struct task_struct { #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif +#ifdef CONFIG_RETHOOK + struct llist_head rethooks; +#endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* diff --git a/kernel/exit.c b/kernel/exit.c index b00a25bb4ab9..2d1803fa8fe6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -169,6 +170,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); + rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index a024bf6254df..3db1a4110a25 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2255,6 +2255,9 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; #endif +#ifdef CONFIG_RETHOOK + p->rethooks.first = NULL; +#endif /* * Ensure that the cgroup subsystem policies allow the new process to be diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 7ce31abc542b..e75504e42ab8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT config NOP_TRACER bool +config HAVE_RETHOOK + bool + +config RETHOOK + bool + depends on HAVE_RETHOOK + help + Enable generic return hooking feature. This is an internal + API, which will be used by other function-entry hooking + features like fprobe and kprobes. + config HAVE_FUNCTION_TRACER bool help diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 79255f9de9a4..c6f11a139eac 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -98,6 +98,7 @@ obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_FPROBE) += fprobe.o +obj-$(CONFIG_RETHOOK) += rethook.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c new file mode 100644 index 000000000000..ab463a4d2b23 --- /dev/null +++ b/kernel/trace/rethook.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define pr_fmt(fmt) "rethook: " fmt + +#include +#include +#include +#include +#include +#include +#include + +/* Return hook list (shadow stack by list) */ + +/* + * This function is called from delayed_put_task_struct() when a task is + * dead and cleaned up to recycle any kretprobe instances associated with + * this task. These left over instances represent probed functions that + * have been called but will never return. 
+ */ +void rethook_flush_task(struct task_struct *tk) +{ + struct rethook_node *rhn; + struct llist_node *node; + + node = __llist_del_all(&tk->rethooks); + while (node) { + rhn = container_of(node, struct rethook_node, llist); + node = node->next; + preempt_disable(); + rethook_recycle(rhn); + preempt_enable(); + } +} + +static void rethook_free_rcu(struct rcu_head *head) +{ + struct rethook *rh = container_of(head, struct rethook, rcu); + struct rethook_node *rhn; + struct freelist_node *node; + int count = 1; + + node = rh->pool.head; + while (node) { + rhn = container_of(node, struct rethook_node, freelist); + node = node->next; + kfree(rhn); + count++; + } + + /* The rh->ref is the number of pooled node + 1 */ + if (refcount_sub_and_test(count, &rh->ref)) + kfree(rh); +} + +/** + * rethook_free() - Free struct rethook. + * @rh: the struct rethook to be freed. + * + * Free the rethook. Before calling this function, user must ensure the + * @rh::data is cleaned if needed (or, the handler can access it after + * calling this function.) This function will set the @rh to be freed + * after all rethook_node are freed (not soon). And the caller must + * not touch @rh after calling this. + */ +void rethook_free(struct rethook *rh) +{ + rcu_assign_pointer(rh->handler, NULL); + + call_rcu(&rh->rcu, rethook_free_rcu); +} + +/** + * rethook_alloc() - Allocate struct rethook. + * @data: a data to pass the @handler when hooking the return. + * @handler: the return hook callback function. + * + * Allocate and initialize a new rethook with @data and @handler. + * Return NULL if memory allocation fails or @handler is NULL. + * Note that @handler == NULL means this rethook is going to be freed. + */ +struct rethook *rethook_alloc(void *data, rethook_handler_t handler) +{ + struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL); + + if (!rh || !handler) + return NULL; + + rh->data = data; + rh->handler = handler; + rh->pool.head = NULL; + refcount_set(&rh->ref, 1); + + return rh; +} + +/** + * rethook_add_node() - Add a new node to the rethook. + * @rh: the struct rethook. + * @node: the struct rethook_node to be added. + * + * Add @node to @rh. User must allocate @node (as a part of user's + * data structure.) The @node fields are initialized in this function. + */ +void rethook_add_node(struct rethook *rh, struct rethook_node *node) +{ + node->rethook = rh; + freelist_add(&node->freelist, &rh->pool); + refcount_inc(&rh->ref); +} + +static void free_rethook_node_rcu(struct rcu_head *head) +{ + struct rethook_node *node = container_of(head, struct rethook_node, rcu); + + if (refcount_dec_and_test(&node->rethook->ref)) + kfree(node->rethook); + kfree(node); +} + +/** + * rethook_recycle() - return the node to rethook. + * @node: The struct rethook_node to be returned. + * + * Return back the @node to @node::rethook. If the @node::rethook is already + * marked as freed, this will free the @node. + */ +void rethook_recycle(struct rethook_node *node) +{ + lockdep_assert_preemption_disabled(); + + if (likely(READ_ONCE(node->rethook->handler))) + freelist_add(&node->freelist, &node->rethook->pool); + else + call_rcu(&node->rcu, free_rethook_node_rcu); +} +NOKPROBE_SYMBOL(rethook_recycle); + +/** + * rethook_try_get() - get an unused rethook node. + * @rh: The struct rethook which pools the nodes. + * + * Get an unused rethook node from @rh. If the node pool is empty, this + * will return NULL. Caller must disable preemption. 
+ */ +struct rethook_node *rethook_try_get(struct rethook *rh) +{ + rethook_handler_t handler = READ_ONCE(rh->handler); + struct freelist_node *fn; + + lockdep_assert_preemption_disabled(); + + /* Check whether @rh is going to be freed. */ + if (unlikely(!handler)) + return NULL; + + fn = freelist_try_get(&rh->pool); + if (!fn) + return NULL; + + return container_of(fn, struct rethook_node, freelist); +} +NOKPROBE_SYMBOL(rethook_try_get); + +/** + * rethook_hook() - Hook the current function return. + * @node: The struct rethook node to hook the function return. + * @regs: The struct pt_regs for the function entry. + * @mcount: True if this is called from mcount(ftrace) context. + * + * Hook the current running function return. This must be called at the + * function entry (or at least @regs must hold the registers of the function + * entry.) @mcount is used for identifying the context. If this is called + * from ftrace (mcount) callback, @mcount must be set true. If this is called + * from the real function entry (e.g. kprobes) @mcount must be set false. + * This is because the way to hook the function return depends on the context. + */ +void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount) +{ + arch_rethook_prepare(node, regs, mcount); + __llist_add(&node->llist, &current->rethooks); +} +NOKPROBE_SYMBOL(rethook_hook); + +/* This assumes the 'tsk' is the current task or is not running. */ +static unsigned long __rethook_find_ret_addr(struct task_struct *tsk, + struct llist_node **cur) +{ + struct rethook_node *rh = NULL; + struct llist_node *node = *cur; + + if (!node) + node = tsk->rethooks.first; + else + node = node->next; + + while (node) { + rh = container_of(node, struct rethook_node, llist); + if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) { + *cur = node; + return rh->ret_addr; + } + node = node->next; + } + return 0; +} +NOKPROBE_SYMBOL(__rethook_find_ret_addr); + +/** + * rethook_find_ret_addr -- Find correct return address modified by rethook + * @tsk: Target task + * @frame: A frame pointer + * @cur: a storage of the loop cursor llist_node pointer for next call + * + * Find the correct return address modified by a rethook on @tsk in unsigned + * long type. + * The @tsk must be 'current' or a task which is not running. @frame is a hint + * to get the correct return address - which is compared with the + * rethook::frame field. The @cur is a loop cursor for searching the + * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the + * first call, but '@cur' itself must NOT be NULL. + * + * Returns found address value or zero if not found. + */ +unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame, + struct llist_node **cur) +{ + struct rethook_node *rhn = NULL; + unsigned long ret; + + if (WARN_ON_ONCE(!cur)) + return 0; + + if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + return 0; + + do { + ret = __rethook_find_ret_addr(tsk, cur); + if (!ret) + break; + rhn = container_of(*cur, struct rethook_node, llist); + } while (rhn->frame != frame); + + return ret; +} +NOKPROBE_SYMBOL(rethook_find_ret_addr); + +void __weak arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr) +{ + /* + * Do nothing by default. If the architecture uses a frame pointer + * to record the real return address on the stack, it should fill in + * this function to fix up the return address + * so that stacktrace works from the rethook handler.
+ */ +} + +/* This function will be called from each arch-defined trampoline. */ +unsigned long rethook_trampoline_handler(struct pt_regs *regs, + unsigned long frame) +{ + struct llist_node *first, *node = NULL; + unsigned long correct_ret_addr; + rethook_handler_t handler; + struct rethook_node *rhn; + + correct_ret_addr = __rethook_find_ret_addr(current, &node); + if (!correct_ret_addr) { + pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n"); + BUG_ON(1); + } + + instruction_pointer_set(regs, correct_ret_addr); + + /* + * These loops must be protected from rethook_free_rcu() because those + * are accessing 'rhn->rethook'. + */ + preempt_disable(); + + /* + * Run the handler on the shadow stack. Do not unlink the list here because + * stackdump inside the handlers needs to decode it. + */ + first = current->rethooks.first; + while (first) { + rhn = container_of(first, struct rethook_node, llist); + if (WARN_ON_ONCE(rhn->frame != frame)) + break; + handler = READ_ONCE(rhn->rethook->handler); + if (handler) + handler(rhn, rhn->rethook->data, regs); + + if (first == node) + break; + first = first->next; + } + + /* Fixup registers for returning to correct address. */ + arch_rethook_fixup_return(regs, correct_ret_addr); + + /* Unlink used shadow stack */ + first = current->rethooks.first; + current->rethooks.first = node->next; + node->next = NULL; + + while (first) { + rhn = container_of(first, struct rethook_node, llist); + first = first->next; + rethook_recycle(rhn); + } + preempt_enable(); + + return correct_ret_addr; +} +NOKPROBE_SYMBOL(rethook_trampoline_handler); -- cgit v1.2.3 From 5b0ab78998e32564a011b14c4c7f9c81e2d42b9d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 15 Mar 2022 23:01:48 +0900 Subject: fprobe: Add exit_handler support Add exit_handler to fprobe. fprobe + rethook allows us to hook the kernel function return. The rethook will be enabled only if the fprobe::exit_handler is set. Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735290790.1084943.10601965782208052202.stgit@devnote2 --- include/linux/fprobe.h | 6 +++ kernel/trace/Kconfig | 9 ++-- kernel/trace/fprobe.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 122 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 2ba099aff041..8eefec2b485e 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -5,13 +5,16 @@ #include #include +#include /** * struct fprobe - ftrace based probe. * @ops: The ftrace_ops. * @nmissed: The counter for missing events. * @flags: The status flag. + * @rethook: The rethook data structure. (internal data) * @entry_handler: The callback function for function entry. + * @exit_handler: The callback function for function exit. 
*/ struct fprobe { #ifdef CONFIG_FUNCTION_TRACER @@ -25,7 +28,10 @@ struct fprobe { #endif unsigned long nmissed; unsigned int flags; + struct rethook *rethook; + void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs); + void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs); }; #define FPROBE_FL_DISABLED 1 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e75504e42ab8..99dd4ca63d68 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -251,11 +251,14 @@ config FPROBE bool "Kernel Function Probe (fprobe)" depends on FUNCTION_TRACER depends on DYNAMIC_FTRACE_WITH_REGS + depends on HAVE_RETHOOK + select RETHOOK default n help - This option enables kernel function probe (fprobe) based on ftrace, - which is similar to kprobes, but probes only for kernel function - entries and it can probe multiple functions by one fprobe. + This option enables kernel function probe (fprobe) based on ftrace. + The fprobe is similar to kprobes, but probes only for kernel function + entries and exits. This also can probe multiple functions by one + fprobe. If unsure, say N. diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 7e8ceee339a0..38073632bfe4 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -8,12 +8,22 @@ #include #include #include +#include #include #include +#include "trace.h" + +struct fprobe_rethook_node { + struct rethook_node node; + unsigned long entry_ip; +}; + static void fprobe_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct fprobe_rethook_node *fpr; + struct rethook_node *rh; struct fprobe *fp; int bit; @@ -30,10 +40,37 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip, if (fp->entry_handler) fp->entry_handler(fp, ip, ftrace_get_regs(fregs)); + if (fp->exit_handler) { + rh = rethook_try_get(fp->rethook); + if (!rh) { + fp->nmissed++; + goto out; + } + fpr = container_of(rh, struct fprobe_rethook_node, node); + fpr->entry_ip = ip; + rethook_hook(rh, ftrace_get_regs(fregs), true); + } + +out: ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(fprobe_handler); +static void fprobe_exit_handler(struct rethook_node *rh, void *data, + struct pt_regs *regs) +{ + struct fprobe *fp = (struct fprobe *)data; + struct fprobe_rethook_node *fpr; + + if (!fp || fprobe_disabled(fp)) + return; + + fpr = container_of(rh, struct fprobe_rethook_node, node); + + fp->exit_handler(fp, fpr->entry_ip, regs); +} +NOKPROBE_SYMBOL(fprobe_exit_handler); + /* Convert ftrace location address from symbols */ static unsigned long *get_ftrace_locations(const char **syms, int num) { @@ -77,6 +114,48 @@ static void fprobe_init(struct fprobe *fp) fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; } +static int fprobe_init_rethook(struct fprobe *fp, int num) +{ + int i, size; + + if (num < 0) + return -EINVAL; + + if (!fp->exit_handler) { + fp->rethook = NULL; + return 0; + } + + /* Initialize rethook if needed */ + size = num * num_possible_cpus() * 2; + if (size < 0) + return -E2BIG; + + fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); + for (i = 0; i < size; i++) { + struct rethook_node *node; + + node = kzalloc(sizeof(struct fprobe_rethook_node), GFP_KERNEL); + if (!node) { + rethook_free(fp->rethook); + fp->rethook = NULL; + return -ENOMEM; + } + rethook_add_node(fp->rethook, node); + } + return 0; +} + +static void fprobe_fail_cleanup(struct fprobe *fp) +{ + if (fp->rethook) { + /* Don't need to cleanup rethook->handler 
because this is not used. */ + rethook_free(fp->rethook); + fp->rethook = NULL; + } + ftrace_free_filter(&fp->ops); +} + /** * register_fprobe() - Register fprobe to ftrace by pattern. * @fp: A fprobe data structure to be registered. @@ -90,6 +169,7 @@ static void fprobe_init(struct fprobe *fp) */ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) { + struct ftrace_hash *hash; unsigned char *str; int ret, len; @@ -114,10 +194,21 @@ int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter goto out; } - ret = register_ftrace_function(&fp->ops); + /* TODO: + * correctly calculate the total number of filtered symbols + * from both filter and notfilter. + */ + hash = fp->ops.local_hash.filter_hash; + if (WARN_ON_ONCE(!hash)) + goto out; + + ret = fprobe_init_rethook(fp, (int)hash->count); + if (!ret) + ret = register_ftrace_function(&fp->ops); + out: if (ret) - ftrace_free_filter(&fp->ops); + fprobe_fail_cleanup(fp); return ret; } EXPORT_SYMBOL_GPL(register_fprobe); @@ -145,12 +236,15 @@ int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) fprobe_init(fp); ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0); + if (ret) + return ret; + + ret = fprobe_init_rethook(fp, num); if (!ret) ret = register_ftrace_function(&fp->ops); if (ret) - ftrace_free_filter(&fp->ops); - + fprobe_fail_cleanup(fp); return ret; } EXPORT_SYMBOL_GPL(register_fprobe_ips); @@ -201,10 +295,20 @@ int unregister_fprobe(struct fprobe *fp) if (!fp || fp->ops.func != fprobe_handler) return -EINVAL; + /* + * rethook_free() starts disabling the rethook, but the rethook handlers + * may be running on other processors at this point. To make sure that all + * current running handlers are finished, call unregister_ftrace_function() + * after this. + */ + if (fp->rethook) + rethook_free(fp->rethook); + ret = unregister_ftrace_function(&fp->ops); + if (ret < 0) + return ret; - if (!ret) - ftrace_free_filter(&fp->ops); + ftrace_free_filter(&fp->ops); return ret; } -- cgit v1.2.3 From ab51e15d535e07be9839e0df056a4ebe9c5bac83 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 15 Mar 2022 23:02:11 +0900 Subject: fprobe: Introduce FPROBE_FL_KPROBE_SHARED flag for fprobe Introduce the FPROBE_FL_KPROBE_SHARED flag for sharing an fprobe callback with kprobes safely from the viewpoint of recursion. Since the recursion safety of the fprobe (and ftrace) is a bit different from that of kprobes, this may cause an issue if the user wants to run the same code from the fprobe and the kprobes. The kprobes infrastructure has a per-cpu 'current_kprobe' variable which protects the kprobe handler from recursion in any case. On the other hand, the fprobe uses only ftrace_test_recursion_trylock(), which allows interrupt context to call another (or the same) fprobe while the fprobe user handler is running. This does not matter if the common callback shared among the kprobes and the fprobe has its own recursion detection, or if it can handle recursion in the different contexts (normal/interrupt/NMI.) But if it relies on the 'current_kprobe' recursion lock, it has to check kprobe_running() and use the kprobe_busy_*() APIs. Fprobe has the FPROBE_FL_KPROBE_SHARED flag to do this. If your common callback code will be shared with kprobes, please set FPROBE_FL_KPROBE_SHARED *before* registering the fprobe, like: fprobe.flags = FPROBE_FL_KPROBE_SHARED; register_fprobe(&fprobe, "func*", NULL); This will protect your common callback from nested calls.
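As a hedged sketch (the my_* names are illustrative), a callback body shared between a kprobe pre-handler and an fprobe entry handler would be wired up like:

	static void my_common_body(unsigned long ip, struct pt_regs *regs)
	{
		/* shared logic that relies on 'current_kprobe' recursion
		 * protection, e.g. via kprobe_running() */
	}

	static void my_fprobe_entry(struct fprobe *fp, unsigned long entry_ip,
				    struct pt_regs *regs)
	{
		my_common_body(entry_ip, regs);
	}

	static struct fprobe my_fprobe = {
		.entry_handler	= my_fprobe_entry,
		/* must be set before register_fprobe*() */
		.flags		= FPROBE_FL_KPROBE_SHARED,
	};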
Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735293127.1084943.15687374237275817599.stgit@devnote2 --- include/linux/fprobe.h | 12 ++++++++++++ include/linux/kprobes.h | 3 +++ kernel/trace/fprobe.c | 19 ++++++++++++++++++- 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 8eefec2b485e..1c2bde0ead73 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -34,13 +34,25 @@ struct fprobe { void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs); }; +/* This fprobe is soft-disabled. */ #define FPROBE_FL_DISABLED 1 +/* + * This fprobe handler will be shared with kprobes. + * This flag must be set before registering. + */ +#define FPROBE_FL_KPROBE_SHARED 2 + static inline bool fprobe_disabled(struct fprobe *fp) { return (fp) ? fp->flags & FPROBE_FL_DISABLED : false; } +static inline bool fprobe_shared_with_kprobes(struct fprobe *fp) +{ + return (fp) ? fp->flags & FPROBE_FL_KPROBE_SHARED : false; +} + #ifdef CONFIG_FPROBE int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter); int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 19b884353b15..5f1859836deb 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -427,6 +427,9 @@ static inline struct kprobe *kprobe_running(void) { return NULL; } +#define kprobe_busy_begin() do {} while (0) +#define kprobe_busy_end() do {} while (0) + static inline int register_kprobe(struct kprobe *p) { return -EOPNOTSUPP; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 38073632bfe4..8b2dd5b9dcd1 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -56,6 +56,20 @@ out: } NOKPROBE_SYMBOL(fprobe_handler); +static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct ftrace_regs *fregs) +{ + struct fprobe *fp = container_of(ops, struct fprobe, ops); + + if (unlikely(kprobe_running())) { + fp->nmissed++; + return; + } + kprobe_busy_begin(); + fprobe_handler(ip, parent_ip, ops, fregs); + kprobe_busy_end(); +} + static void fprobe_exit_handler(struct rethook_node *rh, void *data, struct pt_regs *regs) { @@ -110,7 +124,10 @@ error: static void fprobe_init(struct fprobe *fp) { fp->nmissed = 0; - fp->ops.func = fprobe_handler; + if (fprobe_shared_with_kprobes(fp)) + fp->ops.func = fprobe_kprobe_handler; + else + fp->ops.func = fprobe_handler; fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS; } -- cgit v1.2.3 From a0019cd7d41a191253859349535d76ee28ab7d96 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2022 13:24:07 +0100 Subject: lib/sort: Add priv pointer to swap function Adding support to have priv pointer in swap callback function. Following the initial change on cmp callback functions [1] and adding SWAP_WRAPPER macro to identify sort call of sort_r. 
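A hedged example of the new signature (the index-sort scenario is illustrative): sorting an index array by keys kept in a side table, where both callbacks reach the table through @priv:

	struct idx_sort_ctx {
		const int *keys;
	};

	static int idx_cmp(const void *a, const void *b, const void *priv)
	{
		const struct idx_sort_ctx *ctx = priv;

		return ctx->keys[*(const int *)a] - ctx->keys[*(const int *)b];
	}

	static void idx_swap(void *a, void *b, int size, const void *priv)
	{
		swap(*(int *)a, *(int *)b);	/* @priv is available if needed */
	}

	/* sort_r(idx, nr, sizeof(*idx), idx_cmp, idx_swap, &ctx); */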
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20220316122419.933957-2-jolsa@kernel.org [1] 4333fb96ca10 ("media: lib/sort.c: implement sort() variant taking context argument") --- include/linux/sort.h | 2 +- include/linux/types.h | 1 + lib/sort.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 32 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sort.h b/include/linux/sort.h index b5898725fe9d..e163287ac6c1 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -6,7 +6,7 @@ void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, - swap_func_t swap_func, + swap_r_func_t swap_func, const void *priv); void sort(void *base, size_t num, size_t size, diff --git a/include/linux/types.h b/include/linux/types.h index ac825ad90e44..ea8cf60a8a79 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -226,6 +226,7 @@ struct callback_head { typedef void (*rcu_callback_t)(struct rcu_head *head); typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func); +typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv); typedef void (*swap_func_t)(void *a, void *b, int size); typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); diff --git a/lib/sort.c b/lib/sort.c index aa18153864d2..b399bf10d675 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -122,16 +122,27 @@ static void swap_bytes(void *a, void *b, size_t n) * a pointer, but small integers make for the smallest compare * instructions. */ -#define SWAP_WORDS_64 (swap_func_t)0 -#define SWAP_WORDS_32 (swap_func_t)1 -#define SWAP_BYTES (swap_func_t)2 +#define SWAP_WORDS_64 (swap_r_func_t)0 +#define SWAP_WORDS_32 (swap_r_func_t)1 +#define SWAP_BYTES (swap_r_func_t)2 +#define SWAP_WRAPPER (swap_r_func_t)3 + +struct wrapper { + cmp_func_t cmp; + swap_func_t swap; +}; /* * The function pointer is last to make tail calls most efficient if the * compiler decides not to inline this function. 
*/ -static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func) +static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv) { + if (swap_func == SWAP_WRAPPER) { + ((const struct wrapper *)priv)->swap(a, b, (int)size); + return; + } + if (swap_func == SWAP_WORDS_64) swap_words_64(a, b, size); else if (swap_func == SWAP_WORDS_32) @@ -139,7 +150,7 @@ static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func) else if (swap_func == SWAP_BYTES) swap_bytes(a, b, size); else - swap_func(a, b, (int)size); + swap_func(a, b, (int)size, priv); } #define _CMP_WRAPPER ((cmp_r_func_t)0L) @@ -147,7 +158,7 @@ static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func) static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv) { if (cmp == _CMP_WRAPPER) - return ((cmp_func_t)(priv))(a, b); + return ((const struct wrapper *)priv)->cmp(a, b); return cmp(a, b, priv); } @@ -198,7 +209,7 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size) */ void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, - swap_func_t swap_func, + swap_r_func_t swap_func, const void *priv) { /* pre-scale counters for performance */ @@ -208,6 +219,10 @@ void sort_r(void *base, size_t num, size_t size, if (!a) /* num < 2 || size == 0 */ return; + /* called from 'sort' without swap function, let's pick the default */ + if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap) + swap_func = NULL; + if (!swap_func) { if (is_aligned(base, size, 8)) swap_func = SWAP_WORDS_64; @@ -230,7 +245,7 @@ void sort_r(void *base, size_t num, size_t size, if (a) /* Building heap: sift down --a */ a -= size; else if (n -= size) /* Sorting: Extract root to --n */ - do_swap(base, base + n, size, swap_func); + do_swap(base, base + n, size, swap_func, priv); else /* Sort complete */ break; @@ -257,7 +272,7 @@ void sort_r(void *base, size_t num, size_t size, c = b; /* Where "a" belongs */ while (b != a) { /* Shift it into place */ b = parent(b, lsbit, size); - do_swap(base + b, base + c, size, swap_func); + do_swap(base + b, base + c, size, swap_func, priv); } } } @@ -267,6 +282,11 @@ void sort(void *base, size_t num, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) { - return sort_r(base, num, size, _CMP_WRAPPER, swap_func, cmp_func); + struct wrapper w = { + .cmp = cmp_func, + .swap = swap_func, + }; + + return sort_r(base, num, size, _CMP_WRAPPER, SWAP_WRAPPER, &w); } EXPORT_SYMBOL(sort); -- cgit v1.2.3 From 0dcac272540613d41c05e89679e4ddb978b612f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2022 13:24:09 +0100 Subject: bpf: Add multi kprobe link Add a new link type, BPF_LINK_TYPE_KPROBE_MULTI, that attaches a kprobe program through the fprobe API. The fprobe API allows attaching probes to multiple functions at once very quickly, because it works on top of ftrace. On the other hand, this limits the probe point to the function entry or return. The kprobe program gets the same pt_regs input ctx as when it's attached through the perf API. Add a new attach type, BPF_TRACE_KPROBE_MULTI, that allows attaching the kprobe program to multiple functions with the new link. The user provides an array of addresses or symbols, with a count, to attach the kprobe program to. The new link_create uapi interface looks like:

    struct {
            __u32 flags;
            __u32 cnt;
            __aligned_u64 syms;
            __aligned_u64 addrs;
    } kprobe_multi;

The flags field accepts a single bit, BPF_F_KPROBE_MULTI_RETURN, which creates a return (kretprobe-style) multi kprobe.
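For illustration only (not part of the patch), attaching through the raw bpf(2) syscall could look roughly like the sketch below, assuming prog_fd refers to a loaded BPF_PROG_TYPE_KPROBE program with expected_attach_type BPF_TRACE_KPROBE_MULTI; the symbol names are hypothetical:

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int attach_kprobe_multi(int prog_fd)
    {
            static const char *syms[] = { "ksys_read", "ksys_write" };
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.link_create.prog_fd = prog_fd;
            attr.link_create.attach_type = BPF_TRACE_KPROBE_MULTI;
            attr.link_create.kprobe_multi.cnt = 2;
            /* exactly one of syms/addrs may be set */
            attr.link_create.kprobe_multi.syms = (__u64)(unsigned long)syms;
            /* for a return probe, also set:
             * attr.link_create.kprobe_multi.flags = BPF_F_KPROBE_MULTI_RETURN;
             */
            return syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
    }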
Signed-off-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220316122419.933957-4-jolsa@kernel.org --- include/linux/bpf_types.h | 1 + include/linux/trace_events.h | 7 ++ include/uapi/linux/bpf.h | 13 +++ kernel/bpf/syscall.c | 26 ++++- kernel/trace/bpf_trace.c | 211 +++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 13 +++ 6 files changed, 266 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 48a91c51c015..3e24ad0c4b3c 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -140,3 +140,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #ifdef CONFIG_PERF_EVENTS BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) #endif +BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index dcea51fb60e2..8f0e9e7cb493 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -15,6 +15,7 @@ struct array_buffer; struct tracer; struct dentry; struct bpf_prog; +union bpf_attr; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, @@ -738,6 +739,7 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -779,6 +781,11 @@ static inline int bpf_get_perf_event_info(const struct perf_event *event, { return -EOPNOTSUPP; } +static inline int +bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif enum { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 99fab54ae9c0..d77f47af7752 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -997,6 +997,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, __MAX_BPF_ATTACH_TYPE }; @@ -1011,6 +1012,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, MAX_BPF_LINK_TYPE, }; @@ -1118,6 +1120,11 @@ enum bpf_link_type { */ #define BPF_F_XDP_HAS_FRAGS (1U << 5) +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. 
+ */ +#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * @@ -1475,6 +1482,12 @@ union bpf_attr { */ __u64 bpf_cookie; } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + } kprobe_multi; }; } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9beb585be5a6..b8bb67ee6c57 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -32,6 +32,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -3022,6 +3023,11 @@ out_put_file: fput(perf_file); return err; } +#else +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_PERF_EVENTS */ #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -4255,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len +#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.addrs static int link_create(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type ptype; @@ -4279,7 +4285,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; case BPF_PROG_TYPE_PERF_EVENT: - case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: if (attr->link_create.attach_type != BPF_PERF_EVENT) { ret = -EINVAL; @@ -4287,6 +4292,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) } ptype = prog->type; break; + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type != BPF_PERF_EVENT && + attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { @@ -4318,13 +4331,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; #endif -#ifdef CONFIG_PERF_EVENTS case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: - case BPF_PROG_TYPE_KPROBE: ret = bpf_perf_link_attach(attr, prog); break; -#endif + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type == BPF_PERF_EVENT) + ret = bpf_perf_link_attach(attr, prog); + else + ret = bpf_kprobe_multi_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a2024ba32a20..fffa2171fae4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -2181,3 +2182,213 @@ static int __init bpf_event_init(void) fs_initcall(bpf_event_init); #endif /* CONFIG_MODULES */ + +#ifdef CONFIG_FPROBE +struct bpf_kprobe_multi_link { + struct bpf_link link; + struct fprobe fp; + unsigned long *addrs; +}; + +static void bpf_kprobe_multi_link_release(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + unregister_fprobe(&kmulti_link->fp); +} + +static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + kvfree(kmulti_link->addrs); + kfree(kmulti_link); +} + +static 
const struct bpf_link_ops bpf_kprobe_multi_link_lops = { + .release = bpf_kprobe_multi_link_release, + .dealloc = bpf_kprobe_multi_link_dealloc, +}; + +static int +kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link, + struct pt_regs *regs) +{ + int err; + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + err = 0; + goto out; + } + + migrate_disable(); + rcu_read_lock(); + err = bpf_prog_run(link->link.prog, regs); + rcu_read_unlock(); + migrate_enable(); + + out: + __this_cpu_dec(bpf_prog_active); + return err; +} + +static void +kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip, + struct pt_regs *regs) +{ + unsigned long saved_ip = instruction_pointer(regs); + struct bpf_kprobe_multi_link *link; + + /* + * Because fprobe's regs->ip is set to the next instruction of + * dynamic-ftrace instruction, correct entry ip must be set, so + * that the bpf program can access entry address via regs as same + * as kprobes. + * + * Both kprobe and kretprobe see the entry ip of traced function + * as instruction pointer. + */ + instruction_pointer_set(regs, entry_ip); + + link = container_of(fp, struct bpf_kprobe_multi_link, fp); + kprobe_multi_link_prog_run(link, regs); + + instruction_pointer_set(regs, saved_ip); +} + +static int +kprobe_multi_resolve_syms(const void *usyms, u32 cnt, + unsigned long *addrs) +{ + unsigned long addr, size; + const char **syms; + int err = -ENOMEM; + unsigned int i; + char *func; + + size = cnt * sizeof(*syms); + syms = kvzalloc(size, GFP_KERNEL); + if (!syms) + return -ENOMEM; + + func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!func) + goto error; + + if (copy_from_user(syms, usyms, size)) { + err = -EFAULT; + goto error; + } + + for (i = 0; i < cnt; i++) { + err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN); + if (err == KSYM_NAME_LEN) + err = -E2BIG; + if (err < 0) + goto error; + err = -EINVAL; + addr = kallsyms_lookup_name(func); + if (!addr) + goto error; + if (!kallsyms_lookup_size_offset(addr, &size, NULL)) + goto error; + addr = ftrace_location_range(addr, addr + size - 1); + if (!addr) + goto error; + addrs[i] = addr; + } + + err = 0; +error: + kvfree(syms); + kfree(func); + return err; +} + +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_kprobe_multi_link *link = NULL; + struct bpf_link_primer link_primer; + unsigned long *addrs; + u32 flags, cnt, size; + void __user *uaddrs; + void __user *usyms; + int err; + + /* no support for 32bit archs yet */ + if (sizeof(u64) != sizeof(void *)) + return -EOPNOTSUPP; + + if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI) + return -EINVAL; + + flags = attr->link_create.kprobe_multi.flags; + if (flags & ~BPF_F_KPROBE_MULTI_RETURN) + return -EINVAL; + + uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs); + usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms); + if (!!uaddrs == !!usyms) + return -EINVAL; + + cnt = attr->link_create.kprobe_multi.cnt; + if (!cnt) + return -EINVAL; + + size = cnt * sizeof(*addrs); + addrs = kvmalloc(size, GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + if (uaddrs) { + if (copy_from_user(addrs, uaddrs, size)) { + err = -EFAULT; + goto error; + } + } else { + err = kprobe_multi_resolve_syms(usyms, cnt, addrs); + if (err) + goto error; + } + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) { + err = -ENOMEM; + goto error; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, + &bpf_kprobe_multi_link_lops, prog); + + err = bpf_link_prime(&link->link, 
&link_primer); + if (err) + goto error; + + if (flags & BPF_F_KPROBE_MULTI_RETURN) + link->fp.exit_handler = kprobe_multi_link_handler; + else + link->fp.entry_handler = kprobe_multi_link_handler; + + link->addrs = addrs; + + err = register_fprobe_ips(&link->fp, addrs, cnt); + if (err) { + bpf_link_cleanup(&link_primer); + return err; + } + + return bpf_link_settle(&link_primer); + +error: + kfree(link); + kvfree(addrs); + return err; +} +#else /* !CONFIG_FPROBE */ +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} +#endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 99fab54ae9c0..d77f47af7752 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -997,6 +997,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, __MAX_BPF_ATTACH_TYPE }; @@ -1011,6 +1012,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, MAX_BPF_LINK_TYPE, }; @@ -1118,6 +1120,11 @@ enum bpf_link_type { */ #define BPF_F_XDP_HAS_FRAGS (1U << 5) +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. + */ +#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * @@ -1475,6 +1482,12 @@ union bpf_attr { */ __u64 bpf_cookie; } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + } kprobe_multi; }; } link_create; -- cgit v1.2.3 From 046e1537a3cf0adc68fe865b5dc9a7e731cc63b3 Mon Sep 17 00:00:00 2001 From: Íñigo Huguet Date: Tue, 15 Mar 2022 10:18:32 +0100 Subject: net: set default rss queues num to physical cores / 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Network drivers can call netif_get_num_default_rss_queues() to get the default number of receive queues to use. Right now, this default number is min(8, num_online_cpus()). Instead, as suggested by Jakub, use the number of physical cores divided by 2 as a way to avoid wasting CPU resources and to avoid using both SMT threads of the same core, while still allowing scaling for high-end processors with many cores. As an exception, select 2 queues for processors with 2 cores, because otherwise they won't take any advantage of RSS despite being SMP capable. Tested: Processor Intel Xeon E5-2620 (2 sockets, 6 cores/socket, 2 threads/core). NIC Broadcom NetXtreme II BCM57810 (10GBps). Ran some tests with `perf stat iperf3 -R`, with parallelisms of 1, 8 and 24, getting the following results: - Number of queues: 6 (instead of 8) - Network throughput: not affected - CPU usage: utilized 0.05-0.12 CPUs more than before (having 24 CPUs this is only 0.2-0.5% higher) - Reduced the number of context switches by 7-50%, being more noticeable when using a higher number of parallel threads.
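As a usage sketch (driver and variable names are hypothetical, not from the patch), a multiqueue driver would typically clamp this default against its hardware limit:

    #include <linux/netdevice.h>

    static int example_setup_rx_queues(struct net_device *netdev, int hw_max_rxqs)
    {
            int want = netif_get_num_default_rss_queues();

            /* never ask for more queues than the NIC supports */
            return netif_set_real_num_rx_queues(netdev, min_t(int, want, hw_max_rxqs));
    }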
Suggested-by: Jakub Kicinski Signed-off-by: Íñigo Huguet Link: https://lore.kernel.org/r/20220315091832.13873-1-ihuguet@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - net/core/dev.c | 20 ++++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8cbe96ce0a2c..e01a8ce7181f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3664,7 +3664,6 @@ static inline unsigned int get_netdev_rx_queue_index( } #endif -#define DEFAULT_MAX_NUM_RSS_QUEUES (8) int netif_get_num_default_rss_queues(void); enum skb_free_reason { diff --git a/net/core/dev.c b/net/core/dev.c index 75bab5b0dbae..8e0cc5f2020d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2990,13 +2990,25 @@ EXPORT_SYMBOL(netif_set_real_num_queues); /** * netif_get_num_default_rss_queues - default number of RSS queues * - * This routine should set an upper limit on the number of RSS queues - * used by default by multiqueue devices. + * Default value is the number of physical cores if there are only 1 or 2, or + * divided by 2 if there are more. */ int netif_get_num_default_rss_queues(void) { - return is_kdump_kernel() ? - 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); + cpumask_var_t cpus; + int cpu, count = 0; + + if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL))) + return 1; + + cpumask_copy(cpus, cpu_online_mask); + for_each_cpu(cpu, cpus) { + ++count; + cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu)); + } + free_cpumask_var(cpus); + + return count > 2 ? DIV_ROUND_UP(count, 2) : count; } EXPORT_SYMBOL(netif_get_num_default_rss_queues); -- cgit v1.2.3 From b00fa38a9c1cba044a32a601b49a55a18ed719d1 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 17 Mar 2022 21:55:52 -0700 Subject: bpf: Enable non-atomic allocations in local storage Currently, local storage memory can only be allocated atomically (GFP_ATOMIC). This restriction is too strict for sleepable bpf programs. In this patch, the verifier detects whether the program is sleepable, and passes the corresponding GFP_KERNEL or GFP_ATOMIC flag as a 5th argument to bpf_task/sk/inode_storage_get. This flag will propagate down to the local storage functions that allocate memory. Please note that bpf_task/sk/inode_storage_update_elem functions are invoked by userspace applications through syscalls. Preemption is disabled before bpf_task/sk/inode_storage_update_elem is called, which means they will always have to allocate memory atomically. 
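For illustration only (not part of the patch), a sleepable BPF program using task local storage might look like this sketch; map, section, and program names are hypothetical:

    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char LICENSE[] SEC("license") = "GPL";

    struct {
            __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
            __uint(map_flags, BPF_F_NO_PREALLOC);
            __type(key, int);
            __type(value, long);
    } task_map SEC(".maps");

    SEC("lsm.s/file_open") /* sleepable program type */
    int BPF_PROG(file_open, struct file *file)
    {
            long *val;

            /* In a sleepable program the verifier patches in GFP_KERNEL as
             * the hidden fifth argument; otherwise it uses GFP_ATOMIC.
             */
            val = bpf_task_storage_get(&task_map, bpf_get_current_task_btf(),
                                       NULL, BPF_LOCAL_STORAGE_GET_F_CREATE);
            if (val)
                    __sync_fetch_and_add(val, 1);
            return 0;
    }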
Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220318045553.3091807-2-joannekoong@fb.com --- include/linux/bpf_local_storage.h | 7 +++-- kernel/bpf/bpf_inode_storage.c | 9 +++--- kernel/bpf/bpf_local_storage.c | 58 +++++++++++++++++++++++++-------------- kernel/bpf/bpf_task_storage.c | 10 ++++--- kernel/bpf/verifier.c | 20 ++++++++++++++ net/core/bpf_sk_storage.c | 21 ++++++++------ 6 files changed, 84 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 37b3906af8b1..493e63258497 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -154,16 +154,17 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool charge_mem); + bool charge_mem, gfp_t gfp_flags); int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem); + struct bpf_local_storage_elem *first_selem, + gfp_t gfp_flags); struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags); + void *value, u64 map_flags, gfp_t gfp_flags); void bpf_local_storage_free_rcu(struct rcu_head *rcu); diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index e29d9e3d853e..96be8d518885 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(f->f_inode, (struct bpf_local_storage_map *)map, - value, map_flags); + value, map_flags, GFP_ATOMIC); fput(f); return PTR_ERR_OR_ZERO(sdata); } @@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) return err; } -BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); return IS_ERR(sdata) ? 
(unsigned long)NULL : (unsigned long)sdata->data; } diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 092a1ac772d7..01aa2b51ec4d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem) + void *value, bool charge_mem, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; selem = bpf_map_kzalloc(&smap->map, smap->elem_size, - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (selem) { if (value) memcpy(SDATA(selem)->data, value, smap->map.value_size); @@ -282,7 +282,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata, int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem) + struct bpf_local_storage_elem *first_selem, + gfp_t gfp_flags) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; @@ -293,7 +294,7 @@ int bpf_local_storage_alloc(void *owner, return err; storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (!storage) { err = -ENOMEM; goto uncharge; @@ -350,10 +351,10 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags) + void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem; + struct bpf_local_storage_elem *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -365,6 +366,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, !map_value_has_spin_lock(&smap->map))) return ERR_PTR(-EINVAL); + if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) + return ERR_PTR(-EINVAL); + local_storage = rcu_dereference_check(*owner_storage(smap, owner), bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { @@ -373,11 +377,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true); + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); - err = bpf_local_storage_alloc(owner, smap, selem); + err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { kfree(selem); mem_uncharge(smap, owner, smap->elem_size); @@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } + if (gfp_flags == GFP_KERNEL) { + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!selem) + return ERR_PTR(-ENOMEM); + } + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ @@ -429,19 +439,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. 
Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). - */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata); - if (!selem) { - err = -ENOMEM; - goto unlock_err; + if (gfp_flags != GFP_KERNEL) { + /* local_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during + * bpf_selem_unlink_storage_nolock(). + */ + selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } } /* First, link the new selem to the map */ @@ -463,6 +475,10 @@ unlock: unlock_err: raw_spin_unlock_irqrestore(&local_storage->lock, flags); + if (selem) { + mem_uncharge(smap, owner, smap->elem_size); + kfree(selem); + } return ERR_PTR(err); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 5da7bed0f5f6..6638a0ecc3d2 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( - task, (struct bpf_local_storage_map *)map, value, map_flags); + task, (struct bpf_local_storage_map *)map, value, map_flags, + GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -226,8 +227,9 @@ out: return err; } -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); unlock: bpf_task_storage_unlock(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0287176bfe9a..6347dcdee1fd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13492,6 +13492,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } + if (insn->imm == BPF_FUNC_task_storage_get || + insn->imm == BPF_FUNC_sk_storage_get || + insn->imm == BPF_FUNC_inode_storage_get) { + if (env->prog->aux->sleepable) + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__s32)GFP_KERNEL); + else + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__s32)GFP_ATOMIC); + insn_buf[1] = *insn; + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. 
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index d9c37fd10809..7aff1206a851 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -141,7 +141,7 @@ static int bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key, if (sock) { sdata = bpf_local_storage_update( sock->sk, (struct bpf_local_storage_map *)map, value, - map_flags); + map_flags, GFP_ATOMIC); sockfd_put(sock); return PTR_ERR_OR_ZERO(sdata); } @@ -172,7 +172,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -230,7 +230,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) bpf_selem_link_map(smap, copy_selem); bpf_selem_link_storage_nolock(new_sk_storage, copy_selem); } else { - ret = bpf_local_storage_alloc(newsk, smap, copy_selem); + ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { kfree(copy_selem); atomic_sub(smap->elem_size, @@ -255,8 +255,9 @@ out: return ret; } -BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; @@ -277,7 +278,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, refcount_inc_not_zero(&sk->sk_refcnt)) { sdata = bpf_local_storage_update( sk, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); /* sk must be a fullsock (guaranteed by verifier), * so sock_gen_put() is unnecessary. */ @@ -417,14 +418,16 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return false; } -BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags, gfp_t, gfp_flags) { WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return (unsigned long)NULL; - return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags, + gfp_flags); } BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, -- cgit v1.2.3 From 351bdbb6419ca988802882badadc321d384d0254 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 21 Mar 2022 10:22:37 +0100 Subject: net: Revert the softirq will run annotation in ____napi_schedule(). The lockdep annotation lockdep_assert_softirq_will_run() expects that either hard or soft interrupts are disabled, because both guarantee that the "raised" soft-interrupts will be processed once the context is left. This triggers in flush_smp_call_function_from_idle(), but in this case it explicitly calls do_softirq() when softirqs are pending. Revert the "softirq will run" annotation in ____napi_schedule() and move the check back to __netif_rx() as it was. Keep the IRQ-off assert in ____napi_schedule() because this is always required. Fixes: fbd9a2ceba5c7 ("net: Add lockdep asserts to ____napi_schedule().") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Jason A.
Donenfeld Link: https://lore.kernel.org/r/YjhD3ZKWysyw8rc6@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/lockdep.h | 7 ------- net/core/dev.c | 3 +-- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 0cc65d216701..467b94257105 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -329,12 +329,6 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_assert_none_held_once() \ lockdep_assert_once(!current->lockdep_depth) -/* - * Ensure that softirq is handled within the callchain and not delayed and - * handled by chance. - */ -#define lockdep_assert_softirq_will_run() \ - lockdep_assert_once(hardirq_count() | softirq_count()) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) @@ -420,7 +414,6 @@ extern int lockdep_is_held(const void *); #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_assert_none_held_once() do { } while (0) -#define lockdep_assert_softirq_will_run() do { } while (0) #define lockdep_recursing(tsk) (0) diff --git a/net/core/dev.c b/net/core/dev.c index 8e0cc5f2020d..8a5109479dbe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4277,7 +4277,6 @@ static inline void ____napi_schedule(struct softnet_data *sd, { struct task_struct *thread; - lockdep_assert_softirq_will_run(); lockdep_assert_irqs_disabled(); if (test_bit(NAPI_STATE_THREADED, &napi->state)) { @@ -4887,7 +4886,7 @@ int __netif_rx(struct sk_buff *skb) { int ret; - lockdep_assert_softirq_will_run(); + lockdep_assert_once(hardirq_count() | softirq_count()); trace_netif_rx_entry(skb); ret = netif_rx_internal(skb); -- cgit v1.2.3 From 4a0cb83ba6e0cd73a50fa4f84736846bf0029f2b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 21 Mar 2022 22:10:53 -0700 Subject: netdevice: add missing dm_private kdoc Building htmldocs complains: include/linux/netdevice.h:2295: warning: Function parameter or member 'dm_private' not described in 'net_device' Fixes: b26ef81c46ed ("drop_monitor: remove quadratic behavior") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220322051053.1883186-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e01a8ce7181f..cd7a597c55b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1899,6 +1899,8 @@ enum netdev_ml_priv_type { * @garp_port: GARP * @mrp_port: MRP * + * @dm_private: Drop monitor private + * * @dev: Class/net/name entry * @sysfs_groups: Space for optional device, statistics and wireless * sysfs groups -- cgit v1.2.3 From 2af7e566a8616c278e1d7287ce86cd3900bed943 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 22 Mar 2022 10:22:24 -0700 Subject: net/mlx5e: Fix build warning, detected write beyond size of field When merged with Linus tree, the cited patch below will cause the following build warning: In function 'fortify_memset_chk', inlined from 'mlx5e_xmit_xdp_frame' at drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c:438:3: include/linux/fortify-string.h:242:25: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? 
[-Werror=attribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix that by grouping the fields to be memset in struct_group() to avoid the false alarm. Fixes: 9ded70fa1d81 ("net/mlx5e: Don't prefill WQEs in XDP SQ in the multi buffer mode") Reported-by: Stephen Rothwell Suggested-by: Stephen Rothwell Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20220322172224.31849-1-saeed@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 3 +-- include/linux/mlx5/qp.h | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index f35b62ce4c07..8f321a6c0809 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -435,8 +435,7 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, u8 num_pkts = 1 + num_frags; int i; - memset(&cseg->signature, 0, sizeof(*cseg) - - sizeof(cseg->opmod_idx_opcode) - sizeof(cseg->qpn_ds)); + memset(&cseg->trailer, 0, sizeof(cseg->trailer)); memset(eseg, 0, sizeof(*eseg) - sizeof(eseg->trailer)); eseg->inline_hdr.sz = cpu_to_be16(inline_hdr_sz); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 61e48d459b23..8bda3ba5b109 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -202,6 +202,9 @@ struct mlx5_wqe_fmr_seg { struct mlx5_wqe_ctrl_seg { __be32 opmod_idx_opcode; __be32 qpn_ds; + + struct_group(trailer, + u8 signature; u8 rsvd[2]; u8 fm_ce_se; @@ -211,6 +214,8 @@ struct mlx5_wqe_ctrl_seg { __be32 umr_mkey; __be32 tis_tir_num; }; + + ); /* end of trailer group */ }; #define MLX5_WQE_CTRL_DS_MASK 0x3f -- cgit v1.2.3
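As a closing illustration (hypothetical struct, not from the patch), struct_group() gives a sub-range of fields its own name, so a single memset over that range is provably in-bounds under CONFIG_FORTIFY_SOURCE:

    #include <linux/stddef.h> /* struct_group() */
    #include <linux/string.h>
    #include <linux/types.h>

    struct example_seg {
            __be32 opcode;
            struct_group(trailer, /* fields that are cleared together */
                    u8 signature;
                    u8 rsvd[3];
            );
    };

    static void example_clear_trailer(struct example_seg *seg)
    {
            /* Fortify sees the named group, so this memset is in-bounds. */
            memset(&seg->trailer, 0, sizeof(seg->trailer));
    }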