From 2463e073497385ef63c220571013a2b89e9b95cc Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 18 Feb 2021 20:49:41 +0000 Subject: netdevice: Add missing IFF_PHONY_HEADROOM self-definition This is harmless for now, but can be fatal for future refactors. Fixes: 871b642adebe3 ("netdev: introduce ndo_set_rx_headroom") Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210218204908.5455-2-alobakin@pm.me --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ddf4cfc12615..3b6f82c2c271 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1577,6 +1577,7 @@ enum netdev_priv_flags { #define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE #define IFF_TEAM IFF_TEAM #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED +#define IFF_PHONY_HEADROOM IFF_PHONY_HEADROOM #define IFF_MACSEC IFF_MACSEC #define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER #define IFF_FAILOVER IFF_FAILOVER -- cgit v1.2.3 From c2ff53d8049f30098153cd2d1299a44d7b124c57 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Thu, 18 Feb 2021 20:50:02 +0000 Subject: net: Add priv_flags for allow tx skb without linear In some cases, we want to construct an skb directly on top of existing memory without copying data. In this case, the page is placed directly in the skb, and the linear space of the skb is empty. Unfortunately, many network cards do not support this operation.
For example Mellanox Technologies MT27710 Family [ConnectX-4 Lx] will get the following error message: mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2 WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb So a priv_flag is added here to indicate whether the network card supports this feature. Suggested-by: Alexander Lobakin Signed-off-by: Xuan Zhuo Signed-off-by: Alexander Lobakin Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210218204908.5455-3-alobakin@pm.me --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3b6f82c2c271..6cef47b76cc6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1518,6 +1518,8 @@ struct net_device_ops { * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running + * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with + * skb_headlen(skb) == 0 (data starts from frag0) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1551,6 +1553,7 @@ enum netdev_priv_flags { IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, + IFF_TX_SKB_NO_LINEAR = 1<<31, }; 
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1584,6 +1587,7 @@ enum netdev_priv_flags { #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER #define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK +#define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR /** * struct net_device - The DEVICE structure. -- cgit v1.2.3 From a10787e6d58c24b51e91c19c6d16c5da89fcaa4b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:14 -0800 Subject: bpf: Enable task local storage for tracing programs To access per-task data, BPF programs usually create a hash table with pid as the key. This is not ideal because: 1. The user needs to estimate the proper size of the hash table, which may be inaccurate; 2. Big hash tables are slow; 3. To clean up the data properly during task termination, the user needs to write extra logic. Task local storage overcomes these issues and offers a better option for such per-task data. Task local storage is only available to BPF_LSM. Now enable it for tracing programs. Unlike LSM programs, tracing programs can be called in IRQ contexts. Helpers that access task local storage are updated to use raw_spin_lock_irqsave() instead of raw_spin_lock_bh(). Tracing programs can attach to functions on the task free path, e.g. exit_creds(). To avoid allocating task local storage after bpf_task_storage_free(), bpf_task_storage_get() is updated to not allocate new storage when the task is not refcounted (task->usage == 0).
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210225234319.336131-2-songliubraving@fb.com --- include/linux/bpf.h | 7 +++++++ include/linux/bpf_lsm.h | 22 --------------------- include/linux/bpf_types.h | 2 +- include/linux/sched.h | 5 +++++ kernel/bpf/Makefile | 3 +-- kernel/bpf/bpf_local_storage.c | 28 ++++++++++++++++----------- kernel/bpf/bpf_lsm.c | 4 ---- kernel/bpf/bpf_task_storage.c | 43 +++++++++++------------------------------- kernel/fork.c | 5 +++++ kernel/trace/bpf_trace.c | 4 ++++ 10 files changed, 51 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cccaef1088ea..e2cfc4809219 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1499,6 +1499,7 @@ struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1684,6 +1685,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } + +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -1886,6 +1891,8 @@ extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 0d1c33ace398..479c101546ad 
100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -38,21 +38,9 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } -static inline struct bpf_storage_blob *bpf_task( - const struct task_struct *task) -{ - if (unlikely(!task->security)) - return NULL; - - return task->security + bpf_lsm_blob_sizes.lbs_task; -} - extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; -extern const struct bpf_func_proto bpf_task_storage_get_proto; -extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); -void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -73,20 +61,10 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } -static inline struct bpf_storage_blob *bpf_task( - const struct task_struct *task) -{ - return NULL; -} - static inline void bpf_inode_storage_free(struct inode *inode) { } -static inline void bpf_task_storage_free(struct task_struct *task) -{ -} - #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 99f7fd657d87..b9edee336d80 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,8 +109,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d568288abf9..e5fbf8e6952a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,6 +42,7 @@ struct audit_context; struct backing_dev_info; 
struct bio_list; struct blk_plug; +struct bpf_local_storage; struct capture_control; struct cfs_rq; struct fs_struct; @@ -1348,6 +1349,10 @@ struct task_struct { /* Used by LSM modules for access restriction: */ void *security; #endif +#ifdef CONFIG_BPF_SYSCALL + /* Used by BPF task local storage */ + struct bpf_local_storage __rcu *bpf_storage; +#endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index d1249340fd6b..7f33098ca63f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,8 +9,8 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o -obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o @@ -18,7 +18,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o -obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o obj-$(CONFIG_BPF_SYSCALL) += offload.o obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index dd5aedee99e7..9bd47ad2b26f 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -140,17 +140,18 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) { struct bpf_local_storage *local_storage; bool free_local_storage = false; + unsigned long flags; if (unlikely(!selem_linked_to_storage(selem))) /* selem has already been unlinked 
from sk */ return; local_storage = rcu_dereference(selem->local_storage); - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, true); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) kfree_rcu(local_storage, rcu); @@ -167,6 +168,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map *smap; struct bpf_local_storage_map_bucket *b; + unsigned long flags; if (unlikely(!selem_linked_to_map(selem))) /* selem has already be unlinked from smap */ @@ -174,21 +176,22 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) smap = rcu_dereference(SDATA(selem)->smap); b = select_bucket(smap, selem); - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) hlist_del_init_rcu(&selem->map_node); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem) { struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem); + unsigned long flags; - raw_spin_lock_bh(&b->lock); + raw_spin_lock_irqsave(&b->lock, flags); RCU_INIT_POINTER(SDATA(selem)->smap, smap); hlist_add_head_rcu(&selem->map_node, &b->list); - raw_spin_unlock_bh(&b->lock); + raw_spin_unlock_irqrestore(&b->lock, flags); } void bpf_selem_unlink(struct bpf_local_storage_elem *selem) @@ -224,16 +227,18 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, sdata = SDATA(selem); if (cacheit_lockit) { + unsigned long flags; + /* spinlock is needed to avoid racing with the * parallel delete. Otherwise, publishing an already * deleted sdata to the cache will become a use-after-free * problem in the next bpf_local_storage_lookup(). 
*/ - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); if (selem_linked_to_storage(selem)) rcu_assign_pointer(local_storage->cache[smap->cache_idx], sdata); - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); } return sdata; @@ -327,6 +332,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, struct bpf_local_storage_data *old_sdata = NULL; struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; + unsigned long flags; int err; /* BPF_EXIST and BPF_NOEXIST cannot be both set */ @@ -374,7 +380,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ if (unlikely(hlist_empty(&local_storage->list))) { @@ -428,11 +434,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } unlock: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return SDATA(selem); unlock_err: - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); return ERR_PTR(err); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 1622a44d1617..9829f381b51c 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -115,10 +115,6 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_task_storage_get: - return &bpf_task_storage_get_proto; - case BPF_FUNC_task_storage_delete: - return &bpf_task_storage_delete_proto; case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index e0da0258b732..baf3566e2323 100644 --- 
a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -24,12 +23,8 @@ DEFINE_BPF_STORAGE_CACHE(task_cache); static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) { struct task_struct *task = owner; - struct bpf_storage_blob *bsb; - bsb = bpf_task(task); - if (!bsb) - return NULL; - return &bsb->storage; + return &task->bpf_storage; } static struct bpf_local_storage_data * @@ -38,13 +33,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, { struct bpf_local_storage *task_storage; struct bpf_local_storage_map *smap; - struct bpf_storage_blob *bsb; - - bsb = bpf_task(task); - if (!bsb) - return NULL; - task_storage = rcu_dereference(bsb->storage); + task_storage = rcu_dereference(task->bpf_storage); if (!task_storage) return NULL; @@ -57,16 +47,12 @@ void bpf_task_storage_free(struct task_struct *task) struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_task_storage = false; - struct bpf_storage_blob *bsb; struct hlist_node *n; - - bsb = bpf_task(task); - if (!bsb) - return; + unsigned long flags; rcu_read_lock(); - local_storage = rcu_dereference(bsb->storage); + local_storage = rcu_dereference(task->bpf_storage); if (!local_storage) { rcu_read_unlock(); return; @@ -81,7 +67,7 @@ void bpf_task_storage_free(struct task_struct *task) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ - raw_spin_lock_bh(&local_storage->lock); + raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from * local_storage. 
@@ -90,7 +76,7 @@ void bpf_task_storage_free(struct task_struct *task) free_task_storage = bpf_selem_unlink_storage_nolock( local_storage, selem, false); } - raw_spin_unlock_bh(&local_storage->lock); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); rcu_read_unlock(); /* free_task_storage should always be true as long as @@ -150,7 +136,7 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); - if (!task || !task_storage_ptr(task)) { + if (!task) { err = -ENOENT; goto out; } @@ -213,23 +199,16 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; - /* explicitly check that the task_storage_ptr is not - * NULL as task_storage_lookup returns NULL in this case and - * bpf_local_storage_update expects the owner to have a - * valid storage pointer. - */ - if (!task || !task_storage_ptr(task)) + if (!task) return (unsigned long)NULL; sdata = task_storage_lookup(task, map, true); if (sdata) return (unsigned long)sdata->data; - /* This helper must only be called from places where the lifetime of the task - * is guaranteed. Either by being refcounted or by being protected - * by an RCU read-side critical section. 
- */ - if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + /* only allocate new storage, when the task is refcounted */ + if (refcount_read(&task->usage) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); diff --git a/kernel/fork.c b/kernel/fork.c index d66cd1014211..181604db2d65 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include @@ -734,6 +735,7 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); + bpf_task_storage_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -2062,6 +2064,9 @@ static __latent_entropy struct task_struct *copy_process( p->sequential_io = 0; p->sequential_io_avg = 0; #endif +#ifdef CONFIG_BPF_SYSCALL + RCU_INIT_POINTER(p->bpf_storage, NULL); +#endif /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0c45d923f0f..e9701744d8e4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1367,6 +1367,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; default: return NULL; } -- cgit v1.2.3 From bc235cdb423a2daed6f337676006a66557429cd1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:15 -0800 Subject: bpf: Prevent deadlock from recursive bpf_task_storage_[get|delete] BPF helpers bpf_task_storage_[get|delete] could hold two locks: bpf_local_storage_map_bucket->lock and bpf_local_storage->lock. 
Calling these helpers from fentry/fexit programs on functions in bpf_*_storage.c may cause deadlock on either lock. Prevent such deadlock with a per-CPU counter, bpf_task_storage_busy. We need this counter to be global, because the two locks here belong to two different objects: bpf_local_storage_map and bpf_local_storage. If we pick one of them as the owner of the counter, it is still possible to trigger deadlock on the other lock. For example, if bpf_local_storage_map owns the counter, it cannot prevent deadlock on bpf_local_storage->lock when two maps are used. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210225234319.336131-3-songliubraving@fb.com --- include/linux/bpf_local_storage.h | 3 +- kernel/bpf/bpf_inode_storage.c | 2 +- kernel/bpf/bpf_local_storage.c | 11 +++++++- kernel/bpf/bpf_task_storage.c | 59 +++++++++++++++++++++++++++++++++------ net/core/bpf_sk_storage.c | 2 +- 5 files changed, 65 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index b2c9463f36a1..b902c580c48d 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -126,7 +126,8 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, + int __percpu *busy_counter); int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf *btf, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6639640523c0..da753721457c 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -237,7 +237,7 @@ static void inode_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map;
bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static int inode_storage_map_btf_id; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 9bd47ad2b26f..b305270b7a4b 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -474,7 +474,8 @@ void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, spin_unlock(&cache->idx_lock); } -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, + int __percpu *busy_counter) { struct bpf_local_storage_elem *selem; struct bpf_local_storage_map_bucket *b; @@ -503,7 +504,15 @@ void bpf_local_storage_map_free(struct bpf_local_storage_map *smap) while ((selem = hlist_entry_safe( rcu_dereference_raw(hlist_first_rcu(&b->list)), struct bpf_local_storage_elem, map_node))) { + if (busy_counter) { + migrate_disable(); + __this_cpu_inc(*busy_counter); + } bpf_selem_unlink(selem); + if (busy_counter) { + __this_cpu_dec(*busy_counter); + migrate_enable(); + } cond_resched_rcu(); } rcu_read_unlock(); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index baf3566e2323..fd3c74ef608e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -20,6 +20,31 @@ DEFINE_BPF_STORAGE_CACHE(task_cache); +DEFINE_PER_CPU(int, bpf_task_storage_busy); + +static void bpf_task_storage_lock(void) +{ + migrate_disable(); + __this_cpu_inc(bpf_task_storage_busy); +} + +static void bpf_task_storage_unlock(void) +{ + __this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); +} + +static bool bpf_task_storage_trylock(void) +{ + migrate_disable(); + if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) { + __this_cpu_dec(bpf_task_storage_busy); + migrate_enable(); + return false; + } + return true; +} + static struct bpf_local_storage __rcu 
**task_storage_ptr(void *owner) { struct task_struct *task = owner; @@ -67,6 +92,7 @@ void bpf_task_storage_free(struct task_struct *task) * when unlinking elem from the local_storage->list and * the map's bucket->list. */ + bpf_task_storage_lock(); raw_spin_lock_irqsave(&local_storage->lock, flags); hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { /* Always unlink from map before unlinking from @@ -77,6 +103,7 @@ void bpf_task_storage_free(struct task_struct *task) local_storage, selem, false); } raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_task_storage_unlock(); rcu_read_unlock(); /* free_task_storage should always be true as long as @@ -109,7 +136,9 @@ static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); sdata = task_storage_lookup(task, map, true); + bpf_task_storage_unlock(); put_pid(pid); return sdata ? sdata->data : NULL; out: @@ -141,8 +170,10 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, goto out; } + bpf_task_storage_lock(); sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, map_flags); + bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); out: @@ -185,7 +216,9 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) goto out; } + bpf_task_storage_lock(); err = task_storage_delete(task, map); + bpf_task_storage_unlock(); out: put_pid(pid); return err; @@ -202,34 +235,44 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, if (!task) return (unsigned long)NULL; + if (!bpf_task_storage_trylock()) + return (unsigned long)NULL; + sdata = task_storage_lookup(task, map, true); if (sdata) - return (unsigned long)sdata->data; + goto unlock; /* only allocate new storage, when the task is refcounted */ if (refcount_read(&task->usage) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) { + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = 
bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST); - return IS_ERR(sdata) ? (unsigned long)NULL : - (unsigned long)sdata->data; - } - return (unsigned long)NULL; +unlock: + bpf_task_storage_unlock(); + return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; } BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { + int ret; + if (!task) return -EINVAL; + if (!bpf_task_storage_trylock()) + return -EBUSY; + /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. */ - return task_storage_delete(task, map); + ret = task_storage_delete(task, map); + bpf_task_storage_unlock(); + return ret; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -255,7 +298,7 @@ static void task_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, &bpf_task_storage_busy); } static int task_storage_map_btf_id; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4edd033e899c..cc3712ad8716 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) -- cgit v1.2.3 From 523a4cf491b3c9e2d546040d57250f1a0ca84f03 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Fri, 26 Feb 2021 00:26:29 +0400 Subject: bpf: Use MAX_BPF_FUNC_REG_ARGS macro Instead of using integer literal here and there use macro name for better context. 
Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210225202629.585485-1-me@ubique.spb.ru --- include/linux/bpf.h | 5 +++++ kernel/bpf/btf.c | 25 ++++++++++++++----------- kernel/bpf/verifier.c | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e2cfc4809219..ae2c35641619 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -506,6 +506,11 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_ARGS 12 +/* The maximum number of arguments passed through registers + * a single function may have. + */ +#define MAX_BPF_FUNC_REG_ARGS 5 + struct btf_func_model { u8 ret_size; u8 nr_args; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2efeb5f4b343..16e8148a28e2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4594,8 +4594,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } arg = off / 8; args = (const struct btf_param *)(t + 1); - /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */ - nr_args = t ? btf_type_vlen(t) : 5; + /* if (t == NULL) Fall back to default BPF prog with + * MAX_BPF_FUNC_REG_ARGS u64 arguments. + */ + nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS; if (prog->aux->attach_btf_trace) { /* skip first 'void *__data' argument in btf_trace_##name typedef */ args++; @@ -4651,7 +4653,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } } else { if (!t) - /* Default prog with 5 args */ + /* Default prog with MAX_BPF_FUNC_REG_ARGS args */ return true; t = btf_type_by_id(btf, args[arg].type); } @@ -5102,12 +5104,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, if (!func) { /* BTF function prototype doesn't match the verifier types. - * Fall back to 5 u64 args. + * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args. 
*/ - for (i = 0; i < 5; i++) + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) m->arg_size[i] = 8; m->ret_size = 8; - m->nr_args = 5; + m->nr_args = MAX_BPF_FUNC_REG_ARGS; return 0; } args = (const struct btf_param *)(func + 1); @@ -5330,8 +5332,9 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Function %s has %d > %d args\n", tname, nargs, + MAX_BPF_FUNC_REG_ARGS); goto out; } @@ -5460,9 +5463,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, } args = (const struct btf_param *)(t + 1); nargs = btf_type_vlen(t); - if (nargs > 5) { - bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n", - tname, nargs); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n", + tname, nargs, MAX_BPF_FUNC_REG_ARGS); return -EINVAL; } /* check that function returns int */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1dda9d81f12c..9f7e35590fc6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5544,7 +5544,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn meta.func_id = func_id; /* check args */ - for (i = 0; i < 5; i++) { + for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { err = check_func_arg(env, i, &meta, fn); if (err) return err; -- cgit v1.2.3 From a83586a7ddba25065ec37323c05deb9019ce4fa9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 23 Feb 2021 21:14:57 +0800 Subject: bpf: Remove blank line in bpf helper description comment Commit 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") added an extra blank line in bpf helper description. This will make bpf_helpers_doc.py stop building bpf_helper_defs.h immediately after bpf_check_mtu(), which will affect future added functions. 
Fixes: 34b2021cc616 ("bpf: Add BPF-helper for MTU checking") Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20210223131457.1378978-1-liuhangbin@gmail.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 - tools/include/uapi/linux/bpf.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 4c24daa43bac..79c893310492 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3850,7 +3850,6 @@ union bpf_attr { * * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) * Description - * Check ctx packet size against exceeding MTU of net device (based * on *ifindex*). This helper will likely be used in combination * with helpers that adjust/change the packet size. -- cgit v1.2.3 From 887596095ec2a9ea39ffcf98f27bf2e77c5eb512 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:26 -0800 Subject: bpf: Clean up sockmap related Kconfigs As suggested by John, clean up sockmap related Kconfigs: Reduce the scope of CONFIG_BPF_STREAM_PARSER down to TCP stream parser, to reflect its name. Make the rest sockmap code simply depend on CONFIG_BPF_SYSCALL and CONFIG_INET, the latter is still needed at this point because of TCP/UDP proto update. And leave CONFIG_NET_SOCK_MSG untouched, as it is used by non-sockmap cases. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-2-xiyou.wangcong@gmail.com --- include/linux/bpf.h | 26 ++++----- include/linux/bpf_types.h | 6 +- include/linux/skmsg.h | 18 ++++++ include/net/tcp.h | 16 ++--- include/net/udp.h | 4 +- init/Kconfig | 1 + net/Kconfig | 6 +- net/core/Makefile | 6 +- net/core/skmsg.c | 145 +++++++++++++++++++++++++--------------------- net/core/sock_map.c | 2 + net/ipv4/Makefile | 2 +- net/ipv4/tcp_bpf.c | 4 +- 12 files changed, 133 insertions(+), 103 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ae2c35641619..2be47ada5f2d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1778,7 +1778,7 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) } #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -#if defined(CONFIG_BPF_STREAM_PARSER) +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); @@ -1786,7 +1786,18 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); + +void bpf_sk_reuseport_detach(struct sock *sk); +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, + void *value); +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags); #else +static inline void bpf_sk_reuseport_detach(struct sock *sk) +{ +} + +#ifdef CONFIG_BPF_SYSCALL static inline int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which) @@ -1811,20 +1822,7 @@ static inline 
int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } -#endif /* CONFIG_BPF_STREAM_PARSER */ -#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -void bpf_sk_reuseport_detach(struct sock *sk); -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, - void *value); -int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, - void *value, u64 map_flags); -#else -static inline void bpf_sk_reuseport_detach(struct sock *sk) -{ -} - -#ifdef CONFIG_BPF_SYSCALL static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, void *value) { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b9edee336d80..f883f01a5061 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -103,10 +103,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) -#if defined(CONFIG_BPF_STREAM_PARSER) -BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) -BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) -#endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) #endif @@ -116,6 +112,8 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) #endif #ifdef CONFIG_INET +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) #endif #endif diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 8edbbf5f2f93..db7a08be4725 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -71,7 +71,9 @@ struct sk_psock_link { }; struct sk_psock_parser { +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; +#endif bool enabled; void (*saved_data_ready)(struct sock *sk); }; @@ -305,9 +307,25 @@ static 
inline void sk_psock_report_error(struct sk_psock *psock, int err) struct sk_psock *sk_psock_init(struct sock *sk, int node); +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +#else +static inline int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + return -EOPNOTSUPP; +} + +static inline void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ +} + +static inline void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ +} +#endif + void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); diff --git a/include/net/tcp.h b/include/net/tcp.h index 963cd86d12dd..c00e125dcfb9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2222,25 +2222,27 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) +#ifdef CONFIG_NET_SOCK_MSG struct sk_msg; struct sk_psock; -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); -#else -static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) -{ -} -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ -#ifdef CONFIG_NET_SOCK_MSG int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) +static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) +{ +} +#endif + #ifdef CONFIG_CGROUP_BPF static inline void 
bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, diff --git a/include/net/udp.h b/include/net/udp.h index a132a02b2f2c..d4d064c59232 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -515,9 +515,9 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; } -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -#endif /* BPF_STREAM_PARSER */ +#endif #endif /* _UDP_H */ diff --git a/init/Kconfig b/init/Kconfig index 096e1af5c586..66cef5eac275 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1702,6 +1702,7 @@ config BPF_SYSCALL select BPF select IRQ_WORK select TASKS_TRACE_RCU + select NET_SOCK_MSG if INET default n help Enable the bpf() system call that allows to manipulate eBPF diff --git a/net/Kconfig b/net/Kconfig index 8cea808ad9e8..0ead7ec0d2bd 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -317,13 +317,9 @@ config BPF_STREAM_PARSER select STREAM_PARSER select NET_SOCK_MSG help - Enabling this allows a stream parser to be used with + Enabling this allows a TCP stream parser to be used with BPF_MAP_TYPE_SOCKMAP. - BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets. - It can be used to enforce socket policy, implement socket redirects, - etc. 
- config NET_FLOW_LIMIT bool depends on RPS diff --git a/net/core/Makefile b/net/core/Makefile index 3e2c378e5f31..0c2233c826fd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o -obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -28,10 +27,13 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o +obj-$(CONFIG_BPF_SYSCALL) += sock_map.o +endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 1261512d6807..e017744111e1 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -645,15 +645,15 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +static void sk_psock_done_strp(struct sk_psock *psock); + static void sk_psock_destroy_deferred(struct work_struct *gc) { struct sk_psock *psock = container_of(gc, struct sk_psock, gc); /* No sk_callback_lock since already detached. 
*/ - /* Parser has been stopped */ - if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + sk_psock_done_strp(psock); cancel_work_sync(&psock->work); @@ -750,14 +750,6 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, return bpf_prog_run_pin_on_cpu(prog, skb); } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; @@ -866,6 +858,24 @@ out_free: } } +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk) = NULL; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } + rcu_read_unlock(); + if (write_space) + write_space(sk); +} + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; @@ -897,6 +907,14 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } +static struct sk_psock *sk_psock_from_strp(struct strparser *strp) +{ + struct sk_psock_parser *parser; + + parser = container_of(strp, struct sk_psock_parser, strp); + return container_of(parser, struct sk_psock, parser); +} + static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock = sk_psock_from_strp(strp); @@ -933,6 +951,56 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + 
psock->parser.enabled = false; + return strp_init(&psock->parser.strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + strp_stop(&parser->strp); + parser->enabled = false; +} + +static void sk_psock_done_strp(struct sk_psock *psock) +{ + /* Parser has been stopped */ + if (psock->progs.skb_parser) + strp_done(&psock->parser.strp); +} +#else +static void sk_psock_done_strp(struct sk_psock *psock) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, unsigned int offset, size_t orig_len) { @@ -984,35 +1052,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk) sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); } -static void sk_psock_write_space(struct sock *sk) -{ - struct sk_psock *psock; - void (*write_space)(struct sock *sk) = NULL; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) { - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); - write_space = psock->saved_write_space; - } - rcu_read_unlock(); - if (write_space) - write_space(sk); -} - -int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) -{ - static const struct strp_callbacks cb = { - .rcv_msg = sk_psock_strp_read, - .read_sock_done = sk_psock_strp_read_done, - .parse_msg = sk_psock_strp_parse, - }; - - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); -} - void sk_psock_start_verdict(struct sock *sk, 
struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -1026,32 +1065,6 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) parser->enabled = true; } -void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) - return; - - parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) - return; - - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; -} - void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d758fb83c884..ee3334dd3a38 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1461,9 +1461,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, case BPF_SK_MSG_VERDICT: pprog = &progs->msg_parser; break; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: pprog = &progs->skb_parser; break; +#endif case BPF_SK_SKB_STREAM_VERDICT: pprog = &progs->skb_verdict; break; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b9..bbdd9c44f14e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -62,7 +62,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += udp_bpf.o +obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c 
index bc7d2a586e18..17c322b875fd 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -229,7 +229,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); -#ifdef CONFIG_BPF_STREAM_PARSER +#ifdef CONFIG_BPF_SYSCALL static bool tcp_bpf_stream_read(const struct sock *sk) { struct sk_psock *psock; @@ -629,4 +629,4 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) if (prot == &tcp_bpf_prots[family][TCP_BPF_BASE]) newsk->sk_prot = sk->sk_prot_creator; } -#endif /* CONFIG_BPF_STREAM_PARSER */ +#endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 5a685cd94b21a88efa6be77169eddef525368034 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:27 -0800 Subject: skmsg: Get rid of struct sk_psock_parser struct sk_psock_parser is embedded in sk_psock, it is unnecessary as skb verdict also uses ->saved_data_ready. We can simply fold these fields into sk_psock, and get rid of ->enabled. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 19 +++++++----------- net/core/skmsg.c | 53 ++++++++++++++++----------------------------------- net/core/sock_map.c | 8 ++++---- 3 files changed, 27 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index db7a08be4725..22e26f82de33 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -70,14 +70,6 @@ struct sk_psock_link { void *link_raw; }; -struct sk_psock_parser { -#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) - struct strparser strp; -#endif - bool enabled; - void (*saved_data_ready)(struct sock *sk); -}; - struct sk_psock_work_state { struct sk_buff *skb; u32 len; @@ -92,7 +84,9 @@ struct sk_psock { u32 eval; struct sk_msg *cork; struct sk_psock_progs progs; - struct sk_psock_parser parser; +#if 
IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + struct strparser strp; +#endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; unsigned long state; @@ -102,6 +96,7 @@ struct sk_psock { void (*saved_unhash)(struct sock *sk); void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); + void (*saved_data_ready)(struct sock *sk); struct proto *sk_proto; struct sk_psock_work_state work_state; struct work_struct work; @@ -418,8 +413,8 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) { - if (psock->parser.enabled) - psock->parser.saved_data_ready(sk); + if (psock->saved_data_ready) + psock->saved_data_ready(sk); else sk->sk_data_ready(sk); } @@ -458,6 +453,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) { if (!psock) return false; - return psock->parser.enabled; + return !!psock->saved_data_ready; } #endif /* _LINUX_SKMSG_H */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e017744111e1..d00c9a4b47e7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -907,17 +907,9 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { - struct sk_psock *psock = sk_psock_from_strp(strp); + struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; @@ -941,10 +933,10 @@ static void sk_psock_strp_data_ready(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { - psock->parser.saved_data_ready(sk); + psock->saved_data_ready(sk); } else { write_lock_bh(&sk->sk_callback_lock); - 
strp_data_ready(&psock->parser.strp); + strp_data_ready(&psock->strp); write_unlock_bh(&sk->sk_callback_lock); } } @@ -959,41 +951,34 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) .parse_msg = sk_psock_strp_parse, }; - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); + return strp_init(&psock->strp, sk, &cb); } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; + strp_stop(&psock->strp); } static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + strp_done(&psock->strp); } #else static void sk_psock_done_strp(struct sk_psock *psock) @@ -1054,25 +1039,19 @@ static void sk_psock_verdict_data_ready(struct sock *sk) void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { - struct 
sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ee3334dd3a38..1a28a5c2c61e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->parser.enabled && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.skb_parser) strp_stop = true; - if (psock->parser.enabled && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -283,14 +283,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.skb_verdict, skb_verdict); sk_psock_start_verdict(sk,psock); } -- cgit v1.2.3 From 16137b09a66f2b75090f1e56a9ba0e27ef845ebc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:28 -0800 Subject: bpf: Compute data_end dynamically with JIT code Currently, we compute ->data_end with a compile-time constant offset of skb. 
But as Jakub pointed out, we can actually compute it in eBPF JIT code at run-time, so that we can completely get rid of ->data_end. This is similar to skb_shinfo(skb) computation in bpf_convert_shinfo_access(). Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-4-xiyou.wangcong@gmail.com --- include/net/tcp.h | 6 ------ net/core/filter.c | 48 ++++++++++++++++++++++++++++-------------------- net/core/skmsg.c | 1 - 3 files changed, 28 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index c00e125dcfb9..947ef5da6867 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -886,18 +886,12 @@ struct tcp_skb_cb { struct { __u32 flags; struct sock *sk_redir; - void *data_end; } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; diff --git a/net/core/filter.c b/net/core/filter.c index adfdad234674..13bcf248ee7b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = { static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err = __bpf_try_make_writable(skb, write_len); - - bpf_compute_data_end_sk_skb(skb); - return err; + return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -3577,7 +3574,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOMEM; __skb_pull(skb, len_diff_abs); } - bpf_compute_data_end_sk_skb(skb); if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); @@
-3742,10 +3738,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { - int ret = __bpf_skb_change_tail(skb, new_len, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { @@ -3808,10 +3801,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { - int ret = __bpf_skb_change_head(skb, head_room, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { @@ -9655,22 +9645,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* si->dst_reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + /* si->dst_reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + return insn; +} + static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - int off; switch (si->off) 
{ case offsetof(struct __sk_buff, data_end): - off = si->off; - off -= offsetof(struct __sk_buff, data_end); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct tcp_skb_cb, bpf.data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); + insn = bpf_convert_data_end_access(si, insn); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d00c9a4b47e7..8822001ab3dc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -746,7 +746,6 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct sk_buff *skb) { - bpf_compute_data_end_sk_skb(skb); return bpf_prog_run_pin_on_cpu(prog, skb); } -- cgit v1.2.3 From e3526bb92a2084cdaec6cb2855bcec98b280426c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:29 -0800 Subject: skmsg: Move sk_redir from TCP_SKB_CB to skb Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly does not work for any other non-TCP protocols. We can move them to skb ext, but it introduces a memory allocation on fast path. Fortunately, we only need a word-sized field to store all the information, because the flags actually only contain 1 bit so can be just packed into the lowest bit of the "pointer", which is stored as unsigned long. Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is no longer needed after ->sk_data_ready() so we can just drop it.
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-5-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 3 +++ include/linux/skmsg.h | 38 ++++++++++++++++++++++++++++++++++++++ include/net/tcp.h | 19 ------------------- net/core/skmsg.c | 31 +++++++++++++++++++------------ net/core/sock_map.c | 8 ++------ 5 files changed, 62 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d0a33d1c0db..bd84f799c952 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -755,6 +755,9 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; +#ifdef CONFIG_NET_SOCK_MSG + unsigned long _sk_redir; +#endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 22e26f82de33..e0de45527bb6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -455,4 +455,42 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return false; return !!psock->saved_data_ready; } + +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) + +/* We only have one bit so far. 
*/ +#define BPF_F_PTR_MASK ~(BPF_F_INGRESS) + +static inline bool skb_bpf_ingress(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return sk_redir & BPF_F_INGRESS; +} + +static inline void skb_bpf_set_ingress(struct sk_buff *skb) +{ + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, + bool ingress) +{ + skb->_sk_redir = (unsigned long)sk_redir; + if (ingress) + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return (struct sock *)(sk_redir & BPF_F_PTR_MASK); +} + +static inline void skb_bpf_redirect_clear(struct sk_buff *skb) +{ + skb->_sk_redir = 0; +} +#endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 947ef5da6867..075de26f449d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -883,30 +883,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} - extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8822001ab3dc..409258367bea 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ingress(skb); + 
skb_bpf_redirect_clear(skb); do { ret = -EIO; if (likely(psock->sk->sk_socket)) @@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) static void sk_psock_zap_ingress(struct sk_psock *psock) { - __skb_queue_purge(&psock->ingress_skb); + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + kfree_skb(skb); + } __sk_psock_purge_ingress_msg(psock); } @@ -754,7 +760,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -804,9 +810,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) * TLS context. */ skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -818,7 +825,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -830,8 +836,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. 
If its not empty we have to @@ -892,9 +897,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -1011,9 +1017,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1a28a5c2c61e..dbfcd7006338 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ 
-1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } -- cgit v1.2.3 From ae8b8332fbb512f53bf50ff6a7586dd0f90ed18a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:30 -0800 Subject: sock_map: Rename skb_parser and skb_verdict These two eBPF programs are tied to BPF_SK_SKB_STREAM_PARSER and BPF_SK_SKB_STREAM_VERDICT, rename them to reflect the fact they are only used for TCP. And save the name 'skb_verdict' for general use later. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-6-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 8 +-- net/core/skmsg.c | 14 ++--- net/core/sock_map.c | 60 +++++++++++----------- .../selftests/bpf/prog_tests/sockmap_listen.c | 8 +-- .../selftests/bpf/progs/test_sockmap_listen.c | 4 +- 5 files changed, 47 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e0de45527bb6..d9f6ec4a9cf2 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -56,8 +56,8 @@ struct sk_msg { struct sk_psock_progs { struct bpf_prog *msg_parser; - struct bpf_prog *skb_parser; - struct bpf_prog *skb_verdict; + struct bpf_prog *stream_parser; + struct bpf_prog *stream_verdict; }; enum sk_psock_state_bits { @@ -443,8 +443,8 @@ static inline int psock_replace_prog(struct bpf_prog **pprog, static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); - psock_set_prog(&progs->skb_parser, NULL); - psock_set_prog(&progs->skb_verdict, NULL); + psock_set_prog(&progs->stream_parser, NULL); + psock_set_prog(&progs->stream_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock 
*psock, struct sk_buff *skb); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 409258367bea..35f9caa3b125 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -691,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.skb_verdict) + else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -803,7 +803,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) int ret = __SK_PASS; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { /* We skip full set_owner_r here because if we do a SK_PASS * or SK_DROP we can skip skb memory accounting and use the @@ -895,7 +895,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); @@ -919,7 +919,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) int ret = skb->len; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_parser); + prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); @@ -982,7 +982,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) strp_done(&psock->strp); } #else @@ -1015,7 +1015,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, goto 
out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index dbfcd7006338..69785070f02d 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->saved_data_ready && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -224,23 +224,23 @@ out: static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { - struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; struct sk_psock *psock; int ret; - skb_verdict = READ_ONCE(progs->skb_verdict); - if (skb_verdict) { - skb_verdict = bpf_prog_inc_not_zero(skb_verdict); - if (IS_ERR(skb_verdict)) - return PTR_ERR(skb_verdict); + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); } - skb_parser = READ_ONCE(progs->skb_parser); - if (skb_parser) { - skb_parser = bpf_prog_inc_not_zero(skb_parser); - if (IS_ERR(skb_parser)) { - ret = PTR_ERR(skb_parser); - goto out_put_skb_verdict; + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; } } @@ -249,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct 
sk_psock_progs *progs, msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); - goto out_put_skb_parser; + goto out_put_stream_parser; } } @@ -261,8 +261,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_parser && READ_ONCE(psock->progs.skb_parser)) || - (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -283,15 +283,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->saved_data_ready) { + if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); - psock_set_prog(&psock->progs.skb_parser, skb_parser); + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + psock_set_prog(&psock->progs.stream_parser, stream_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); @@ -303,12 +303,12 @@ out_drop: out_progs: if (msg_parser) bpf_prog_put(msg_parser); -out_put_skb_parser: - if (skb_parser) - bpf_prog_put(skb_parser); -out_put_skb_verdict: - if (skb_verdict) - bpf_prog_put(skb_verdict); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); return 
ret; } @@ -1459,11 +1459,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->skb_parser; + pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: - pprog = &progs->skb_verdict; + pprog = &progs->stream_verdict; break; default: return -EOPNOTSUPP; diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index d7d65a700799..c26e6bf05e49 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1014,8 +1014,8 @@ static void test_skb_redir_to_connected(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; @@ -1125,8 +1125,8 @@ static void test_skb_redir_to_listening(struct test_sockmap_listen *skel, struct bpf_map *inner_map, int family, int sotype) { - int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); - int parser = bpf_program__fd(skel->progs.prog_skb_parser); + int verdict = bpf_program__fd(skel->progs.prog_stream_verdict); + int parser = bpf_program__fd(skel->progs.prog_stream_parser); int verdict_map = bpf_map__fd(skel->maps.verdict_map); int sock_map = bpf_map__fd(inner_map); int err; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index a3a366c57ce1..fa221141e9c1 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -31,13 +31,13 
@@ struct { static volatile bool test_sockmap; /* toggled by user-space */ SEC("sk_skb/stream_parser") -int prog_skb_parser(struct __sk_buff *skb) +int prog_stream_parser(struct __sk_buff *skb) { return skb->len; } SEC("sk_skb/stream_verdict") -int prog_skb_verdict(struct __sk_buff *skb) +int prog_stream_verdict(struct __sk_buff *skb) { unsigned int *count; __u32 zero = 0; -- cgit v1.2.3 From 4675e234b9e15159894b90ead9340e1dc202b670 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:31 -0800 Subject: sock_map: Make sock_map_prog_update() static It is only used within sock_map.c so can become static. Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-7-xiyou.wangcong@gmail.com --- include/linux/bpf.h | 9 --------- net/core/sock_map.c | 7 +++++-- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2be47ada5f2d..e1e4d2f60527 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1779,8 +1779,6 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); @@ -1798,13 +1796,6 @@ static inline void bpf_sk_reuseport_detach(struct sock *sk) } #ifdef CONFIG_BPF_SYSCALL -static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, - struct bpf_prog *old, u32 which) -{ - return -EOPNOTSUPP; -} - static inline int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) 
{ diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 69785070f02d..dd53a7771d7e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,6 +24,9 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); + static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; @@ -1444,8 +1447,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **pprog; -- cgit v1.2.3 From cd81cefb1abc52bd164f4d9760cd22eadc0e4468 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:32 -0800 Subject: skmsg: Make __sk_psock_purge_ingress_msg() static It is only used within skmsg.c so can become static. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-8-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 -- net/core/skmsg.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9f6ec4a9cf2..676d48e08159 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -340,8 +340,6 @@ static inline void sk_psock_free_link(struct sk_psock_link *link) struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock); -void __sk_psock_purge_ingress_msg(struct sk_psock *psock); - static inline void sk_psock_cork_free(struct sk_psock *psock) { if (psock->cork) { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 35f9caa3b125..46e29d2c0c48 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -619,7 +619,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) return link; } -void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; -- cgit v1.2.3 From ff9614b81be65d648ec4615b593c6e4b2dac6375 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:34 -0800 Subject: skmsg: Remove unused sk_psock_stop() declaration It is not defined or used anywhere. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-10-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 676d48e08159..6c09d94be2e9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -400,7 +400,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk) return psock; } -void sk_psock_stop(struct sock *sk, struct sk_psock *psock); void sk_psock_drop(struct sock *sk, struct sk_psock *psock); static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) -- cgit v1.2.3 From 69c087ba6225b574afb6e505b72cb75242a3d844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:25 -0800 Subject: bpf: Add bpf_for_each_map_elem() helper The bpf_for_each_map_elem() helper is introduced which iterates all map elements with a callback function. The helper signature looks like long bpf_for_each_map_elem(map, callback_fn, callback_ctx, flags) and for each map element, the callback_fn will be called. For example, like hashmap, the callback signature may look like long callback_fn(map, key, val, callback_ctx) There are two known use cases for this. One is from upstream ([1]) where a for_each_map_elem helper may help implement a timeout mechanism in a more generic way. Another is from our internal discussion for a firewall use case where a map contains all the rules. The packet data can be compared to all these rules to decide allow or deny the packet. For array maps, users can already use a bounded loop to traverse elements. Using this helper can avoid using bounded loop. For other type of maps (e.g., hash maps) where bounded loop is hard or impossible to use, this helper provides a convenient way to operate on all elements. 
For callback_fn, besides map and map element, a callback_ctx, allocated on caller stack, is also passed to the callback function. This callback_ctx argument can provide additional input and allow to write to caller stack for output. If the callback_fn returns 0, the helper will iterate through next element if available. If the callback_fn returns 1, the helper will stop iterating and returns to the bpf program. Other return values are not used for now. Currently, this helper is only available with jit. It is possible to make it work with interpreter with so effort but I leave it as the future work. [1]: https://lore.kernel.org/bpf/20210122205415.113822-1-xiyou.wangcong@gmail.com/ Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204925.3884923-1-yhs@fb.com --- include/linux/bpf.h | 13 +++ include/linux/bpf_verifier.h | 3 + include/uapi/linux/bpf.h | 38 ++++++++ kernel/bpf/bpf_iter.c | 16 ++++ kernel/bpf/helpers.c | 2 + kernel/bpf/verifier.c | 208 ++++++++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 2 + tools/include/uapi/linux/bpf.h | 38 ++++++++ 8 files changed, 307 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1e4d2f60527..aeb1b93a4d75 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,6 +39,7 @@ struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; struct mem_cgroup; +struct bpf_func_state; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -129,6 +130,13 @@ struct bpf_map_ops { bool (*map_meta_equal)(const struct bpf_map *meta0, const struct bpf_map *meta1); + + int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee); + int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn, + void *callback_ctx, u64 flags); + /* BTF name and id of struct allocated by map_alloc */ 
const char * const map_btf_name; int *map_btf_id; @@ -295,6 +303,8 @@ enum bpf_arg_type { ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ + ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ + ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ __BPF_ARG_TYPE_MAX, }; @@ -411,6 +421,8 @@ enum bpf_reg_type { PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ + PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ }; /* The information passed from prog-specific *_is_valid_access @@ -1887,6 +1899,7 @@ extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_for_each_map_elem_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 971b33aca13d..51c2ffa3d901 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -68,6 +68,8 @@ struct bpf_reg_state { unsigned long raw1; unsigned long raw2; } raw; + + u32 subprogno; /* for PTR_TO_FUNC */ }; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. 
@@ -204,6 +206,7 @@ struct bpf_func_state { int acquired_refs; struct bpf_reference_state *refs; int allocated_stack; + bool in_callback_fn; struct bpf_stack_state *stack; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. 
If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index a0d9eade9c80..931870f9cf56 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -675,3 +675,19 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) */ return ret == 0 ? 0 : -EAGAIN; } + +BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, + void *, callback_ctx, u64, flags) +{ + return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); +} + +const struct bpf_func_proto bpf_for_each_map_elem_proto = { + .func = bpf_for_each_map_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 308427fe03a3..074800226327 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -708,6 +708,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dbdca49ac6cc..53afe9461b03 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_func(const struct bpf_insn *insn) +{ + return insn->code == 
(BPF_LD | BPF_IMM | BPF_DW) && + insn->src_reg == BPF_PSEUDO_FUNC; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -248,6 +254,7 @@ struct bpf_call_arg_meta { u32 btf_id; struct btf *ret_btf; u32 ret_btf_id; + u32 subprogno; }; struct btf *btf_vmlinux; @@ -427,6 +434,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || type == PTR_TO_MAP_VALUE || + type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON; } @@ -469,7 +477,8 @@ static bool arg_type_may_be_null(enum bpf_arg_type type) type == ARG_PTR_TO_MEM_OR_NULL || type == ARG_PTR_TO_CTX_OR_NULL || type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL; + type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || + type == ARG_PTR_TO_STACK_OR_NULL; } /* Determine whether the function releases some resources allocated by another @@ -552,6 +561,8 @@ static const char * const reg_type_str[] = { [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", [PTR_TO_RDWR_BUF] = "rdwr_buf", [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", }; static char slot_type_char[] = { @@ -623,6 +634,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || + t == PTR_TO_MAP_KEY || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) verbose(env, ",ks=%d,vs=%d", @@ -1555,6 +1567,19 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. 
The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { + if (bpf_pseudo_func(insn + i)) { + if (!env->bpf_capable) { + verbose(env, + "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + return -EPERM; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + /* remember subprog */ + insn[i + 1].imm = ret; + continue; + } if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { @@ -2286,6 +2311,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_MEM_OR_NULL: + case PTR_TO_FUNC: + case PTR_TO_MAP_KEY: return true; default: return false; @@ -2890,6 +2917,10 @@ static int __check_mem_access(struct bpf_verifier_env *env, int regno, reg = &cur_regs(env)[regno]; switch (reg->type) { + case PTR_TO_MAP_KEY: + verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n", + mem_size, off, size); + break; case PTR_TO_MAP_VALUE: verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", mem_size, off, size); @@ -3295,6 +3326,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_FLOW_KEYS: pointer_desc = "flow keys "; break; + case PTR_TO_MAP_KEY: + pointer_desc = "key "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -3396,7 +3430,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (!bpf_pseudo_call(insn + i)) + if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -3833,7 +3867,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn /* for access checks, reg->off is just part of off */ off += reg->off; - if (reg->type == PTR_TO_MAP_VALUE) { + if (reg->type == PTR_TO_MAP_KEY) { + if (t == BPF_WRITE) { + verbose(env, "write to change key R%d not allowed\n", regno); + return -EACCES; + } + + err = 
check_mem_region_access(env, regno, off, size, + reg->map_ptr->key_size, false); + if (err) + return err; + if (value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into map\n", value_regno); @@ -4249,6 +4295,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_MAP_KEY: + return check_mem_region_access(env, regno, reg->off, access_size, + reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: if (check_map_access_type(env, regno, reg->off, access_size, meta && meta->raw_mode ? BPF_WRITE : @@ -4465,6 +4514,7 @@ static const struct bpf_reg_types map_key_value_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4496,6 +4546,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_RDONLY_BUF, @@ -4508,6 +4559,7 @@ static const struct bpf_reg_types int_ptr_types = { PTR_TO_STACK, PTR_TO_PACKET, PTR_TO_PACKET_META, + PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, }, }; @@ -4520,6 +4572,8 @@ static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_T static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4548,6 
+4602,8 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, + [ARG_PTR_TO_FUNC] = &func_ptr_types, + [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -4729,6 +4785,8 @@ skip_type_check: verbose(env, "verifier internal error\n"); return -EFAULT; } + } else if (arg_type == ARG_PTR_TO_FUNC) { + meta->subprogno = reg->subprogno; } else if (arg_type_is_mem_ptr(arg_type)) { /* The access to this pointer is only checked when we hit the * next is_mem_size argument below. @@ -5375,6 +5433,35 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, return __check_func_call(env, insn, insn_idx, subprog, set_callee_state); } +static int set_map_elem_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map; + int err; + + if (bpf_map_ptr_poisoned(insn_aux)) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + + map = BPF_MAP_PTR(insn_aux->map_ptr_state); + if (!map->ops->map_set_for_each_callback_args || + !map->ops->map_for_each_callback) { + verbose(env, "callback function not allowed for map\n"); + return -ENOTSUPP; + } + + err = map->ops->map_set_for_each_callback_args(env, caller, callee); + if (err) + return err; + + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -5397,8 +5484,22 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) state->curframe--; caller = state->frame[state->curframe]; - /* return to the caller whatever r0 had in the callee */ - caller->regs[BPF_REG_0] = *r0; + if 
(callee->in_callback_fn) { + /* enforce R0 return value range [0, 1]. */ + struct tnum range = tnum_range(0, 1); + + if (r0->type != SCALAR_VALUE) { + verbose(env, "R0 not a scalar value\n"); + return -EACCES; + } + if (!tnum_in(range, r0->var_off)) { + verbose_invalid_scalar(env, r0, &range, "callback return", "R0"); + return -EINVAL; + } + } else { + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + } /* Transfer references to the caller */ err = transfer_reference_state(caller, callee); @@ -5453,7 +5554,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && - func_id != BPF_FUNC_map_peek_elem) + func_id != BPF_FUNC_map_peek_elem && + func_id != BPF_FUNC_for_each_map_elem) return 0; if (map == NULL) { @@ -5534,15 +5636,18 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? 
-EINVAL : 0; } -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; + int insn_idx = *insn_idx_p; bool changes_data; - int i, err; + int i, err, func_id; /* find function prototype */ + func_id = insn->imm; if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { verbose(env, "invalid func %s#%d\n", func_id_name(func_id), func_id); @@ -5638,6 +5743,13 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return -EINVAL; } + if (func_id == BPF_FUNC_for_each_map_elem) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_map_elem_callback_state); + if (err < 0) + return -EINVAL; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -5891,6 +6003,19 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, else *ptr_limit = -off; return 0; + case PTR_TO_MAP_KEY: + /* Currently, this code is not exercised as the only use + * is bpf_for_each_map_elem() helper which requires + * bpf_capable. The code has been tested manually for + * future use.
+ */ + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->key_size - off; + } + return 0; case PTR_TO_MAP_VALUE: if (mask_to_left) { *ptr_limit = ptr_reg->umax_value + ptr_reg->off; @@ -6092,6 +6217,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", @@ -8271,6 +8397,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + if (insn->src_reg == BPF_PSEUDO_FUNC) { + struct bpf_prog_aux *aux = env->prog->aux; + u32 subprogno = insn[1].imm; + + if (!aux->func_info) { + verbose(env, "missing btf func_info\n"); + return -EINVAL; + } + if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) { + verbose(env, "callback function not static\n"); + return -EINVAL; + } + + dst_reg->type = PTR_TO_FUNC; + dst_reg->subprogno = subprogno; + return 0; + } + map = env->used_maps[aux->map_index]; mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; @@ -8657,6 +8801,9 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; int ret; + if (bpf_pseudo_func(insns + t)) + return visit_func_call_insn(t, insn_cnt, insns, env, true); + /* All non-branch instructions have a single fall-through edge. 
*/ if (BPF_CLASS(insns[t].code) != BPF_JMP && BPF_CLASS(insns[t].code) != BPF_JMP32) @@ -9277,6 +9424,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, */ return false; } + case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. @@ -10123,10 +10271,9 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, env->insn_idx); + err = check_helper_call(env, insn, &env->insn_idx); if (err) return err; - } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || @@ -10555,6 +10702,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) goto next_insn; } + if (insn[0].src_reg == BPF_PSEUDO_FUNC) { + aux = &env->insn_aux_data[i]; + aux->ptr_type = PTR_TO_FUNC; + goto next_insn; + } + /* In final convert_pseudo_ld_imm64() step, this is * converted into regular 64-bit imm load insn. 
*/ @@ -10687,9 +10840,13 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) int insn_cnt = env->prog->len; int i; - for (i = 0; i < insn_cnt; i++, insn++) - if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) - insn->src_reg = 0; + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + if (insn->src_reg == BPF_PSEUDO_FUNC) + continue; + insn->src_reg = 0; + } } /* single env->prog->insni[off] instruction was replaced with the range @@ -11330,6 +11487,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + env->insn_aux_data[i].call_imm = insn->imm; + /* subprog is encoded in insn[1].imm */ + continue; + } + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but @@ -11459,6 +11622,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { + if (bpf_pseudo_func(insn)) { + subprog = insn[1].imm; + insn[0].imm = (u32)(long)func[subprog]->bpf_func; + insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32; + continue; + } if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; @@ -11504,6 +11673,11 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. 
*/ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + insn[0].imm = env->insn_aux_data[i].call_imm; + insn[1].imm = find_subprog(env, i + insn[0].imm + 1); + continue; + } if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; @@ -11568,6 +11742,14 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { + if (bpf_pseudo_func(insn)) { + /* When JIT fails the progs with callback calls + * have to be rejected, since interpreter doesn't support them yet. + */ + verbose(env, "callbacks are not allowed in non-JITed programs\n"); + return -EINVAL; + } + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e9701744d8e4..0d23755c2747 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1371,6 +1371,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_proto; + case BPF_FUNC_for_each_map_elem: + return &bpf_for_each_map_elem_proto; default: return NULL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 79c893310492..b89af20cfa19 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -393,6 +393,15 @@ enum bpf_link_type { * is struct/union. */ #define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. 
+ */ +#define BPF_PSEUDO_FUNC 4 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -3909,6 +3918,34 @@ union bpf_attr { * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following is a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** returns 0, the helper will continue to the next + * element. If the return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4075,6 +4112,7 @@ union bpf_attr { FN(ima_inode_hash), \ FN(sock_from_file), \ FN(check_mtu), \ + FN(for_each_map_elem), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 314ee05e2fc601a7bece14376547d2b7a04bab67 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Feb 2021 12:49:27 -0800 Subject: bpf: Add hashtab support for bpf_for_each_map_elem() helper This patch added support for hashmap, percpu hashmap, lru hashmap and percpu lru hashmap. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210226204927.3885020-1-yhs@fb.com --- include/linux/bpf.h | 4 ++++ kernel/bpf/hashtab.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 27 +++++++++++++++++++++ 3 files changed, 96 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aeb1b93a4d75..4c730863fa77 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1397,6 +1397,10 @@ void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +int map_set_for_each_callback_args(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee); + int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d63912e73ad9..330d721dd2af 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1869,6 +1869,63 @@ static const struct bpf_iter_seq_info iter_seq_info = { .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), }; +static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn, + void 
*callback_ctx, u64 flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + u32 roundup_key_size; + int i, num_elems = 0; + void __percpu *pptr; + struct bucket *b; + void *key, *val; + bool is_percpu; + u64 ret = 0; + + if (flags != 0) + return -EINVAL; + + is_percpu = htab_is_percpu(htab); + + roundup_key_size = round_up(map->key_size, 8); + /* disable migration so percpu value prepared here will be the + * same as the one seen by the bpf program with bpf_map_lookup_elem(). + */ + if (is_percpu) + migrate_disable(); + for (i = 0; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + rcu_read_lock(); + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + key = elem->key; + if (is_percpu) { + /* current cpu value for percpu map */ + pptr = htab_elem_get_ptr(elem, map->key_size); + val = this_cpu_ptr(pptr); + } else { + val = elem->key + roundup_key_size; + } + num_elems++; + ret = BPF_CAST_CALL(callback_fn)((u64)(long)map, + (u64)(long)key, (u64)(long)val, + (u64)(long)callback_ctx, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) { + rcu_read_unlock(); + goto out; + } + } + rcu_read_unlock(); + } +out: + if (is_percpu) + migrate_enable(); + return num_elems; +} + static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -1881,6 +1938,8 @@ const struct bpf_map_ops htab_map_ops = { .map_delete_elem = htab_map_delete_elem, .map_gen_lookup = htab_map_gen_lookup, .map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab), .map_btf_name = "bpf_htab", .map_btf_id = &htab_map_btf_id, @@ -1900,6 +1959,8 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, 
.map_seq_show_elem = htab_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_map_btf_id, @@ -2019,6 +2080,8 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_percpu_map_btf_id, @@ -2036,6 +2099,8 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_update_elem = htab_lru_percpu_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_seq_show_elem = htab_percpu_map_seq_show_elem, + .map_set_for_each_callback_args = map_set_for_each_callback_args, + .map_for_each_callback = bpf_for_each_hash_elem, BATCH_OPS(htab_lru_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_percpu_map_btf_id, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53afe9461b03..9fe90ce52a65 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5403,6 +5403,33 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn return 0; } +int map_set_for_each_callback_args(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee) +{ + /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, + * void *callback_ctx, u64 flags); + * callback_fn(struct bpf_map *map, void *key, void *value, + * void *callback_ctx); + */ + callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + 
__mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr; + + /* pointer to stack or null */ + callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + return 0; +} + static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx) -- cgit v1.2.3 From e5b0ad69c97a04f42834b24a6a0323ab15ccc9bb Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Wed, 3 Mar 2021 08:34:04 -0800 Subject: Bluetooth: Remove unneeded commands for suspend During suspend, there are a few scan enable and set event filter commands that don't need to be sent unless there are actual BR/EDR devices capable of waking the system. Check the HCI_PSCAN bit before writing scan enable and use a new dev flag, HCI_EVENT_FILTER_CONFIGURED to control whether to clear the event filter. Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Archie Pusaka Reviewed-by: Alain Michaud Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_event.c | 27 +++++++++++++++++++++++++++ net/bluetooth/hci_request.c | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 55 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index ba2f439bc04d..ea4ae551c426 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -320,6 +320,7 @@ enum { HCI_BREDR_ENABLED, HCI_LE_SCAN_INTERRUPTED, HCI_WIDEBAND_SPEECH_ENABLED, + HCI_EVENT_FILTER_CONFIGURED, HCI_DUT_MODE, HCI_VENDOR_DIAG, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 67668be3461e..f4a734f8a9ac 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -395,6 +395,29 @@ done: hci_dev_unlock(hdev); } +static void hci_cc_set_event_filter(struct hci_dev *hdev, struct sk_buff *skb) +{ + __u8 status = *((__u8 
*)skb->data); + struct hci_cp_set_event_filter *cp; + void *sent; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + sent = hci_sent_cmd_data(hdev, HCI_OP_SET_EVENT_FLT); + if (!sent) + return; + + cp = (struct hci_cp_set_event_filter *)sent; + + if (cp->flt_type == HCI_FLT_CLEAR_ALL) + hci_dev_clear_flag(hdev, HCI_EVENT_FILTER_CONFIGURED); + else + hci_dev_set_flag(hdev, HCI_EVENT_FILTER_CONFIGURED); +} + static void hci_cc_read_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_rp_read_class_of_dev *rp = (void *) skb->data; @@ -3328,6 +3351,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_write_scan_enable(hdev, skb); break; + case HCI_OP_SET_EVENT_FLT: + hci_cc_set_event_filter(hdev, skb); + break; + case HCI_OP_READ_CLASS_OF_DEV: hci_cc_read_class_of_dev(hdev, skb); break; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index e55976db4403..75a42178c82d 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1131,14 +1131,14 @@ static void hci_req_clear_event_filter(struct hci_request *req) { struct hci_cp_set_event_filter f; - memset(&f, 0, sizeof(f)); - f.flt_type = HCI_FLT_CLEAR_ALL; - hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &f); + if (!hci_dev_test_flag(req->hdev, HCI_BREDR_ENABLED)) + return; - /* Update page scan state (since we may have modified it when setting - * the event filter). 
- */ - __hci_req_update_scan(req); + if (hci_dev_test_flag(req->hdev, HCI_EVENT_FILTER_CONFIGURED)) { + memset(&f, 0, sizeof(f)); + f.flt_type = HCI_FLT_CLEAR_ALL; + hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &f); + } } static void hci_req_set_event_filter(struct hci_request *req) @@ -1147,6 +1147,10 @@ static void hci_req_set_event_filter(struct hci_request *req) struct hci_cp_set_event_filter f; struct hci_dev *hdev = req->hdev; u8 scan = SCAN_DISABLED; + bool scanning = test_bit(HCI_PSCAN, &hdev->flags); + + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + return; /* Always clear event filter when starting */ hci_req_clear_event_filter(req); @@ -1167,12 +1171,13 @@ static void hci_req_set_event_filter(struct hci_request *req) scan = SCAN_PAGE; } - if (scan) + if (scan && !scanning) { set_bit(SUSPEND_SCAN_ENABLE, hdev->suspend_tasks); - else + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + } else if (!scan && scanning) { set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); - - hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); + } } static void cancel_adv_timeout(struct hci_dev *hdev) @@ -1315,9 +1320,14 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) hdev->advertising_paused = true; hdev->advertising_old_state = old_state; - /* Disable page scan */ - page_scan = SCAN_DISABLED; - hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &page_scan); + + /* Disable page scan if enabled */ + if (test_bit(HCI_PSCAN, &hdev->flags)) { + page_scan = SCAN_DISABLED; + hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, + &page_scan); + set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); + } /* Disable LE passive scan if enabled */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { @@ -1328,9 +1338,6 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) /* Disable advertisement filters */ hci_req_add_set_adv_filter_enable(&req, false); - /* Mark task needing completion 
*/ - set_bit(SUSPEND_SCAN_DISABLE, hdev->suspend_tasks); - /* Prevent disconnects from causing scanning to be re-enabled */ hdev->scanning_paused = true; @@ -1364,7 +1371,10 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) hdev->suspended = false; hdev->scanning_paused = false; + /* Clear any event filters and restore scan state */ hci_req_clear_event_filter(&req); + __hci_req_update_scan(&req); + /* Reset passive/background scanning to normal */ __hci_update_background_scan(&req); /* Enable all of the advertisement filters */ -- cgit v1.2.3 From ff02db13e9bfa01e0d66c5fa53da29bd1f1b208a Mon Sep 17 00:00:00 2001 From: Daniel Winkler Date: Wed, 3 Mar 2021 11:15:23 -0800 Subject: Bluetooth: Allow scannable adv with extended MGMT APIs An issue was found, where if a bluetooth client requests a broadcast advertisement with scan response data, it will not be properly registered with the controller. This is because at the time that the hci_cp_le_set_scan_param structure is created, the scan response will not yet have been received since it comes in a second MGMT call. With empty scan response, the request defaults to a non-scannable PDU type. On some controllers, the subsequent scan response request will fail due to incorrect PDU type, and others will succeed and not use the scan response. This fix allows the advertising parameters MGMT call to include a flag to let the kernel know whether a scan response will be coming, so that the correct PDU type is used in the first place. A bluetoothd change is also incoming to take advantage of it. To test this, I created a broadcast advertisement with scan response data and registered it on the hatch chromebook. Without this change, the request fails, and with it will succeed. 
Reviewed-by: Alain Michaud Reviewed-by: Sonny Sasaka Reviewed-by: Miao-chen Chou Signed-off-by: Daniel Winkler Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 1 + net/bluetooth/hci_request.c | 3 ++- net/bluetooth/mgmt.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 839a2028009e..a7cffb069565 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -578,6 +578,7 @@ struct mgmt_rp_add_advertising { #define MGMT_ADV_PARAM_TIMEOUT BIT(13) #define MGMT_ADV_PARAM_INTERVALS BIT(14) #define MGMT_ADV_PARAM_TX_POWER BIT(15) +#define MGMT_ADV_PARAM_SCAN_RSP BIT(16) #define MGMT_ADV_FLAG_SEC_MASK (MGMT_ADV_FLAG_SEC_1M | MGMT_ADV_FLAG_SEC_2M | \ MGMT_ADV_FLAG_SEC_CODED) diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 75a42178c82d..d7ee11ef70d3 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -2180,7 +2180,8 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); - } else if (adv_instance_is_scannable(hdev, instance)) { + } else if (adv_instance_is_scannable(hdev, instance) || + (flags & MGMT_ADV_PARAM_SCAN_RSP)) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); else diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 74971b4bd457..90334ac4a135 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7432,6 +7432,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) flags |= MGMT_ADV_PARAM_TIMEOUT; flags |= MGMT_ADV_PARAM_INTERVALS; flags |= MGMT_ADV_PARAM_TX_POWER; + flags |= MGMT_ADV_PARAM_SCAN_RSP; /* In extended adv TX_POWER returned from Set Adv Param * will be always valid. 
-- cgit v1.2.3 From 6ed6e1c761f6c8391af654facbbbf1748ae9f386 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 1 Mar 2021 10:48:05 -0800 Subject: skmsg: Add function doc for skb->_sk_redir This should fix the following warning: include/linux/skbuff.h:932: warning: Function parameter or member '_sk_redir' not described in 'sk_buff' Reported-by: Lorenz Bauer Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210301184805.8174-1-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bd84f799c952..0503c917d773 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -656,6 +656,7 @@ typedef unsigned char *sk_buff_data_t; * @protocol: Packet protocol from driver * @destructor: Destruct function * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) + * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on -- cgit v1.2.3 From 8fd886911a6a99acf4a8facf619a2e7b5225be78 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 26 Feb 2021 21:22:47 +0100 Subject: bpf: Add BTF_KIND_FLOAT to uapi Add a new kind value and expand the kind bitfield. 
Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210226202256.116518-2-iii@linux.ibm.com --- include/uapi/linux/btf.h | 5 +++-- tools/include/uapi/linux/btf.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index 5a667107ad2c..d27b1708efe9 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -52,7 +52,7 @@ struct btf_type { }; }; -#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) @@ -72,7 +72,8 @@ struct btf_type { #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ #define BTF_KIND_VAR 14 /* Variable */ #define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_MAX BTF_KIND_DATASEC +#define BTF_KIND_FLOAT 16 /* Floating point */ +#define BTF_KIND_MAX BTF_KIND_FLOAT #define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately -- cgit v1.2.3 From 
7799e4d9d84f6f8231dfd9dca4da5f4b2f0aa932 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:33 -0800 Subject: bpf: Import syscall arg documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These descriptions are present in the man-pages project from the original submissions around 2015-2016. Import them so that they can be kept up to date as developers extend the bpf syscall commands. These descriptions follow the pattern used by scripts/bpf_helpers_doc.py so that we can take advantage of the parser to generate more up-to-date man page writing based upon these headers. Some minor wording adjustments were made to make the descriptions more consistent for the description / return format. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-2-joe@cilium.io Co-authored-by: Alexei Starovoitov Co-authored-by: Michael Kerrisk --- include/uapi/linux/bpf.h | 122 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b89af20cfa19..fb16c590e6d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -93,7 +93,127 @@ union bpf_iter_link_info { } map; }; -/* BPF syscall commands, see bpf(2) man-page for details. */ +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. 
+ */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * For example, after **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. In addition, file descriptors + * referring to eBPF objects can be transferred over UNIX domain sockets. + * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. An eBPF object is + * deallocated only after all file descriptors referring to the object + * have been closed. 
+ */ enum bpf_cmd { BPF_MAP_CREATE, BPF_MAP_LOOKUP_ELEM, -- cgit v1.2.3 From f67c9cbf6c581468f6c7144d497565cfc7918c31 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:34 -0800 Subject: bpf: Add minimal bpf() command documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce high-level descriptions of the intent and return codes of the bpf() syscall commands. Subsequent patches may further flesh out the content to provide a more useful programming reference. Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-3-joe@cilium.io --- include/uapi/linux/bpf.h | 368 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 368 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fb16c590e6d9..052bbfe65f77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -204,6 +204,374 @@ union bpf_iter_link_info { * A new file descriptor (a nonnegative integer), or -1 if an * error occurred (in which case, *errno* is set appropriately). * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run an eBPF program a number of times against a provided + * program context and return the modified program context and + * duration of the test run. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. 
+ * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will detach the program from the + * tracepoint (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata.
+ * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. 
+ * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * + * BPF_MAP_UPDATE_BATCH + * Description + * Iterate and update multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_DELETE_BATCH + * Description + * Iterate and delete multiple elements in a map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. 
+ * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. 
+ * * NOTES * eBPF objects (maps and programs) can be shared between processes. * For example, after **fork**\ (2), the child inherits file descriptors -- cgit v1.2.3 From 6690523bccb3e44cfcc4b2c995767e6814046e34 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:35 -0800 Subject: bpf: Document BPF_F_LOCK in syscall commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the meaning of the BPF_F_LOCK flag for the map lookup/update descriptions. Based on commit 96049f3afd50 ("bpf: introduce BPF_F_LOCK flag"). Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-4-joe@cilium.io --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 052bbfe65f77..eb9f059f0569 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -123,6 +123,14 @@ union bpf_iter_link_info { * Look up an element with a given *key* in the map referred to * by the file descriptor *map_fd*. * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -140,6 +148,8 @@ union bpf_iter_link_info { * Create a new element only if it did not exist. * **BPF_EXIST** * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. * * Return * Returns zero on success. 
On error, -1 is returned and *errno* -- cgit v1.2.3 From 8aacb3c8d1a32b23c82645051bba55f0ae6c103b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:36 -0800 Subject: bpf: Document BPF_PROG_PIN syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b2197755b263 ("bpf: add support for persistent maps/progs") contains the original implementation and git logs, used as reference for this documentation. Also pull in the filename restriction as documented in commit 6d8cb045cde6 ("bpf: comment why dots in filenames under BPF virtual FS are not allowed") Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-5-joe@cilium.io --- include/uapi/linux/bpf.h | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb9f059f0569..6946dde90c56 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -219,6 +219,22 @@ union bpf_iter_link_info { * Pin an eBPF program or map referred by the specified *bpf_fd* * to the provided *pathname* on the filesystem. * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allows the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES).
+ * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -584,13 +600,19 @@ union bpf_iter_link_info { * * NOTES * eBPF objects (maps and programs) can be shared between processes. - * For example, after **fork**\ (2), the child inherits file descriptors - * referring to the same eBPF objects. In addition, file descriptors - * referring to eBPF objects can be transferred over UNIX domain sockets. - * File descriptors referring to eBPF objects can be duplicated in the - * usual way, using **dup**\ (2) and similar calls. An eBPF object is - * deallocated only after all file descriptors referring to the object - * have been closed. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). 
*/ enum bpf_cmd { BPF_MAP_CREATE, -- cgit v1.2.3 From 32e76b187a90de5809d68c2ef3e3964176dacaf0 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:37 -0800 Subject: bpf: Document BPF_PROG_ATTACH syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the prog attach command in more detail, based on git commits: * commit f4324551489e ("bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands") * commit 4f738adba30a ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") * commit f4364dcfc86d ("media: rc: introduce BPF_PROG_LIRC_MODE2") * commit d58e468b1112 ("flow_dissector: implements flow dissector BPF hook") Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-6-joe@cilium.io --- include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6946dde90c56..a8f2964ec885 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -253,6 +253,43 @@ union bpf_iter_link_info { * Attach an eBPF program to a *target_fd* at the specified * *attach_type* hook. * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. 
+ * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. -- cgit v1.2.3 From 2a3fdca4e3bc7a01316277ba26f4090c4b19bf7c Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:38 -0800 Subject: bpf: Document BPF_PROG_TEST_RUN syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on a brief read of the corresponding source code. 
Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-7-joe@cilium.io --- include/uapi/linux/bpf.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a8f2964ec885..a6cd6650e23d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -306,14 +306,22 @@ union bpf_iter_link_info { * * BPF_PROG_TEST_RUN * Description - * Run an eBPF program a number of times against a provided - * program context and return the modified program context and - * duration of the test run. + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * * BPF_PROG_GET_NEXT_ID * Description * Fetch the next eBPF program currently loaded into the kernel. -- cgit v1.2.3 From 5d999994e05d62d4f53059540652014cf83cddfe Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:39 -0800 Subject: bpf: Document BPF_PROG_QUERY syscall command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 468e2f64d220 ("bpf: introduce BPF_PROG_QUERY command") originally introduced this, but there have been several additions since then. Unlike BPF_PROG_ATTACH, it appears that the sockmap progs are not able to be queried so far. 
Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-8-joe@cilium.io --- include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a6cd6650e23d..0cf92ef011f1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -389,6 +389,43 @@ union bpf_iter_link_info { * Obtain information about eBPF programs associated with the * specified *attach_type* hook. * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. 
+ * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. -- cgit v1.2.3 From 0cb804547927c05f6aa7e28c8d4a1e02fec1a6d4 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:40 -0800 Subject: bpf: Document BPF_MAP_*_BATCH syscall commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based roughly on the following commits: * Commit cb4d03ab499d ("bpf: Add generic support for lookup batch op") * Commit 057996380a42 ("bpf: Add batch ops to all htab bpf map") * Commit aa2e93b8e58e ("bpf: Add generic support for update and delete batch ops") Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Acked-by: Brian Vazquez Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210302171947.2268128-9-joe@cilium.io --- include/uapi/linux/bpf.h | 114 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0cf92ef011f1..c8b9d19fce22 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -553,13 +553,55 @@ union bpf_iter_link_info { * Description * Iterate and fetch multiple elements in a map. * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. 
+ * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * * BPF_MAP_LOOKUP_AND_DELETE_BATCH * Description - * Iterate and delete multiple elements in a map. + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. * * Return * Returns zero on success. On error, -1 is returned and *errno* @@ -567,15 +609,81 @@ union bpf_iter_link_info { * * BPF_MAP_UPDATE_BATCH * Description - * Iterate and update multiple elements in a map. + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. 
+ * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * * BPF_MAP_DELETE_BATCH * Description - * Iterate and delete multiple elements in a map. + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed.
+ * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are deleted. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may have been + * deleted. * * Return * Returns zero on success. On error, -1 is returned and *errno* -- cgit v1.2.3 From 923a932c982fd71856f80dbeaaa3ca41a75e89e0 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Mar 2021 09:19:41 -0800 Subject: scripts/bpf: Abstract eBPF API target parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Abstract out the target parameter so that upcoming commits, more than just the existing "helpers" target can be called to generate specific portions of docs from the eBPF UAPI headers.
Signed-off-by: Joe Stringer Signed-off-by: Alexei Starovoitov Reviewed-by: Quentin Monnet Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210302171947.2268128-10-joe@cilium.io --- MAINTAINERS | 1 + include/uapi/linux/bpf.h | 2 +- scripts/bpf_doc.py | 650 +++++++++++++++++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 615 -------------------------------------- tools/bpf/Makefile.helpers | 2 +- tools/include/uapi/linux/bpf.h | 2 +- tools/lib/bpf/Makefile | 2 +- tools/perf/MANIFEST | 2 +- 8 files changed, 656 insertions(+), 620 deletions(-) create mode 100755 scripts/bpf_doc.py delete mode 100755 scripts/bpf_helpers_doc.py (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index a50a543e3c81..8d56c7044067 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3223,6 +3223,7 @@ F: net/core/filter.c F: net/sched/act_bpf.c F: net/sched/cls_bpf.c F: samples/bpf/ +F: scripts/bpf_doc.py F: tools/bpf/ F: tools/lib/bpf/ F: tools/testing/selftests/bpf/ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c8b9d19fce22..63a56ed6a785 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1439,7 +1439,7 @@ union bpf_attr { * parsed and used to produce a manual page. The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py new file mode 100755 index 000000000000..5a4f68aab335 --- /dev/null +++ b/scripts/bpf_doc.py @@ -0,0 +1,650 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-only +# +# Copyright (C) 2018-2019 Netronome Systems, Inc. +# Copyright (C) 2021 Isovalent, Inc. + +# In case user attempts to run with Python 2. 
+from __future__ import print_function + +import argparse +import re +import sys, os + +class NoHelperFound(BaseException): + pass + +class ParsingError(BaseException): + def __init__(self, line='', reader=None): + if reader: + BaseException.__init__(self, + 'Error at file offset %d, parsing line: %s' % + (reader.tell(), line)) + else: + BaseException.__init__(self, 'Error parsing line: %s' % line) + +class Helper(object): + """ + An object representing the description of an eBPF helper function. + @proto: function prototype of the helper function + @desc: textual description of the helper function + @ret: description of the return value of the helper function + """ + def __init__(self, proto='', desc='', ret=''): + self.proto = proto + self.desc = desc + self.ret = ret + + def proto_break_down(self): + """ + Break down helper function protocol into smaller chunks: return type, + name, distincts arguments. + """ + arg_re = re.compile('((\w+ )*?(\w+|...))( (\**)(\w+))?$') + res = {} + proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') + + capture = proto_re.match(self.proto) + res['ret_type'] = capture.group(1) + res['ret_star'] = capture.group(2) + res['name'] = capture.group(3) + res['args'] = [] + + args = capture.group(4).split(', ') + for a in args: + capture = arg_re.match(a) + res['args'].append({ + 'type' : capture.group(1), + 'star' : capture.group(5), + 'name' : capture.group(6) + }) + + return res + +class HeaderParser(object): + """ + An object used to parse a file in order to extract the documentation of a + list of eBPF helper functions. All the helpers that can be retrieved are + stored as Helper object, in the self.helpers() array. 
+ @filename: name of file to parse, usually include/uapi/linux/bpf.h in the + kernel tree + """ + def __init__(self, filename): + self.reader = open(filename, 'r') + self.line = '' + self.helpers = [] + + def parse_helper(self): + proto = self.parse_proto() + desc = self.parse_desc() + ret = self.parse_ret() + return Helper(proto=proto, desc=desc, ret=ret) + + def parse_proto(self): + # Argument can be of shape: + # - "void" + # - "type name" + # - "type *name" + # - Same as above, with "const" and/or "struct" in front of type + # - "..." (undefined number of arguments, for bpf_trace_printk()) + # There is at least one term ("void"), and at most five arguments. + p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') + capture = p.match(self.line) + if not capture: + raise NoHelperFound + self.line = self.reader.readline() + return capture.group(1) + + def parse_desc(self): + p = re.compile(' \* ?(?:\t| {5,8})Description$') + capture = p.match(self.line) + if not capture: + # Helper can have empty description and we might be parsing another + # attribute: return but do not consume. + return '' + # Description can be several lines, some of them possibly empty, and it + # stops when another subsection title is met. + desc = '' + while True: + self.line = self.reader.readline() + if self.line == ' *\n': + desc += '\n' + else: + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') + capture = p.match(self.line) + if capture: + desc += capture.group(1) + '\n' + else: + break + return desc + + def parse_ret(self): + p = re.compile(' \* ?(?:\t| {5,8})Return$') + capture = p.match(self.line) + if not capture: + # Helper can have empty retval and we might be parsing another + # attribute: return but do not consume. + return '' + # Return value description can be several lines, some of them possibly + # empty, and it stops when another subsection title is met. 
+ ret = '' + while True: + self.line = self.reader.readline() + if self.line == ' *\n': + ret += '\n' + else: + p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') + capture = p.match(self.line) + if capture: + ret += capture.group(1) + '\n' + else: + break + return ret + + def run(self): + # Advance to start of helper function descriptions. + offset = self.reader.read().find('* Start of BPF helper function descriptions:') + if offset == -1: + raise Exception('Could not find start of eBPF helper descriptions list') + self.reader.seek(offset) + self.reader.readline() + self.reader.readline() + self.line = self.reader.readline() + + while True: + try: + helper = self.parse_helper() + self.helpers.append(helper) + except NoHelperFound: + break + + self.reader.close() + +############################################################################### + +class Printer(object): + """ + A generic class for printers. Printers should be created with an array of + Helper objects, and implement a way to print them in the desired fashion. + @parser: A HeaderParser with objects to print to standard output + """ + def __init__(self, parser): + self.parser = parser + self.elements = [] + + def print_header(self): + pass + + def print_footer(self): + pass + + def print_one(self, helper): + pass + + def print_all(self): + self.print_header() + for elem in self.elements: + self.print_one(elem) + self.print_footer() + + +class PrinterRST(Printer): + """ + A generic class for printers that print ReStructured Text. Printers should + be created with a HeaderParser object, and implement a way to print API + elements in the desired fashion. + @parser: A HeaderParser with objects to print to standard output + """ + def __init__(self, parser): + self.parser = parser + + def print_license(self): + license = '''\ +.. Copyright (C) All BPF authors and contributors from 2014 to present. +.. See git log include/uapi/linux/bpf.h in kernel tree for details. +.. +.. %%%LICENSE_START(VERBATIM) +.. 
Permission is granted to make and distribute verbatim copies of this +.. manual provided the copyright notice and this permission notice are +.. preserved on all copies. +.. +.. Permission is granted to copy and distribute modified versions of this +.. manual under the conditions for verbatim copying, provided that the +.. entire resulting derived work is distributed under the terms of a +.. permission notice identical to this one. +.. +.. Since the Linux kernel and libraries are constantly changing, this +.. manual page may be incorrect or out-of-date. The author(s) assume no +.. responsibility for errors or omissions, or for damages resulting from +.. the use of the information contained herein. The author(s) may not +.. have taken the same level of care in the production of this manual, +.. which is licensed free of charge, as they might when working +.. professionally. +.. +.. Formatted or processed versions of this manual, if unaccompanied by +.. the source, must acknowledge the copyright and authors of this work. +.. %%%LICENSE_END +.. +.. Please do not edit this file. It was generated from the documentation +.. located in file include/uapi/linux/bpf.h of the Linux kernel sources +.. (helpers description), and from scripts/bpf_doc.py in the same +.. repository (header and footer). +''' + print(license) + + def print_elem(self, elem): + if (elem.desc): + print('\tDescription') + # Do not strip all newline characters: formatted code at the end of + # a section must be followed by a blank line. 
+ for line in re.sub('\n$', '', elem.desc, count=1).split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + if (elem.ret): + print('\tReturn') + for line in elem.ret.rstrip().split('\n'): + print('{}{}'.format('\t\t' if line else '', line)) + + print('') + + +class PrinterHelpersRST(PrinterRST): + """ + A printer for dumping collected information about helpers as a ReStructured + Text page compatible with the rst2man program, which can be used to + generate a manual page for the helpers. + @parser: A HeaderParser with Helper objects to print to standard output + """ + def __init__(self, parser): + self.elements = parser.helpers + + def print_header(self): + header = '''\ +=========== +BPF-HELPERS +=========== +------------------------------------------------------------------------------- +list of eBPF helper functions +------------------------------------------------------------------------------- + +:Manual section: 7 + +DESCRIPTION +=========== + +The extended Berkeley Packet Filter (eBPF) subsystem consists in programs +written in a pseudo-assembly language, then attached to one of the several +kernel hooks and run in reaction of specific events. This framework differs +from the older, "classic" BPF (or "cBPF") in several aspects, one of them being +the ability to call special functions (or "helpers") from within a program. +These functions are restricted to a white-list of helpers defined in the +kernel. + +These helpers are used by eBPF programs to interact with the system, or with +the context in which they work. For instance, they can be used to print +debugging messages, to get the time since the system was booted, to interact +with eBPF maps, or to manipulate network packets. Since there are several eBPF +program types, and that they do not run in the same context, each program type +can only call a subset of those helpers. + +Due to eBPF conventions, a helper can not have more than five arguments. 
+ +Internally, eBPF programs call directly into the compiled helper functions +without requiring any foreign-function interface. As a result, calling helpers +introduces no overhead, thus offering excellent performance. + +This document is an attempt to list and document the helpers available to eBPF +developers. They are sorted by chronological order (the oldest helpers in the +kernel at the top). + +HELPERS +======= +''' + PrinterRST.print_license(self) + print(header) + + def print_footer(self): + footer = ''' +EXAMPLES +======== + +Example usage for most of the eBPF helpers listed in this manual page are +available within the Linux kernel sources, at the following locations: + +* *samples/bpf/* +* *tools/testing/selftests/bpf/* + +LICENSE +======= + +eBPF programs can have an associated license, passed along with the bytecode +instructions to the kernel when the programs are loaded. The format for that +string is identical to the one in use for kernel modules (Dual licenses, such +as "Dual BSD/GPL", may be used). Some helper functions are only accessible to +programs that are compatible with the GNU Privacy License (GPL). + +In order to use such helpers, the eBPF program must be loaded with the correct +license string passed (via **attr**) to the **bpf**\ () system call, and this +generally translates into the C source code of the program containing a line +similar to the following: + +:: + + char ____license[] __attribute__((section("license"), used)) = "GPL"; + +IMPLEMENTATION +============== + +This manual page is an effort to document the existing eBPF helper functions. +But as of this writing, the BPF sub-system is under heavy development. New eBPF +program or map types are added, along with new helper functions. Some helpers +are occasionally made available for additional program types. So in spite of +the efforts of the community, this page might not be up-to-date. 
If you want to +check by yourself what helper functions exist in your kernel, or what types of +programs they can support, here are some files among the kernel tree that you +may be interested in: + +* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list + of all helper functions, as well as many other BPF definitions including most + of the flags, structs or constants used by the helpers. +* *net/core/filter.c* contains the definition of most network-related helper + functions, and the list of program types from which they can be used. +* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related + helpers. +* *kernel/bpf/verifier.c* contains the functions used to check that valid types + of eBPF maps are used with a given helper function. +* *kernel/bpf/* directory contains other files in which additional helpers are + defined (for cgroups, sockmaps, etc.). +* The bpftool utility can be used to probe the availability of helper functions + on the system (as well as supported program and map types, and a number of + other parameters). To do so, run **bpftool feature probe** (see + **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to + list features available to unprivileged users. + +Compatibility between helper functions and program types can generally be found +in the files where helper functions are defined. Look for the **struct +bpf_func_proto** objects and for functions returning them: these functions +contain a list of helpers that a given program type can call. Note that the +**default:** label of the **switch ... case** used to filter helpers can call +other functions, themselves allowing access to additional helpers. The +requirement for GPL license is also in those **struct bpf_func_proto**. + +Compatibility between helper functions and map types can be found in the +**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*. 
+ +Helper functions that invalidate the checks on **data** and **data_end** +pointers for network processing are listed in function +**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*. + +SEE ALSO +======== + +**bpf**\ (2), +**bpftool**\ (8), +**cgroups**\ (7), +**ip**\ (8), +**perf_event_open**\ (2), +**sendmsg**\ (2), +**socket**\ (7), +**tc-bpf**\ (8)''' + print(footer) + + def print_proto(self, helper): + """ + Format function protocol with bold and italics markers. This makes RST + file less readable, but gives nice results in the manual page. + """ + proto = helper.proto_break_down() + + print('**%s %s%s(' % (proto['ret_type'], + proto['ret_star'].replace('*', '\\*'), + proto['name']), + end='') + + comma = '' + for a in proto['args']: + one_arg = '{}{}'.format(comma, a['type']) + if a['name']: + if a['star']: + one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*')) + else: + one_arg += '** ' + one_arg += '*{}*\\ **'.format(a['name']) + comma = ', ' + print(one_arg, end='') + + print(')**') + + def print_one(self, helper): + self.print_proto(helper) + self.print_elem(helper) + + + + +class PrinterHelpers(Printer): + """ + A printer for dumping collected information about helpers as C header to + be included from BPF program. 
+ @parser: A HeaderParser with Helper objects to print to standard output + """ + def __init__(self, parser): + self.elements = parser.helpers + + type_fwds = [ + 'struct bpf_fib_lookup', + 'struct bpf_sk_lookup', + 'struct bpf_perf_event_data', + 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', + 'struct bpf_sock', + 'struct bpf_sock_addr', + 'struct bpf_sock_ops', + 'struct bpf_sock_tuple', + 'struct bpf_spin_lock', + 'struct bpf_sysctl', + 'struct bpf_tcp_sock', + 'struct bpf_tunnel_key', + 'struct bpf_xfrm_state', + 'struct linux_binprm', + 'struct pt_regs', + 'struct sk_reuseport_md', + 'struct sockaddr', + 'struct tcphdr', + 'struct seq_file', + 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', + 'struct udp6_sock', + 'struct task_struct', + + 'struct __sk_buff', + 'struct sk_msg_md', + 'struct xdp_md', + 'struct path', + 'struct btf_ptr', + 'struct inode', + 'struct socket', + 'struct file', + ] + known_types = { + '...', + 'void', + 'const void', + 'char', + 'const char', + 'int', + 'long', + 'unsigned long', + + '__be16', + '__be32', + '__wsum', + + 'struct bpf_fib_lookup', + 'struct bpf_perf_event_data', + 'struct bpf_perf_event_value', + 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', + 'struct bpf_sk_lookup', + 'struct bpf_sock', + 'struct bpf_sock_addr', + 'struct bpf_sock_ops', + 'struct bpf_sock_tuple', + 'struct bpf_spin_lock', + 'struct bpf_sysctl', + 'struct bpf_tcp_sock', + 'struct bpf_tunnel_key', + 'struct bpf_xfrm_state', + 'struct linux_binprm', + 'struct pt_regs', + 'struct sk_reuseport_md', + 'struct sockaddr', + 'struct tcphdr', + 'struct seq_file', + 'struct tcp6_sock', + 'struct tcp_sock', + 'struct tcp_timewait_sock', + 'struct tcp_request_sock', + 'struct udp6_sock', + 'struct task_struct', + 'struct path', + 'struct btf_ptr', + 'struct inode', + 'struct socket', + 'struct file', + } + mapped_types = { + 'u8': '__u8', + 'u16': '__u16', + 'u32': 
'__u32', + 'u64': '__u64', + 's8': '__s8', + 's16': '__s16', + 's32': '__s32', + 's64': '__s64', + 'size_t': 'unsigned long', + 'struct bpf_map': 'void', + 'struct sk_buff': 'struct __sk_buff', + 'const struct sk_buff': 'const struct __sk_buff', + 'struct sk_msg_buff': 'struct sk_msg_md', + 'struct xdp_buff': 'struct xdp_md', + } + # Helpers overloaded for different context types. + overloaded_helpers = [ + 'bpf_get_socket_cookie', + 'bpf_sk_assign', + ] + + def print_header(self): + header = '''\ +/* This is auto-generated file. See bpf_doc.py for details. */ + +/* Forward declarations of BPF structs */''' + + print(header) + for fwd in self.type_fwds: + print('%s;' % fwd) + print('') + + def print_footer(self): + footer = '' + print(footer) + + def map_type(self, t): + if t in self.known_types: + return t + if t in self.mapped_types: + return self.mapped_types[t] + print("Unrecognized type '%s', please add it to known types!" % t, + file=sys.stderr) + sys.exit(1) + + seen_helpers = set() + + def print_one(self, helper): + proto = helper.proto_break_down() + + if proto['name'] in self.seen_helpers: + return + self.seen_helpers.add(proto['name']) + + print('/*') + print(" * %s" % proto['name']) + print(" *") + if (helper.desc): + # Do not strip all newline characters: formatted code at the end of + # a section must be followed by a blank line. 
+ for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): + print(' *{}{}'.format(' \t' if line else '', line)) + + if (helper.ret): + print(' *') + print(' * Returns') + for line in helper.ret.rstrip().split('\n'): + print(' *{}{}'.format(' \t' if line else '', line)) + + print(' */') + print('static %s %s(*%s)(' % (self.map_type(proto['ret_type']), + proto['ret_star'], proto['name']), end='') + comma = '' + for i, a in enumerate(proto['args']): + t = a['type'] + n = a['name'] + if proto['name'] in self.overloaded_helpers and i == 0: + t = 'void' + n = 'ctx' + one_arg = '{}{}'.format(comma, self.map_type(t)) + if n: + if a['star']: + one_arg += ' {}'.format(a['star']) + else: + one_arg += ' ' + one_arg += '{}'.format(n) + comma = ', ' + print(one_arg, end='') + + print(') = (void *) %d;' % len(self.seen_helpers)) + print('') + +############################################################################### + +# If script is launched from scripts/ from kernel tree and can access +# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse, +# otherwise the --filename argument will be required from the command line. +script = os.path.abspath(sys.argv[0]) +linuxRoot = os.path.dirname(os.path.dirname(script)) +bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') + +printers = { + 'helpers': PrinterHelpersRST, +} + +argParser = argparse.ArgumentParser(description=""" +Parse eBPF header file and generate documentation for the eBPF API. +The RST-formatted output produced can be turned into a manual page with the +rst2man utility. 
+""") +argParser.add_argument('--header', action='store_true', + help='generate C header file') +if (os.path.isfile(bpfh)): + argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h', + default=bpfh) +else: + argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h') +argParser.add_argument('target', nargs='?', default='helpers', + choices=printers.keys(), help='eBPF API target') +args = argParser.parse_args() + +# Parse file. +headerParser = HeaderParser(args.filename) +headerParser.run() + +# Print formatted output to standard output. +if args.header: + printer = PrinterHelpers(headerParser) +else: + printer = printers[args.target](headerParser) +printer.print_all() diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py deleted file mode 100755 index 867ada23281c..000000000000 --- a/scripts/bpf_helpers_doc.py +++ /dev/null @@ -1,615 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: GPL-2.0-only -# -# Copyright (C) 2018-2019 Netronome Systems, Inc. - -# In case user attempts to run with Python 2. -from __future__ import print_function - -import argparse -import re -import sys, os - -class NoHelperFound(BaseException): - pass - -class ParsingError(BaseException): - def __init__(self, line='', reader=None): - if reader: - BaseException.__init__(self, - 'Error at file offset %d, parsing line: %s' % - (reader.tell(), line)) - else: - BaseException.__init__(self, 'Error parsing line: %s' % line) - -class Helper(object): - """ - An object representing the description of an eBPF helper function. - @proto: function prototype of the helper function - @desc: textual description of the helper function - @ret: description of the return value of the helper function - """ - def __init__(self, proto='', desc='', ret=''): - self.proto = proto - self.desc = desc - self.ret = ret - - def proto_break_down(self): - """ - Break down helper function protocol into smaller chunks: return type, - name, distincts arguments. 
- """ - arg_re = re.compile('((\w+ )*?(\w+|...))( (\**)(\w+))?$') - res = {} - proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$') - - capture = proto_re.match(self.proto) - res['ret_type'] = capture.group(1) - res['ret_star'] = capture.group(2) - res['name'] = capture.group(3) - res['args'] = [] - - args = capture.group(4).split(', ') - for a in args: - capture = arg_re.match(a) - res['args'].append({ - 'type' : capture.group(1), - 'star' : capture.group(5), - 'name' : capture.group(6) - }) - - return res - -class HeaderParser(object): - """ - An object used to parse a file in order to extract the documentation of a - list of eBPF helper functions. All the helpers that can be retrieved are - stored as Helper object, in the self.helpers() array. - @filename: name of file to parse, usually include/uapi/linux/bpf.h in the - kernel tree - """ - def __init__(self, filename): - self.reader = open(filename, 'r') - self.line = '' - self.helpers = [] - - def parse_helper(self): - proto = self.parse_proto() - desc = self.parse_desc() - ret = self.parse_ret() - return Helper(proto=proto, desc=desc, ret=ret) - - def parse_proto(self): - # Argument can be of shape: - # - "void" - # - "type name" - # - "type *name" - # - Same as above, with "const" and/or "struct" in front of type - # - "..." (undefined number of arguments, for bpf_trace_printk()) - # There is at least one term ("void"), and at most five arguments. - p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$') - capture = p.match(self.line) - if not capture: - raise NoHelperFound - self.line = self.reader.readline() - return capture.group(1) - - def parse_desc(self): - p = re.compile(' \* ?(?:\t| {5,8})Description$') - capture = p.match(self.line) - if not capture: - # Helper can have empty description and we might be parsing another - # attribute: return but do not consume. 
- return '' - # Description can be several lines, some of them possibly empty, and it - # stops when another subsection title is met. - desc = '' - while True: - self.line = self.reader.readline() - if self.line == ' *\n': - desc += '\n' - else: - p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') - capture = p.match(self.line) - if capture: - desc += capture.group(1) + '\n' - else: - break - return desc - - def parse_ret(self): - p = re.compile(' \* ?(?:\t| {5,8})Return$') - capture = p.match(self.line) - if not capture: - # Helper can have empty retval and we might be parsing another - # attribute: return but do not consume. - return '' - # Return value description can be several lines, some of them possibly - # empty, and it stops when another subsection title is met. - ret = '' - while True: - self.line = self.reader.readline() - if self.line == ' *\n': - ret += '\n' - else: - p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)') - capture = p.match(self.line) - if capture: - ret += capture.group(1) + '\n' - else: - break - return ret - - def run(self): - # Advance to start of helper function descriptions. - offset = self.reader.read().find('* Start of BPF helper function descriptions:') - if offset == -1: - raise Exception('Could not find start of eBPF helper descriptions list') - self.reader.seek(offset) - self.reader.readline() - self.reader.readline() - self.line = self.reader.readline() - - while True: - try: - helper = self.parse_helper() - self.helpers.append(helper) - except NoHelperFound: - break - - self.reader.close() - -############################################################################### - -class Printer(object): - """ - A generic class for printers. Printers should be created with an array of - Helper objects, and implement a way to print them in the desired fashion. 
- @helpers: array of Helper objects to print to standard output - """ - def __init__(self, helpers): - self.helpers = helpers - - def print_header(self): - pass - - def print_footer(self): - pass - - def print_one(self, helper): - pass - - def print_all(self): - self.print_header() - for helper in self.helpers: - self.print_one(helper) - self.print_footer() - -class PrinterRST(Printer): - """ - A printer for dumping collected information about helpers as a ReStructured - Text page compatible with the rst2man program, which can be used to - generate a manual page for the helpers. - @helpers: array of Helper objects to print to standard output - """ - def print_header(self): - header = '''\ -.. Copyright (C) All BPF authors and contributors from 2014 to present. -.. See git log include/uapi/linux/bpf.h in kernel tree for details. -.. -.. %%%LICENSE_START(VERBATIM) -.. Permission is granted to make and distribute verbatim copies of this -.. manual provided the copyright notice and this permission notice are -.. preserved on all copies. -.. -.. Permission is granted to copy and distribute modified versions of this -.. manual under the conditions for verbatim copying, provided that the -.. entire resulting derived work is distributed under the terms of a -.. permission notice identical to this one. -.. -.. Since the Linux kernel and libraries are constantly changing, this -.. manual page may be incorrect or out-of-date. The author(s) assume no -.. responsibility for errors or omissions, or for damages resulting from -.. the use of the information contained herein. The author(s) may not -.. have taken the same level of care in the production of this manual, -.. which is licensed free of charge, as they might when working -.. professionally. -.. -.. Formatted or processed versions of this manual, if unaccompanied by -.. the source, must acknowledge the copyright and authors of this work. -.. %%%LICENSE_END -.. -.. Please do not edit this file. 
It was generated from the documentation -.. located in file include/uapi/linux/bpf.h of the Linux kernel sources -.. (helpers description), and from scripts/bpf_helpers_doc.py in the same -.. repository (header and footer). - -=========== -BPF-HELPERS -=========== -------------------------------------------------------------------------------- -list of eBPF helper functions -------------------------------------------------------------------------------- - -:Manual section: 7 - -DESCRIPTION -=========== - -The extended Berkeley Packet Filter (eBPF) subsystem consists in programs -written in a pseudo-assembly language, then attached to one of the several -kernel hooks and run in reaction of specific events. This framework differs -from the older, "classic" BPF (or "cBPF") in several aspects, one of them being -the ability to call special functions (or "helpers") from within a program. -These functions are restricted to a white-list of helpers defined in the -kernel. - -These helpers are used by eBPF programs to interact with the system, or with -the context in which they work. For instance, they can be used to print -debugging messages, to get the time since the system was booted, to interact -with eBPF maps, or to manipulate network packets. Since there are several eBPF -program types, and that they do not run in the same context, each program type -can only call a subset of those helpers. - -Due to eBPF conventions, a helper can not have more than five arguments. - -Internally, eBPF programs call directly into the compiled helper functions -without requiring any foreign-function interface. As a result, calling helpers -introduces no overhead, thus offering excellent performance. - -This document is an attempt to list and document the helpers available to eBPF -developers. They are sorted by chronological order (the oldest helpers in the -kernel at the top). 
- -HELPERS -======= -''' - print(header) - - def print_footer(self): - footer = ''' -EXAMPLES -======== - -Example usage for most of the eBPF helpers listed in this manual page are -available within the Linux kernel sources, at the following locations: - -* *samples/bpf/* -* *tools/testing/selftests/bpf/* - -LICENSE -======= - -eBPF programs can have an associated license, passed along with the bytecode -instructions to the kernel when the programs are loaded. The format for that -string is identical to the one in use for kernel modules (Dual licenses, such -as "Dual BSD/GPL", may be used). Some helper functions are only accessible to -programs that are compatible with the GNU Privacy License (GPL). - -In order to use such helpers, the eBPF program must be loaded with the correct -license string passed (via **attr**) to the **bpf**\ () system call, and this -generally translates into the C source code of the program containing a line -similar to the following: - -:: - - char ____license[] __attribute__((section("license"), used)) = "GPL"; - -IMPLEMENTATION -============== - -This manual page is an effort to document the existing eBPF helper functions. -But as of this writing, the BPF sub-system is under heavy development. New eBPF -program or map types are added, along with new helper functions. Some helpers -are occasionally made available for additional program types. So in spite of -the efforts of the community, this page might not be up-to-date. If you want to -check by yourself what helper functions exist in your kernel, or what types of -programs they can support, here are some files among the kernel tree that you -may be interested in: - -* *include/uapi/linux/bpf.h* is the main BPF header. It contains the full list - of all helper functions, as well as many other BPF definitions including most - of the flags, structs or constants used by the helpers. 
-* *net/core/filter.c* contains the definition of most network-related helper - functions, and the list of program types from which they can be used. -* *kernel/trace/bpf_trace.c* is the equivalent for most tracing program-related - helpers. -* *kernel/bpf/verifier.c* contains the functions used to check that valid types - of eBPF maps are used with a given helper function. -* *kernel/bpf/* directory contains other files in which additional helpers are - defined (for cgroups, sockmaps, etc.). -* The bpftool utility can be used to probe the availability of helper functions - on the system (as well as supported program and map types, and a number of - other parameters). To do so, run **bpftool feature probe** (see - **bpftool-feature**\ (8) for details). Add the **unprivileged** keyword to - list features available to unprivileged users. - -Compatibility between helper functions and program types can generally be found -in the files where helper functions are defined. Look for the **struct -bpf_func_proto** objects and for functions returning them: these functions -contain a list of helpers that a given program type can call. Note that the -**default:** label of the **switch ... case** used to filter helpers can call -other functions, themselves allowing access to additional helpers. The -requirement for GPL license is also in those **struct bpf_func_proto**. - -Compatibility between helper functions and map types can be found in the -**check_map_func_compatibility**\ () function in file *kernel/bpf/verifier.c*. - -Helper functions that invalidate the checks on **data** and **data_end** -pointers for network processing are listed in function -**bpf_helper_changes_pkt_data**\ () in file *net/core/filter.c*. 
- -SEE ALSO -======== - -**bpf**\ (2), -**bpftool**\ (8), -**cgroups**\ (7), -**ip**\ (8), -**perf_event_open**\ (2), -**sendmsg**\ (2), -**socket**\ (7), -**tc-bpf**\ (8)''' - print(footer) - - def print_proto(self, helper): - """ - Format function protocol with bold and italics markers. This makes RST - file less readable, but gives nice results in the manual page. - """ - proto = helper.proto_break_down() - - print('**%s %s%s(' % (proto['ret_type'], - proto['ret_star'].replace('*', '\\*'), - proto['name']), - end='') - - comma = '' - for a in proto['args']: - one_arg = '{}{}'.format(comma, a['type']) - if a['name']: - if a['star']: - one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*')) - else: - one_arg += '** ' - one_arg += '*{}*\\ **'.format(a['name']) - comma = ', ' - print(one_arg, end='') - - print(')**') - - def print_one(self, helper): - self.print_proto(helper) - - if (helper.desc): - print('\tDescription') - # Do not strip all newline characters: formatted code at the end of - # a section must be followed by a blank line. - for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) - - if (helper.ret): - print('\tReturn') - for line in helper.ret.rstrip().split('\n'): - print('{}{}'.format('\t\t' if line else '', line)) - - print('') - -class PrinterHelpers(Printer): - """ - A printer for dumping collected information about helpers as C header to - be included from BPF program. 
- @helpers: array of Helper objects to print to standard output - """ - - type_fwds = [ - 'struct bpf_fib_lookup', - 'struct bpf_sk_lookup', - 'struct bpf_perf_event_data', - 'struct bpf_perf_event_value', - 'struct bpf_pidns_info', - 'struct bpf_redir_neigh', - 'struct bpf_sock', - 'struct bpf_sock_addr', - 'struct bpf_sock_ops', - 'struct bpf_sock_tuple', - 'struct bpf_spin_lock', - 'struct bpf_sysctl', - 'struct bpf_tcp_sock', - 'struct bpf_tunnel_key', - 'struct bpf_xfrm_state', - 'struct linux_binprm', - 'struct pt_regs', - 'struct sk_reuseport_md', - 'struct sockaddr', - 'struct tcphdr', - 'struct seq_file', - 'struct tcp6_sock', - 'struct tcp_sock', - 'struct tcp_timewait_sock', - 'struct tcp_request_sock', - 'struct udp6_sock', - 'struct task_struct', - - 'struct __sk_buff', - 'struct sk_msg_md', - 'struct xdp_md', - 'struct path', - 'struct btf_ptr', - 'struct inode', - 'struct socket', - 'struct file', - ] - known_types = { - '...', - 'void', - 'const void', - 'char', - 'const char', - 'int', - 'long', - 'unsigned long', - - '__be16', - '__be32', - '__wsum', - - 'struct bpf_fib_lookup', - 'struct bpf_perf_event_data', - 'struct bpf_perf_event_value', - 'struct bpf_pidns_info', - 'struct bpf_redir_neigh', - 'struct bpf_sk_lookup', - 'struct bpf_sock', - 'struct bpf_sock_addr', - 'struct bpf_sock_ops', - 'struct bpf_sock_tuple', - 'struct bpf_spin_lock', - 'struct bpf_sysctl', - 'struct bpf_tcp_sock', - 'struct bpf_tunnel_key', - 'struct bpf_xfrm_state', - 'struct linux_binprm', - 'struct pt_regs', - 'struct sk_reuseport_md', - 'struct sockaddr', - 'struct tcphdr', - 'struct seq_file', - 'struct tcp6_sock', - 'struct tcp_sock', - 'struct tcp_timewait_sock', - 'struct tcp_request_sock', - 'struct udp6_sock', - 'struct task_struct', - 'struct path', - 'struct btf_ptr', - 'struct inode', - 'struct socket', - 'struct file', - } - mapped_types = { - 'u8': '__u8', - 'u16': '__u16', - 'u32': '__u32', - 'u64': '__u64', - 's8': '__s8', - 's16': '__s16', - 's32': 
'__s32', - 's64': '__s64', - 'size_t': 'unsigned long', - 'struct bpf_map': 'void', - 'struct sk_buff': 'struct __sk_buff', - 'const struct sk_buff': 'const struct __sk_buff', - 'struct sk_msg_buff': 'struct sk_msg_md', - 'struct xdp_buff': 'struct xdp_md', - } - # Helpers overloaded for different context types. - overloaded_helpers = [ - 'bpf_get_socket_cookie', - 'bpf_sk_assign', - ] - - def print_header(self): - header = '''\ -/* This is auto-generated file. See bpf_helpers_doc.py for details. */ - -/* Forward declarations of BPF structs */''' - - print(header) - for fwd in self.type_fwds: - print('%s;' % fwd) - print('') - - def print_footer(self): - footer = '' - print(footer) - - def map_type(self, t): - if t in self.known_types: - return t - if t in self.mapped_types: - return self.mapped_types[t] - print("Unrecognized type '%s', please add it to known types!" % t, - file=sys.stderr) - sys.exit(1) - - seen_helpers = set() - - def print_one(self, helper): - proto = helper.proto_break_down() - - if proto['name'] in self.seen_helpers: - return - self.seen_helpers.add(proto['name']) - - print('/*') - print(" * %s" % proto['name']) - print(" *") - if (helper.desc): - # Do not strip all newline characters: formatted code at the end of - # a section must be followed by a blank line. 
- for line in re.sub('\n$', '', helper.desc, count=1).split('\n'): - print(' *{}{}'.format(' \t' if line else '', line)) - - if (helper.ret): - print(' *') - print(' * Returns') - for line in helper.ret.rstrip().split('\n'): - print(' *{}{}'.format(' \t' if line else '', line)) - - print(' */') - print('static %s %s(*%s)(' % (self.map_type(proto['ret_type']), - proto['ret_star'], proto['name']), end='') - comma = '' - for i, a in enumerate(proto['args']): - t = a['type'] - n = a['name'] - if proto['name'] in self.overloaded_helpers and i == 0: - t = 'void' - n = 'ctx' - one_arg = '{}{}'.format(comma, self.map_type(t)) - if n: - if a['star']: - one_arg += ' {}'.format(a['star']) - else: - one_arg += ' ' - one_arg += '{}'.format(n) - comma = ', ' - print(one_arg, end='') - - print(') = (void *) %d;' % len(self.seen_helpers)) - print('') - -############################################################################### - -# If script is launched from scripts/ from kernel tree and can access -# ../include/uapi/linux/bpf.h, use it as a default name for the file to parse, -# otherwise the --filename argument will be required from the command line. -script = os.path.abspath(sys.argv[0]) -linuxRoot = os.path.dirname(os.path.dirname(script)) -bpfh = os.path.join(linuxRoot, 'include/uapi/linux/bpf.h') - -argParser = argparse.ArgumentParser(description=""" -Parse eBPF header file and generate documentation for eBPF helper functions. -The RST-formatted output produced can be turned into a manual page with the -rst2man utility. -""") -argParser.add_argument('--header', action='store_true', - help='generate C header file') -if (os.path.isfile(bpfh)): - argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h', - default=bpfh) -else: - argParser.add_argument('--filename', help='path to include/uapi/linux/bpf.h') -args = argParser.parse_args() - -# Parse file. 
-headerParser = HeaderParser(args.filename) -headerParser.run() - -# Print formatted output to standard output. -if args.header: - printer = PrinterHelpers(headerParser.helpers) -else: - printer = PrinterRST(headerParser.helpers) -printer.print_all() diff --git a/tools/bpf/Makefile.helpers b/tools/bpf/Makefile.helpers index 854d084026dd..a26599022fd6 100644 --- a/tools/bpf/Makefile.helpers +++ b/tools/bpf/Makefile.helpers @@ -35,7 +35,7 @@ man7: $(DOC_MAN7) RST2MAN_DEP := $(shell command -v rst2man 2>/dev/null) $(OUTPUT)$(HELPERS_RST): $(UP2DIR)../../include/uapi/linux/bpf.h - $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_helpers_doc.py --filename $< > $@ + $(QUIET_GEN)$(UP2DIR)../../scripts/bpf_doc.py --filename $< > $@ $(OUTPUT)%.7: $(OUTPUT)%.rst ifndef RST2MAN_DEP diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b89af20cfa19..b4c5c529ad17 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -729,7 +729,7 @@ union bpf_attr { * parsed and used to produce a manual page. 
The workflow is the following, * and requires the rst2man utility: * - * $ ./scripts/bpf_helpers_doc.py \ + * $ ./scripts/bpf_doc.py \ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 * $ man /tmp/bpf-helpers.7 diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 887a494ad5fc..8170f88e8ea6 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -158,7 +158,7 @@ $(BPF_IN_STATIC): force $(BPF_HELPER_DEFS) $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR) $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h - $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \ + $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \ --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS) $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION) diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST index 5d7b947320fb..f05c4d48fd7e 100644 --- a/tools/perf/MANIFEST +++ b/tools/perf/MANIFEST @@ -20,4 +20,4 @@ tools/lib/bitmap.c tools/lib/str_error_r.c tools/lib/vsprintf.c tools/lib/zalloc.c -scripts/bpf_helpers_doc.py +scripts/bpf_doc.py -- cgit v1.2.3 From 7c32e8f8bc33a5f4b113a630857e46634e3e143b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:13 +0000 Subject: bpf: Add PROG_TEST_RUN support for sk_lookup programs Allow to pass sk_lookup programs to PROG_TEST_RUN. User space provides the full bpf_sk_lookup struct as context. Since the context includes a socket pointer that can't be exposed to user space we define that PROG_TEST_RUN returns the cookie of the selected socket or zero in place of the socket pointer. We don't support testing programs that select a reuseport socket, since this would mean running another (unrelated) BPF program from the sk_lookup test handler. 
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-3-lmb@cloudflare.com --- include/linux/bpf.h | 10 ++++ include/uapi/linux/bpf.h | 5 +- net/bpf/test_run.c | 105 +++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 1 + tools/include/uapi/linux/bpf.h | 5 +- 5 files changed, 124 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c730863fa77..c931bc97019d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1491,6 +1491,9 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1692,6 +1695,13 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, return -ENOTSUPP; } +static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
*/ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index eb3c78cd4d7c..0abdd67f44b1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -10,8 +10,10 @@ #include #include #include +#include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -781,3 +783,106 @@ out: kfree(data); return ret; } + +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_prog_array *progs = NULL; + struct bpf_sk_lookup_kern ctx = {}; + u32 repeat = kattr->test.repeat; + struct bpf_sk_lookup *user_ctx; + u32 retval, duration; + int ret = -EINVAL; + + if (prog->type != BPF_PROG_TYPE_SK_LOOKUP) + return -EINVAL; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || + kattr->test.data_size_out) + return -EINVAL; + + if (!repeat) + repeat = 1; + + user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); + if (IS_ERR(user_ctx)) + return PTR_ERR(user_ctx); + + if (!user_ctx) + return -EINVAL; + + if (user_ctx->sk) + goto out; + + if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) + goto out; + + if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + ret = -ERANGE; + goto out; + } + + ctx.family = (u16)user_ctx->family; + ctx.protocol = (u16)user_ctx->protocol; + ctx.dport = (u16)user_ctx->local_port; + ctx.sport = (__force __be16)user_ctx->remote_port; + + switch (ctx.family) { + case AF_INET: + ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; + ctx.v4.saddr 
= (__force __be32)user_ctx->remote_ip4; + break; + +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; + ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; + break; +#endif + + default: + ret = -EAFNOSUPPORT; + goto out; + } + + progs = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!progs) { + ret = -ENOMEM; + goto out; + } + + progs->items[0].prog = prog; + + bpf_test_timer_enter(&t); + do { + ctx.selected_sk = NULL; + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); + + if (ret < 0) + goto out; + + user_ctx->cookie = 0; + if (ctx.selected_sk) { + if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { + ret = -EOPNOTSUPP; + goto out; + } + + user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); + } + + ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); + +out: + bpf_prog_array_free(progs); + kfree(user_ctx); + return ret; +} diff --git a/net/core/filter.c b/net/core/filter.c index 13bcf248ee7b..a526db494c62 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10457,6 +10457,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, } const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 63a56ed6a785..7f530e349aff 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5953,7 +5953,10 @@ struct bpf_pidns_info { /* User accessible data for SK_LOOKUP programs. Add new fields at the end. 
*/ struct bpf_sk_lookup { - __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; __u32 family; /* Protocol family (AF_INET, AF_INET6) */ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ -- cgit v1.2.3 From d01b59c9ae94560fbcceaafeef39784d72765033 Mon Sep 17 00:00:00 2001 From: Xuesen Huang Date: Thu, 4 Mar 2021 14:40:46 +0800 Subject: bpf: Add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packets encapsulation. But that is not appropriate when pushing Ethernet header. Add an option to further specify encap L2 type and set the inner_protocol as ETH_P_TEB. Suggested-by: Willem de Bruijn Signed-off-by: Xuesen Huang Signed-off-by: Zhiyong Cheng Signed-off-by: Li Wang Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210304064046.6232-1-hxseverything@gmail.com --- include/uapi/linux/bpf.h | 5 +++++ net/core/filter.c | 11 ++++++++++- tools/include/uapi/linux/bpf.h | 5 +++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index a526db494c62..588b19ba0da8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3409,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK)) @@ -3445,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + if (skb->encapsulation) return -EALREADY; @@ -3463,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; - skb_set_inner_protocol(skb, skb->protocol); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7f530e349aff..2d3036e292a9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2484,6 +2484,10 @@ union bpf_attr { * Use with ENCAP_L3/L4 flags to further specify the tunnel * type; *len* is the length of the inner MAC header. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. 
+ * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -4916,6 +4920,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), }; enum { -- cgit v1.2.3 From e6a4750ffe9d701c4d55212b14b615e63571d235 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 8 Mar 2021 12:29:06 +0100 Subject: bpf, xdp: Make bpf_redirect_map() a map operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the bpf_redirect_map() implementation dispatches to the correct map-lookup function via a switch-statement. To avoid the dispatching, this change adds bpf_redirect_map() as a map operation. Each map provides its bpf_redirect_map() version, and correct function is automatically selected by the BPF verifier. A nice side-effect of the code movement is that the map lookup functions are now local to the map implementation files, which removes one additional function call. 
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210308112907.559576-2-bjorn.topel@gmail.com --- include/linux/bpf.h | 26 ++++++-------------------- include/linux/filter.h | 27 +++++++++++++++++++++++++++ include/net/xdp_sock.h | 19 ------------------- kernel/bpf/cpumap.c | 8 +++++++- kernel/bpf/devmap.c | 16 ++++++++++++++-- kernel/bpf/verifier.c | 13 +++++++++++-- net/core/filter.c | 39 +-------------------------------------- net/xdp/xskmap.c | 16 ++++++++++++++++ 8 files changed, 82 insertions(+), 82 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c931bc97019d..a25730eaa148 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -118,6 +118,9 @@ struct bpf_map_ops { void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); + /* Misc helpers.*/ + int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure * an inner map can be inserted to an outer map. 
@@ -1450,9 +1453,9 @@ struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ struct xdp_buff; struct sk_buff; +struct bpf_dtab_netdev; +struct bpf_cpu_map_entry; -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); void __dev_flush(void); int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1462,7 +1465,6 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); bool dev_map_can_have_prog(struct bpf_map *map); -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -1593,17 +1595,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - -static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} static inline bool dev_map_can_have_prog(struct bpf_map *map) { return false; @@ -1615,6 +1606,7 @@ static inline void __dev_flush(void) struct xdp_buff; struct bpf_dtab_netdev; +struct bpf_cpu_map_entry; static inline int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, @@ -1639,12 +1631,6 @@ static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, return 0; } -static inline -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) -{ - return NULL; -} - static inline void __cpu_map_flush(void) { } diff --git a/include/linux/filter.h b/include/linux/filter.h index 3b00fc906ccd..008691fd3b58 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1472,4 +1472,31 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* 
IS_ENABLED(CONFIG_IPV6) */ +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, u64 flags, + void *lookup_elem(struct bpf_map *map, u32 key)) +{ + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + /* Lower bits of the flags are used as return code on lookup failure */ + if (unlikely(flags > XDP_TX)) + return XDP_ABORTED; + + ri->tgt_value = lookup_elem(map, ifindex); + if (unlikely(!ri->tgt_value)) { + /* If the lookup fails we want to clear out the state in the + * redirect_info struct completely, so that if an eBPF program + * performs multiple lookups, the last one always takes + * precedence. + */ + WRITE_ONCE(ri->map, NULL); + return flags; + } + + ri->flags = flags; + ri->tgt_index = ifindex; + WRITE_ONCE(ri->map, map); + + return XDP_REDIRECT; +} + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index cc17bc957548..9c0722c6d7ac 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -80,19 +80,6 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - struct xsk_map *m = container_of(map, struct xsk_map, map); - struct xdp_sock *xs; - - if (key >= map->max_entries) - return NULL; - - xs = READ_ONCE(m->xsk_map[key]); - return xs; -} - #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -109,12 +96,6 @@ static inline void __xsk_map_flush(void) { } -static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, - u32 key) -{ - return NULL; -} - #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 5d1469de6921..7352d4160b7f 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -563,7 +563,7 @@ static void cpu_map_free(struct bpf_map *map) 
kfree(cmap); } -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -600,6 +600,11 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __cpu_map_lookup_elem); +} + static int cpu_map_btf_id; const struct bpf_map_ops cpu_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -612,6 +617,7 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_cpu_map", .map_btf_id = &cpu_map_btf_id, + .map_redirect = cpu_map_redirect, }; static void bq_flush_to_queue(struct xdp_bulk_queue *bq) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 85d9d1b72a33..f7f42448259f 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -258,7 +258,7 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); @@ -392,7 +392,7 @@ void __dev_flush(void) * update happens in parallel here a dev_put wont happen until after reading the * ifindex. 
*/ -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; @@ -735,6 +735,16 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } +static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_lookup_elem); +} + +static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __dev_map_hash_lookup_elem); +} + static int dev_map_btf_id; const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, @@ -747,6 +757,7 @@ const struct bpf_map_ops dev_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_btf_id, + .map_redirect = dev_map_redirect, }; static int dev_map_hash_map_btf_id; @@ -761,6 +772,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "bpf_dtab", .map_btf_id = &dev_map_hash_map_btf_id, + .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9fe90ce52a65..97eb0b2435b8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5582,7 +5582,8 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_push_elem && func_id != BPF_FUNC_map_pop_elem && func_id != BPF_FUNC_map_peek_elem && - func_id != BPF_FUNC_for_each_map_elem) + func_id != BPF_FUNC_for_each_map_elem && + func_id != BPF_FUNC_redirect_map) return 0; if (map == NULL) { @@ -12017,7 +12018,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->imm == BPF_FUNC_map_delete_elem || insn->imm == BPF_FUNC_map_push_elem || insn->imm 
== BPF_FUNC_map_pop_elem || - insn->imm == BPF_FUNC_map_peek_elem)) { + insn->imm == BPF_FUNC_map_peek_elem || + insn->imm == BPF_FUNC_redirect_map)) { aux = &env->insn_aux_data[i + delta]; if (bpf_map_ptr_poisoned(aux)) goto patch_call_imm; @@ -12059,6 +12061,9 @@ static int do_misc_fixups(struct bpf_verifier_env *env) (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); + BUILD_BUG_ON(!__same_type(ops->map_redirect, + (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + patch_map_ops_generic: switch (insn->imm) { case BPF_FUNC_map_lookup_elem: @@ -12085,6 +12090,10 @@ patch_map_ops_generic: insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - __bpf_call_base; continue; + case BPF_FUNC_redirect_map: + insn->imm = BPF_CAST_CALL(ops->map_redirect) - + __bpf_call_base; + continue; } goto patch_call_imm; diff --git a/net/core/filter.c b/net/core/filter.c index 588b19ba0da8..183b0aa6b027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3943,22 +3943,6 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - return __dev_map_lookup_elem(map, index); - case BPF_MAP_TYPE_DEVMAP_HASH: - return __dev_map_hash_lookup_elem(map, index); - case BPF_MAP_TYPE_CPUMAP: - return __cpu_map_lookup_elem(map, index); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_lookup_elem(map, index); - default: - return NULL; - } -} - void bpf_clear_redirect_map(struct bpf_map *map) { struct bpf_redirect_info *ri; @@ -4112,28 +4096,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - - /* Lower bits of the flags are used as return code on lookup failure */ - if (unlikely(flags > XDP_TX)) - 
return XDP_ABORTED; - - ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); - if (unlikely(!ri->tgt_value)) { - /* If the lookup fails we want to clear out the state in the - * redirect_info struct completely, so that if an eBPF program - * performs multiple lookups, the last one always takes - * precedence. - */ - WRITE_ONCE(ri->map, NULL); - return flags; - } - - ri->flags = flags; - ri->tgt_index = ifindex; - WRITE_ONCE(ri->map, map); - - return XDP_REDIRECT; + return map->ops->map_redirect(map, ifindex, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 113fd9017203..fbeb4870f798 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -125,6 +125,16 @@ static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } +static void *__xsk_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + if (key >= map->max_entries) + return NULL; + + return READ_ONCE(m->xsk_map[key]); +} + static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { WARN_ON_ONCE(!rcu_read_lock_held()); @@ -215,6 +225,11 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } +static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +{ + return __bpf_xdp_redirect_map(map, ifindex, flags, __xsk_map_lookup_elem); +} + void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, struct xdp_sock **map_entry) { @@ -247,4 +262,5 @@ const struct bpf_map_ops xsk_map_ops = { .map_check_btf = map_check_no_btf, .map_btf_name = "xsk_map", .map_btf_id = &xsk_map_btf_id, + .map_redirect = xsk_map_redirect, }; -- cgit v1.2.3 From ee75aef23afe6e88497151c127c13ed69f41aaa2 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 8 Mar 2021 12:29:07 +0100 Subject: bpf, xdp: Restructure redirect actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The XDP_REDIRECT implementations for maps and non-maps are fairly similar, but obviously need to take different code paths depending on if the target is using a map or not. Today, the redirect targets for XDP either use a map, or are based on ifindex. Here, the map type and id are added to bpf_redirect_info, instead of the actual map. Map type, map item/ifindex, and the map_id (if any) are passed to xdp_do_redirect(). For ifindex-based redirect, used by the bpf_redirect() XDP BPF helper, a special map type/id is used. Map type of UNSPEC together with map id equal to INT_MAX has the special meaning of an ifindex based redirect. Note that valid map ids are 1 inclusive, INT_MAX exclusive ([1,INT_MAX[). In addition to making the code easier to follow, using explicit type and id in bpf_redirect_info has a slight positive performance impact by avoiding a pointer indirection for the map type lookup, and instead using the cacheline for bpf_redirect_info. Since the actual map is not passed via bpf_redirect_info anymore, the map lookup is only done in the BPF helper. This means that the bpf_clear_redirect_map() function can be removed. The actual map item is RCU protected. The bpf_redirect_info flags member is not used by XDP, and not read/written any more. The map member is only written to when required/used, and not unconditionally. 
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Maciej Fijalkowski Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210308112907.559576-3-bjorn.topel@gmail.com --- include/linux/filter.h | 10 +-- include/trace/events/xdp.h | 62 ++++++++++------- kernel/bpf/cpumap.c | 1 - kernel/bpf/devmap.c | 1 - net/core/filter.c | 170 ++++++++++++++++++++------------------------- net/xdp/xskmap.c | 1 - 6 files changed, 116 insertions(+), 129 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 008691fd3b58..b2b85b2cad8e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -646,7 +646,8 @@ struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; - struct bpf_map *map; + u32 map_id; + enum bpf_map_type map_type; u32 kern_flags; struct bpf_nh_params nh; }; @@ -1488,13 +1489,14 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind * performs multiple lookups, the last one always takes * precedence. */ - WRITE_ONCE(ri->map, NULL); + ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; return flags; } - ri->flags = flags; ri->tgt_index = ifindex; - WRITE_ONCE(ri->map, map); + ri->map_id = map->id; + ri->map_type = map->map_type; return XDP_REDIRECT; } diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 76a97176ab81..fcad3645a70b 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -86,19 +86,15 @@ struct _bpf_dtab_netdev { }; #endif /* __DEVMAP_OBJ_TYPE */ -#define devmap_ifindex(tgt, map) \ - (((map->map_type == BPF_MAP_TYPE_DEVMAP || \ - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? 
\ - ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0) - DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), + enum bpf_map_type map_type, + u32 map_id, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index), TP_STRUCT__entry( __field(int, prog_id) @@ -111,14 +107,22 @@ DECLARE_EVENT_CLASS(xdp_redirect_template, ), TP_fast_assign( + u32 ifindex = 0, map_index = index; + + if (map_type == BPF_MAP_TYPE_DEVMAP || map_type == BPF_MAP_TYPE_DEVMAP_HASH) { + ifindex = ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex; + } else if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + ifindex = index; + map_index = 0; + } + __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; - __entry->to_ifindex = map ? devmap_ifindex(tgt, map) : - index; - __entry->map_id = map ? map->id : 0; - __entry->map_index = map ? 
index : 0; + __entry->to_ifindex = ifindex; + __entry->map_id = map_id; + __entry->map_index = map_index; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" @@ -133,45 +137,49 @@ DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); -#define _trace_xdp_redirect(dev, xdp, to) \ - trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to) +#define _trace_xdp_redirect(dev, xdp, to) \ + trace_xdp_redirect(dev, xdp, NULL, 0, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) -#define _trace_xdp_redirect_err(dev, xdp, to, err) \ - trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to) +#define _trace_xdp_redirect_err(dev, xdp, to, err) \ + trace_xdp_redirect_err(dev, xdp, NULL, err, BPF_MAP_TYPE_UNSPEC, INT_MAX, to) -#define _trace_xdp_redirect_map(dev, xdp, to, map, index) \ - trace_xdp_redirect(dev, xdp, to, 0, map, index) +#define _trace_xdp_redirect_map(dev, xdp, to, map_type, map_id, index) \ + trace_xdp_redirect(dev, xdp, to, 0, map_type, map_id, index) -#define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \ - trace_xdp_redirect_err(dev, xdp, to, err, map, index) +#define _trace_xdp_redirect_map_err(dev, xdp, to, map_type, map_id, index, err) \ + trace_xdp_redirect_err(dev, xdp, to, err, map_type, map_id, index) /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, 
TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, - const struct bpf_map *map, u32 index), - TP_ARGS(dev, xdp, tgt, err, map, index) + enum bpf_map_type map_type, + u32 map_id, u32 index), + TP_ARGS(dev, xdp, tgt, err, map_type, map_id, index) ); TRACE_EVENT(xdp_cpumap_kthread, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 7352d4160b7f..0cf2791d5099 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -543,7 +543,6 @@ static void cpu_map_free(struct bpf_map *map) * complete. */ - bpf_clear_redirect_map(map); synchronize_rcu(); /* For cpu_map the remote CPUs can still be using the entries diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f7f42448259f..7a5ad7331c3b 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -197,7 +197,6 @@ static void dev_map_free(struct bpf_map *map) list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); - bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. 
*/ diff --git a/net/core/filter.c b/net/core/filter.c index 183b0aa6b027..b6732000d8a2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3918,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, struct xdp_buff *xdp) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - return dev_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_CPUMAP: - return cpu_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_redirect(fwd, xdp); - default: - return -EBADRQC; - } - return 0; -} - void xdp_do_flush(void) { __dev_flush(); @@ -3943,55 +3926,52 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -void bpf_clear_redirect_map(struct bpf_map *map) -{ - struct bpf_redirect_info *ri; - int cpu; - - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. 
- */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); - } -} - int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; + enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; int err; - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - if (unlikely(!map)) { - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_XSKMAP: + err = __xsk_map_redirect(fwd, xdp); + break; + case BPF_MAP_TYPE_UNSPEC: + if (map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + break; + } + err = dev_xdp_enqueue(fwd, xdp, dev); + break; } - - err = dev_xdp_enqueue(fwd, xdp, dev); - } else { - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + fallthrough; + default: + err = -EBADRQC; } if (unlikely(err)) goto err; - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -4000,41 +3980,36 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, - struct bpf_map *map) + void *fwd, + enum bpf_map_type 
map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - u32 index = ri->tgt_index; - void *fwd = ri->tgt_value; - int err = 0; - - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); - - if (map->map_type == BPF_MAP_TYPE_DEVMAP || - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - struct bpf_dtab_netdev *dst = fwd; + int err; - err = dev_map_generic_redirect(dst, skb, xdp_prog); + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_generic_redirect(fwd, skb, xdp_prog); if (unlikely(err)) goto err; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - struct xdp_sock *xs = fwd; - - err = xsk_generic_rcv(xs, xdp); + break; + case BPF_MAP_TYPE_XSKMAP: + err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); - } else { + break; + default: /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } @@ -4042,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; - struct net_device *fwd; - int err = 0; - - if (map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, - map); - ri->tgt_index = 0; - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; - err = xdp_ok_fwd_dev(fwd, skb->len); - if (unlikely(err)) - goto err; + 
ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - skb->dev = fwd; - _trace_xdp_redirect(dev, xdp_prog, index); - generic_xdp_tx(skb, xdp_prog); - return 0; + if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) + goto err; + + skb->dev = fwd; + _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); + generic_xdp_tx(skb, xdp_prog); + return 0; + } + + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } @@ -4077,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) if (unlikely(flags)) return XDP_ABORTED; - ri->flags = flags; + /* NB! Map type UNSPEC and map_id == INT_MAX (never generated + * by map_idr) is used for ifindex based XDP redirect. + */ ri->tgt_index = ifindex; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index fbeb4870f798..67b4ce504852 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -87,7 +87,6 @@ static void xsk_map_free(struct bpf_map *map) { struct xsk_map *m = container_of(map, struct xsk_map, map); - bpf_clear_redirect_map(map); synchronize_net(); bpf_map_area_free(m); } -- cgit v1.2.3 From 0bb3262c0248d44aea3be31076f44beb82a7b120 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 9 Mar 2021 17:51:35 -0800 Subject: net: socket: use BIT() for MSG_* The bit mask for MSG_* seems a little confused here. Replace it with BIT() to make it clear to understand. Signed-off-by: Menglong Dong Signed-off-by: David S. 
Miller --- include/linux/socket.h | 71 ++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 385894b4a8bb..e88859f38cd0 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -283,42 +283,45 @@ struct ucred { Added those for 1003.1g not all are supported yet */ -#define MSG_OOB 1 -#define MSG_PEEK 2 -#define MSG_DONTROUTE 4 -#define MSG_TRYHARD 4 /* Synonym for MSG_DONTROUTE for DECnet */ -#define MSG_CTRUNC 8 -#define MSG_PROBE 0x10 /* Do not send. Only probe path f.e. for MTU */ -#define MSG_TRUNC 0x20 -#define MSG_DONTWAIT 0x40 /* Nonblocking io */ -#define MSG_EOR 0x80 /* End of record */ -#define MSG_WAITALL 0x100 /* Wait for a full request */ -#define MSG_FIN 0x200 -#define MSG_SYN 0x400 -#define MSG_CONFIRM 0x800 /* Confirm path validity */ -#define MSG_RST 0x1000 -#define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ -#define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ -#define MSG_MORE 0x8000 /* Sender will send more */ -#define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ -#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ -#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ -#define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ -#define MSG_EOF MSG_FIN -#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ -#define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry - * plain text and require encryption - */ - -#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ -#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ -#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file - descriptor received through - SCM_RIGHTS */ +#define MSG_OOB BIT(0) +#define MSG_PEEK BIT(1) +#define MSG_DONTROUTE BIT(2) +#define 
MSG_TRYHARD BIT(2) /* Synonym for MSG_DONTROUTE for DECnet */ +#define MSG_CTRUNC BIT(3) +#define MSG_PROBE BIT(4) /* Do not send. Only probe path f.e. for MTU */ +#define MSG_TRUNC BIT(5) +#define MSG_DONTWAIT BIT(6) /* Nonblocking io */ +#define MSG_EOR BIT(7) /* End of record */ +#define MSG_WAITALL BIT(8) /* Wait for a full request */ +#define MSG_FIN BIT(9) +#define MSG_SYN BIT(10) +#define MSG_CONFIRM BIT(11) /* Confirm path validity */ +#define MSG_RST BIT(12) +#define MSG_ERRQUEUE BIT(13) /* Fetch message from error queue */ +#define MSG_NOSIGNAL BIT(14) /* Do not generate SIGPIPE */ +#define MSG_MORE BIT(15) /* Sender will send more */ +#define MSG_WAITFORONE BIT(16) /* recvmmsg(): block until 1+ packets avail */ +#define MSG_SENDPAGE_NOPOLICY BIT(16) /* sendpage() internal : do no apply policy */ +#define MSG_SENDPAGE_NOTLAST BIT(17) /* sendpage() internal : not the last page */ +#define MSG_BATCH BIT(18) /* sendmmsg(): more messages coming */ +#define MSG_EOF MSG_FIN +#define MSG_NO_SHARED_FRAGS BIT(19) /* sendpage() internal : page frags + * are not shared + */ +#define MSG_SENDPAGE_DECRYPTED BIT(20) /* sendpage() internal : page may carry + * plain text and require encryption + */ + +#define MSG_ZEROCOPY BIT(26) /* Use user data in kernel path */ +#define MSG_FASTOPEN BIT(29) /* Send data in TCP SYN */ +#define MSG_CMSG_CLOEXEC BIT(30) /* Set close_on_exec for file + * descriptor received through + * SCM_RIGHTS + */ #if defined(CONFIG_COMPAT) -#define MSG_CMSG_COMPAT 0x80000000 /* This message needs 32 bit fixups */ +#define MSG_CMSG_COMPAT BIT(31) /* This message needs 32 bit fixups */ #else -#define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ +#define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif -- cgit v1.2.3 From e56763ee50a3f28d2f70355ca43fb78d8539a183 Mon Sep 17 00:00:00 2001 From: "Maciej W. 
Rozycki" Date: Wed, 10 Mar 2021 13:02:42 +0100 Subject: FDDI: if_fddi.h: Update my e-mail address Following the recent update to MAINTAINERS update my e-mail address. Signed-off-by: Maciej W. Rozycki Signed-off-by: David S. Miller --- include/uapi/linux/if_fddi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_fddi.h b/include/uapi/linux/if_fddi.h index 7239aa9c0766..8df2d9934bcd 100644 --- a/include/uapi/linux/if_fddi.h +++ b/include/uapi/linux/if_fddi.h @@ -9,7 +9,7 @@ * Version: @(#)if_fddi.h 1.0.3 Oct 6 2018 * * Author: Lawrence V. Stefani, - * Maintainer: Maciej W. Rozycki, + * Maintainer: Maciej W. Rozycki, * * if_fddi.h is based on previous if_ether.h and if_tr.h work by * Fred N. van Kempen, -- cgit v1.2.3 From 847cbfc014adafeac401e19e349b0fd524f201c3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 10 Mar 2021 16:50:44 +0200 Subject: net: add a helper to avoid issues with HW TX timestamping and SO_TXTIME As explained in commit 29d98f54a4fe ("net: enetc: allow hardware timestamping on TX queues with tc-etf enabled"), hardware TX timestamping requires an skb with skb->tstamp = 0. When a packet is sent with SO_TXTIME, the skb->skb_mstamp_ns corrupts the value of skb->tstamp, so the drivers need to explicitly reset skb->tstamp to zero after consuming the TX time. Create a helper named skb_txtime_consumed() which does just that. All drivers which offload TC_SETUP_QDISC_ETF should implement it, and it would make it easier to assess during review whether they do the right thing in order to be compatible with hardware timestamping or not. Suggested-by: Vinicius Costa Gomes Signed-off-by: Vladimir Oltean Acked-by: Vinicius Costa Gomes Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 8 ++------ drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/igc/igc_main.c | 2 +- include/net/pkt_sched.h | 9 +++++++++ 4 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 09471329f3a3..89d2cb348271 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -5,6 +5,7 @@ #include #include #include +#include /* ENETC overhead: optional extension BD + 1 BD gap */ #define ENETC_TXBDS_NEEDED(val) ((val) + 2) @@ -344,12 +345,7 @@ static void enetc_tstamp_tx(struct sk_buff *skb, u64 tstamp) if (skb_shinfo(skb)->tx_flags & SKBTX_IN_PROGRESS) { memset(&shhwtstamps, 0, sizeof(shhwtstamps)); shhwtstamps.hwtstamp = ns_to_ktime(tstamp); - /* Ensure skb_mstamp_ns, which might have been populated with - * the txtime, is not mistaken for a software timestamp, - * because this will prevent the dispatch of our hardware - * timestamp to the socket. 
- */ - skb->tstamp = ktime_set(0, 0); + skb_txtime_consumed(skb); skb_tstamp_tx(skb, &shhwtstamps); } } diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 878b31d534ec..369533feb4f2 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -5856,7 +5856,7 @@ static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, */ if (tx_ring->launchtime_enable) { ts = ktime_to_timespec64(first->skb->tstamp); - first->skb->tstamp = ktime_set(0, 0); + skb_txtime_consumed(first->skb); context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32); } else { context_desc->seqnum_seed = 0; diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 7ac9597ddb84..059ffcfb0bda 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -941,7 +941,7 @@ static void igc_tx_ctxtdesc(struct igc_ring *tx_ring, struct igc_adapter *adapter = netdev_priv(tx_ring->netdev); ktime_t txtime = first->skb->tstamp; - first->skb->tstamp = ktime_set(0, 0); + skb_txtime_consumed(first->skb); context_desc->launch_time = igc_tx_launchtime(adapter, txtime); } else { diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 15b1b30f454e..f5c1bee0cd6a 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -188,4 +188,13 @@ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload); void taprio_offload_free(struct tc_taprio_qopt_offload *offload); +/* Ensure skb_mstamp_ns, which might have been populated with the txtime, is + * not mistaken for a software timestamp, because this will otherwise prevent + * the dispatch of hardware timestamps to the socket. 
+ */ +static inline void skb_txtime_consumed(struct sk_buff *skb) +{ + skb->tstamp = ktime_set(0, 0); +} + #endif -- cgit v1.2.3 From ee90c6ba341f7f72858196f15912c8b6b7d032e8 Mon Sep 17 00:00:00 2001 From: Julien Massonneau Date: Thu, 11 Mar 2021 16:53:18 +0100 Subject: seg6: add support for IPv4 decapsulation in ipv6_srh_rcv() As specified in IETF RFC 8754, section 4.3.1.2, if the upper layer header is IPv4 or IPv6, perform IPv6 decapsulation and resubmit the decapsulated packet to the IPv4 or IPv6 module. Only IPv6 decapsulation was implemented. This patch adds support for IPv4 decapsulation. Link: https://tools.ietf.org/html/rfc8754#section-4.3.1.2 Signed-off-by: Julien Massonneau Signed-off-by: David S. Miller --- include/net/ipv6.h | 1 + net/ipv6/exthdrs.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index bd1f396cc9c7..448bf2b34759 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -30,6 +30,7 @@ */ #define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ +#define NEXTHDR_IPV4 4 /* IPv4 in IPv6 */ #define NEXTHDR_TCP 6 /* TCP segment. */ #define NEXTHDR_UDP 17 /* UDP message. 
*/ #define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6126f8bf94b3..56e479d158b7 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -381,7 +381,7 @@ static int ipv6_srh_rcv(struct sk_buff *skb) looped_back: if (hdr->segments_left == 0) { - if (hdr->nexthdr == NEXTHDR_IPV6) { + if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), @@ -397,7 +397,8 @@ looped_back: skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; - + if (hdr->nexthdr == NEXTHDR_IPV4) + skb->protocol = htons(ETH_P_IP); __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); -- cgit v1.2.3 From 90e1a9e21326887fe0aef6c25ad36464953a961e Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 11 Mar 2021 19:03:14 +0100 Subject: nexthop: Add a dedicated flag for multipath next-hop groups With the introduction of resilient nexthop groups, there will be two types of multipath groups: the current hash-threshold "mpath" ones, and resilient groups. Both are multipath, but to determine the fact, the system needs to consider two flags. This might prove costly in the datapath. Therefore, introduce a new flag, that should be set for next-hop groups that have more than one nexthop, and should be considered multipath. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/nexthop.h | 7 ++++--- net/ipv4/nexthop.c | 5 ++++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 7bc057aee40b..5062c2c08e2b 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -80,6 +80,7 @@ struct nh_grp_entry { struct nh_group { struct nh_group *spare; /* spare group for removals */ u16 num_nh; + bool is_multipath; bool mpath; bool fdb_nh; bool has_v4; @@ -212,7 +213,7 @@ static inline bool nexthop_is_multipath(const struct nexthop *nh) struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); - return nh_grp->mpath; + return nh_grp->is_multipath; } return false; } @@ -227,7 +228,7 @@ static inline unsigned int nexthop_num_path(const struct nexthop *nh) struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); - if (nh_grp->mpath) + if (nh_grp->is_multipath) rc = nh_grp->num_nh; } @@ -308,7 +309,7 @@ struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel) struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); - if (nh_grp->mpath) { + if (nh_grp->is_multipath) { nh = nexthop_mpath_select(nh_grp, nhsel); if (!nh) return NULL; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 69c8b50a936e..56c54d0fbacc 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -967,6 +967,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, } newg->has_v4 = false; + newg->is_multipath = nhg->is_multipath; newg->mpath = nhg->mpath; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; @@ -1488,8 +1489,10 @@ static struct nexthop *nexthop_create_group(struct net *net, nhg->nh_entries[i].nh_parent = nh; } - if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) + if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->mpath = 1; + nhg->is_multipath = true; + } WARN_ON_ONCE(nhg->mpath != 1); -- cgit v1.2.3 From 710ec5622306de8c071637ee41ddf4c9bd17e75a Mon Sep 17 00:00:00 2001 From: 
Ido Schimmel Date: Thu, 11 Mar 2021 19:03:15 +0100 Subject: nexthop: Add netlink defines and enumerators for resilient NH groups - RTM_NEWNEXTHOP et al. that handle resilient groups will have a new nested attribute, NHA_RES_GROUP, whose elements are attributes NHA_RES_GROUP_*. - RTM_NEWNEXTHOPBUCKET et al. is a suite of new messages that will currently serve only for dumping of individual buckets of resilient next hop groups. For nexthop group buckets, these messages will carry a nested attribute NHA_RES_BUCKET, whose elements are attributes NHA_RES_BUCKET_*. There are several reasons why a new suite of messages is created for nexthop buckets instead of overloading the information on the existing RTM_{NEW,DEL,GET}NEXTHOP messages. First, a nexthop group can contain a large number of nexthop buckets (4k is not unheard of). This imposes limits on the amount of information that can be encoded for each nexthop bucket given that a netlink message is limited to 64k bytes. Second, while RTM_NEWNEXTHOPBUCKET is only used for notifications at this point, in the future it can be extended to provide user space with control over nexthop bucket configuration. - The new group type is NEXTHOP_GRP_TYPE_RES. Note that nexthop code is adjusted to bounce groups with that type for now. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Petr Machata Signed-off-by: David S. 
Miller --- include/uapi/linux/nexthop.h | 47 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/rtnetlink.h | 7 +++++++ net/ipv4/nexthop.c | 2 ++ security/selinux/nlmsgtab.c | 5 ++++- 4 files changed, 59 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h index 2d4a1e784cf0..d8ffa8c9ca78 100644 --- a/include/uapi/linux/nexthop.h +++ b/include/uapi/linux/nexthop.h @@ -21,7 +21,10 @@ struct nexthop_grp { }; enum { - NEXTHOP_GRP_TYPE_MPATH, /* default type if not specified */ + NEXTHOP_GRP_TYPE_MPATH, /* hash-threshold nexthop group + * default type if not specified + */ + NEXTHOP_GRP_TYPE_RES, /* resilient nexthop group */ __NEXTHOP_GRP_TYPE_MAX, }; @@ -52,8 +55,50 @@ enum { NHA_FDB, /* flag; nexthop belongs to a bridge fdb */ /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */ + /* nested; resilient nexthop group attributes */ + NHA_RES_GROUP, + /* nested; nexthop bucket attributes */ + NHA_RES_BUCKET, + __NHA_MAX, }; #define NHA_MAX (__NHA_MAX - 1) + +enum { + NHA_RES_GROUP_UNSPEC, + /* Pad attribute for 64-bit alignment. */ + NHA_RES_GROUP_PAD = NHA_RES_GROUP_UNSPEC, + + /* u16; number of nexthop buckets in a resilient nexthop group */ + NHA_RES_GROUP_BUCKETS, + /* clock_t as u32; nexthop bucket idle timer (per-group) */ + NHA_RES_GROUP_IDLE_TIMER, + /* clock_t as u32; nexthop unbalanced timer */ + NHA_RES_GROUP_UNBALANCED_TIMER, + /* clock_t as u64; nexthop unbalanced time */ + NHA_RES_GROUP_UNBALANCED_TIME, + + __NHA_RES_GROUP_MAX, +}; + +#define NHA_RES_GROUP_MAX (__NHA_RES_GROUP_MAX - 1) + +enum { + NHA_RES_BUCKET_UNSPEC, + /* Pad attribute for 64-bit alignment. 
*/ + NHA_RES_BUCKET_PAD = NHA_RES_BUCKET_UNSPEC, + + /* u16; nexthop bucket index */ + NHA_RES_BUCKET_INDEX, + /* clock_t as u64; nexthop bucket idle time */ + NHA_RES_BUCKET_IDLE_TIME, + /* u32; nexthop id assigned to the nexthop bucket */ + NHA_RES_BUCKET_NH_ID, + + __NHA_RES_BUCKET_MAX, +}; + +#define NHA_RES_BUCKET_MAX (__NHA_RES_BUCKET_MAX - 1) + #endif diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 91e4ca064d61..d35953bc7d53 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -178,6 +178,13 @@ enum { RTM_GETVLAN, #define RTM_GETVLAN RTM_GETVLAN + RTM_NEWNEXTHOPBUCKET = 116, +#define RTM_NEWNEXTHOPBUCKET RTM_NEWNEXTHOPBUCKET + RTM_DELNEXTHOPBUCKET, +#define RTM_DELNEXTHOPBUCKET RTM_DELNEXTHOPBUCKET + RTM_GETNEXTHOPBUCKET, +#define RTM_GETNEXTHOPBUCKET RTM_GETNEXTHOPBUCKET + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 56c54d0fbacc..7a94591da856 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1492,6 +1492,8 @@ static struct nexthop *nexthop_create_group(struct net *net, if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->mpath = 1; nhg->is_multipath = true; + } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { + goto out_no_nh; } WARN_ON_ONCE(nhg->mpath != 1); diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index b69231918686..d59276f48d4f 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -88,6 +88,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { RTM_NEWVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELVLAN, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETVLAN, NETLINK_ROUTE_SOCKET__NLMSG_READ }, + { RTM_NEWNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_GETNEXTHOPBUCKET, NETLINK_ROUTE_SOCKET__NLMSG_READ }, }; static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = @@ 
-171,7 +174,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm) * structures at the top of this file with the new mappings * before updating the BUILD_BUG_ON() macro! */ - BUILD_BUG_ON(RTM_MAX != (RTM_NEWVLAN + 3)); + BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOPBUCKET + 3)); err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms, sizeof(nlmsg_route_perms)); break; -- cgit v1.2.3 From 283a72a5599e80750699d2021830a294ed9ab3f3 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 11 Mar 2021 19:03:16 +0100 Subject: nexthop: Add implementation of resilient next-hop groups At this moment, there is only one type of next-hop group: an mpath group, which implements the hash-threshold algorithm. To select a next hop, hash-threshold algorithm first assigns a range of hashes to each next hop in the group, and then selects the next hop by comparing the SKB hash with the individual ranges. When a next hop is removed from the group, the ranges are recomputed, which leads to reassignment of parts of hash space from one next hop to another. While there will usually be some overlap between the previous and the new distribution, some traffic flows change the next hop that they resolve to. That causes problems e.g. as established TCP connections are reset, because the traffic is forwarded to a server that is not familiar with the connection. Resilient hashing is a technique to address the above problem. Resilient next-hop group has another layer of indirection between the group itself and its constituent next hops: a hash table. The selection algorithm uses a straightforward modulo operation to choose a hash bucket, and then reads the next hop that this bucket contains, and forwards traffic there. This indirection brings an important feature. In the hash-threshold algorithm, the range of hashes associated with a next hop must be continuous. With a hash table, mapping between the hash table buckets and the individual next hops is arbitrary. 
Therefore when a next hop is deleted the buckets that held it are simply reassigned to other next hops. When weights of next hops in a group are altered, it may be possible to choose a subset of buckets that are currently not used for forwarding traffic, and use those to satisfy the new next-hop distribution demands, keeping the "busy" buckets intact. This way, established flows are ideally kept being forwarded to the same endpoints through the same paths as before the next-hop group change. In a nutshell, the algorithm works as follows. Each next hop has a number of buckets that it wants to have, according to its weight and the number of buckets in the hash table. In case of an event that might cause bucket allocation change, the numbers for individual next hops are updated, similarly to how ranges are updated for mpath group next hops. Following that, a new "upkeep" algorithm runs, and for idle buckets that belong to a next hop that is currently occupying more buckets than it wants (it is "overweight"), it migrates the buckets to one of the next hops that has fewer buckets than it wants (it is "underweight"). If, after this, there are still underweight next hops, another upkeep run is scheduled to a future time. Chances are there are not enough "idle" buckets to satisfy the new demands. The algorithm has knobs to select both what it means for a bucket to be idle, and for whether and when to forcefully migrate buckets if there keeps being an insufficient number of idle buckets. There are three users of the resilient data structures. - The forwarding code accesses them under RCU, and does not modify them except for updating the time a selected bucket was last used. - Netlink code, running under RTNL, which may modify the data. - The delayed upkeep code, which may modify the data. This runs unlocked, and mutual exclusion between the RTNL code and the delayed upkeep is maintained by canceling the delayed work synchronously before the RTNL code touches anything. 
Later it restarts the delayed work if necessary. The RTNL code has to implement next-hop group replacement, next hop removal, etc. For removal, the mpath code uses a neat trick of having a backup next hop group structure, doing the necessary changes offline, and then RCU-swapping them in. However, the hash tables for resilient hashing are about an order of magnitude larger than the groups themselves (the size might be e.g. 4K entries), and it was felt that keeping two of them is overkill. Both the primary next-hop group and the spare therefore use the same resilient table, and writers are careful to keep all references valid for the forwarding code. The hash table references next-hop group entries from the next-hop group that is currently in the primary role (i.e. not spare). During the transition from primary to spare, the table references a mix of both the primary group and the spare. When a next hop is deleted, the corresponding buckets are not set to NULL, but instead marked as empty, so that the pointer is valid and can be used by the forwarding code. The buckets are then migrated to a new next-hop group entry during upkeep. The only times that the hash table is invalid are the very beginning and very end of its lifetime. Between those points, it is always kept valid. This patch introduces the core support code itself. It does not handle notifications towards drivers, which are kept as if the group were an mpath one. It does not handle netlink either. The only bit currently exposed to user space is the new next-hop group type, and that is currently bounced. There is therefore no way to actually access this code. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/net/nexthop.h | 42 ++++ net/ipv4/nexthop.c | 517 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 546 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 5062c2c08e2b..b78505c9031e 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -40,6 +40,12 @@ struct nh_config { struct nlattr *nh_grp; u16 nh_grp_type; + u16 nh_grp_res_num_buckets; + unsigned long nh_grp_res_idle_timer; + unsigned long nh_grp_res_unbalanced_timer; + bool nh_grp_res_has_num_buckets; + bool nh_grp_res_has_idle_timer; + bool nh_grp_res_has_unbalanced_timer; struct nlattr *nh_encap; u16 nh_encap_type; @@ -63,6 +69,32 @@ struct nh_info { }; }; +struct nh_res_bucket { + struct nh_grp_entry __rcu *nh_entry; + atomic_long_t used_time; + unsigned long migrated_time; + bool occupied; + u8 nh_flags; +}; + +struct nh_res_table { + struct net *net; + u32 nhg_id; + struct delayed_work upkeep_dw; + + /* List of NHGEs that have too few buckets ("uw" for underweight). + * Reclaimed buckets will be given to entries in this list. + */ + struct list_head uw_nh_entries; + unsigned long unbalanced_since; + + u32 idle_timer; + u32 unbalanced_timer; + + u16 num_nh_buckets; + struct nh_res_bucket nh_buckets[]; +}; + struct nh_grp_entry { struct nexthop *nh; u8 weight; @@ -71,6 +103,13 @@ struct nh_grp_entry { struct { atomic_t upper_bound; } mpath; + struct { + /* Member on uw_nh_entries. 
*/ + struct list_head uw_nh_entry; + + u16 count_buckets; + u16 wants_buckets; + } res; }; struct list_head nh_list; @@ -82,8 +121,11 @@ struct nh_group { u16 num_nh; bool is_multipath; bool mpath; + bool resilient; bool fdb_nh; bool has_v4; + + struct nh_res_table __rcu *res_table; struct nh_grp_entry nh_entries[]; }; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 7a94591da856..0e2ff72e10c0 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -183,6 +183,30 @@ static int call_nexthop_notifiers(struct net *net, return notifier_to_errno(err); } +/* There are three users of RES_TABLE, and NHs etc. referenced from there: + * + * 1) a collection of callbacks for NH maintenance. This operates under + * RTNL, + * 2) the delayed work that gradually balances the resilient table, + * 3) and nexthop_select_path(), operating under RCU. + * + * Both the delayed work and the RTNL block are writers, and need to + * maintain mutual exclusion. Since there are only two and well-known + * writers for each table, the RTNL code can make sure it has exclusive + * access thus: + * + * - Have the DW operate without locking; + * - synchronously cancel the DW; + * - do the writing; + * - if the write was not actually a delete, call upkeep, which schedules + * DW again if necessary. + * + * The functions that are always called from the RTNL context use + * rtnl_dereference(). The functions that can also be called from the DW do + * a raw dereference and rely on the above mutual exclusion scheme. 
+ */ +#define nh_res_dereference(p) (rcu_dereference_raw(p)) + static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, @@ -241,6 +265,9 @@ static void nexthop_free_group(struct nexthop *nh) WARN_ON(nhg->spare == nhg); + if (nhg->resilient) + vfree(rcu_dereference_raw(nhg->res_table)); + kfree(nhg->spare); kfree(nhg); } @@ -299,6 +326,30 @@ static struct nh_group *nexthop_grp_alloc(u16 num_nh) return nhg; } +static void nh_res_table_upkeep_dw(struct work_struct *work); + +static struct nh_res_table * +nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) +{ + const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; + struct nh_res_table *res_table; + unsigned long size; + + size = struct_size(res_table, nh_buckets, num_nh_buckets); + res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); + if (!res_table) + return NULL; + + res_table->net = net; + res_table->nhg_id = nhg_id; + INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); + INIT_LIST_HEAD(&res_table->uw_nh_entries); + res_table->idle_timer = cfg->nh_grp_res_idle_timer; + res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; + res_table->num_nh_buckets = num_nh_buckets; + return res_table; +} + static void nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) @@ -347,6 +398,13 @@ static u32 nh_find_unused_id(struct net *net) return 0; } +static void nh_res_time_set_deadline(unsigned long next_time, + unsigned long *deadline) +{ + if (time_before(next_time, *deadline)) + *deadline = next_time; +} + static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) { struct nexthop_grp *p; @@ -540,20 +598,62 @@ errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } +static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) +{ + return (unsigned long)atomic_long_read(&bucket->used_time); +} + +static unsigned long 
+nh_res_bucket_idle_point(const struct nh_res_table *res_table, + const struct nh_res_bucket *bucket, + unsigned long now) +{ + unsigned long time = nh_res_bucket_used_time(bucket); + + /* Bucket was not used since it was migrated. The idle time is now. */ + if (time == bucket->migrated_time) + return now; + + return time + res_table->idle_timer; +} + +static unsigned long +nh_res_table_unb_point(const struct nh_res_table *res_table) +{ + return res_table->unbalanced_since + res_table->unbalanced_timer; +} + +static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, + struct nh_res_bucket *bucket) +{ + unsigned long now = jiffies; + + atomic_long_set(&bucket->used_time, (long)now); + bucket->migrated_time = now; +} + +static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) +{ + atomic_long_set(&bucket->used_time, (long)jiffies); +} + static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - /* nested multipath (group within a group) is not - * supported - */ + /* Nesting groups within groups is not supported. 
*/ if (nhg->mpath) { NL_SET_ERR_MSG(extack, "Multipath group can not be a nexthop within a group"); return false; } + if (nhg->resilient) { + NL_SET_ERR_MSG(extack, + "Resilient group can not be a nexthop within a group"); + return false; + } *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); @@ -734,6 +834,22 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) return rc; } +static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) +{ + struct nh_res_table *res_table = rcu_dereference(nhg->res_table); + u16 bucket_index = hash % res_table->num_nh_buckets; + struct nh_res_bucket *bucket; + struct nh_grp_entry *nhge; + + /* nexthop_select_path() is expected to return a non-NULL value, so + * skip protocol validation and just hand out whatever there is. + */ + bucket = &res_table->nh_buckets[bucket_index]; + nh_res_bucket_set_busy(bucket); + nhge = rcu_dereference(bucket->nh_entry); + return nhge->nh; +} + struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; @@ -744,6 +860,8 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) nhg = rcu_dereference(nh->nh_grp); if (nhg->mpath) return nexthop_select_path_mp(nhg, hash); + else if (nhg->resilient) + return nexthop_select_path_res(nhg, hash); /* Unreachable. 
*/ return NULL; @@ -926,7 +1044,289 @@ static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, return 0; } -static void nh_group_rebalance(struct nh_group *nhg) +static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets == nhge->res.wants_buckets; +} + +static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets > nhge->res.wants_buckets; +} + +static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) +{ + return nhge->res.count_buckets < nhge->res.wants_buckets; +} + +static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) +{ + return list_empty(&res_table->uw_nh_entries); +} + +static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) +{ + struct nh_grp_entry *nhge; + + if (bucket->occupied) { + nhge = nh_res_dereference(bucket->nh_entry); + nhge->res.count_buckets--; + bucket->occupied = false; + } +} + +static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, + struct nh_grp_entry *nhge) +{ + nh_res_bucket_unset_nh(bucket); + + bucket->occupied = true; + rcu_assign_pointer(bucket->nh_entry, nhge); + nhge->res.count_buckets++; +} + +static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, + struct nh_res_bucket *bucket, + unsigned long *deadline, bool *force) +{ + unsigned long now = jiffies; + struct nh_grp_entry *nhge; + unsigned long idle_point; + + if (!bucket->occupied) { + /* The bucket is not occupied, its NHGE pointer is either + * NULL or obsolete. We _have to_ migrate: set force. + */ + *force = true; + return true; + } + + nhge = nh_res_dereference(bucket->nh_entry); + + /* If the bucket is populated by an underweight or balanced + * nexthop, do not migrate. + */ + if (!nh_res_nhge_is_ow(nhge)) + return false; + + /* At this point we know that the bucket is populated with an + * overweight nexthop. It needs to be migrated to a new nexthop if + * the idle timer of unbalanced timer expired. 
+ */ + + idle_point = nh_res_bucket_idle_point(res_table, bucket, now); + if (time_after_eq(now, idle_point)) { + /* The bucket is idle. We _can_ migrate: unset force. */ + *force = false; + return true; + } + + /* Unbalanced timer of 0 means "never force". */ + if (res_table->unbalanced_timer) { + unsigned long unb_point; + + unb_point = nh_res_table_unb_point(res_table); + if (time_after(now, unb_point)) { + /* The bucket is not idle, but the unbalanced timer + * expired. We _can_ migrate, but set force anyway, + * so that drivers know to ignore activity reports + * from the HW. + */ + *force = true; + return true; + } + + nh_res_time_set_deadline(unb_point, deadline); + } + + nh_res_time_set_deadline(idle_point, deadline); + return false; +} + +static bool nh_res_bucket_migrate(struct nh_res_table *res_table, + u16 bucket_index, bool force) +{ + struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; + struct nh_grp_entry *new_nhge; + + new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, + struct nh_grp_entry, + res.uw_nh_entry); + if (WARN_ON_ONCE(!new_nhge)) + /* If this function is called, "bucket" is either not + * occupied, or it belongs to a next hop that is + * overweight. In either case, there ought to be a + * corresponding underweight next hop. + */ + return false; + + nh_res_bucket_set_nh(bucket, new_nhge); + nh_res_bucket_set_idle(res_table, bucket); + + if (nh_res_nhge_is_balanced(new_nhge)) + list_del(&new_nhge->res.uw_nh_entry); + return true; +} + +#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) + +static void nh_res_table_upkeep(struct nh_res_table *res_table) +{ + unsigned long now = jiffies; + unsigned long deadline; + u16 i; + + /* Deadline is the next time that upkeep should be run. It is the + * earliest time at which one of the buckets might be migrated. + * Start at the most pessimistic estimate: either unbalanced_timer + * from now, or if there is none, idle_timer from now. 
For each + * encountered time point, call nh_res_time_set_deadline() to + * refine the estimate. + */ + if (res_table->unbalanced_timer) + deadline = now + res_table->unbalanced_timer; + else + deadline = now + res_table->idle_timer; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + bool force; + + if (nh_res_bucket_should_migrate(res_table, bucket, + &deadline, &force)) { + if (!nh_res_bucket_migrate(res_table, i, force)) { + unsigned long idle_point; + + /* A driver can override the migration + * decision if the HW reports that the + * bucket is actually not idle. Therefore + * remark the bucket as busy again and + * update the deadline. + */ + nh_res_bucket_set_busy(bucket); + idle_point = nh_res_bucket_idle_point(res_table, + bucket, + now); + nh_res_time_set_deadline(idle_point, &deadline); + } + } + } + + /* If the group is still unbalanced, schedule the next upkeep to + * either the deadline computed above, or the minimum deadline, + * whichever comes later. 
+ */ + if (!nh_res_table_is_balanced(res_table)) { + unsigned long now = jiffies; + unsigned long min_deadline; + + min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; + if (time_before(deadline, min_deadline)) + deadline = min_deadline; + + queue_delayed_work(system_power_efficient_wq, + &res_table->upkeep_dw, deadline - now); + } +} + +static void nh_res_table_upkeep_dw(struct work_struct *work) +{ + struct delayed_work *dw = to_delayed_work(work); + struct nh_res_table *res_table; + + res_table = container_of(dw, struct nh_res_table, upkeep_dw); + nh_res_table_upkeep(res_table); +} + +static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) +{ + cancel_delayed_work_sync(&res_table->upkeep_dw); +} + +static void nh_res_group_rebalance(struct nh_group *nhg, + struct nh_res_table *res_table) +{ + int prev_upper_bound = 0; + int total = 0; + int w = 0; + int i; + + INIT_LIST_HEAD(&res_table->uw_nh_entries); + + for (i = 0; i < nhg->num_nh; ++i) + total += nhg->nh_entries[i].weight; + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; + int upper_bound; + + w += nhge->weight; + upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, + total); + nhge->res.wants_buckets = upper_bound - prev_upper_bound; + prev_upper_bound = upper_bound; + + if (nh_res_nhge_is_uw(nhge)) { + if (list_empty(&res_table->uw_nh_entries)) + res_table->unbalanced_since = jiffies; + list_add(&nhge->res.uw_nh_entry, + &res_table->uw_nh_entries); + } + } +} + +/* Migrate buckets in res_table so that they reference NHGE's from NHG with + * the right NH ID. Set those buckets that do not have a corresponding NHGE + * entry in NHG as not occupied. 
+ */ +static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, + struct nh_group *nhg) +{ + u16 i; + + for (i = 0; i < res_table->num_nh_buckets; i++) { + struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; + u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; + bool found = false; + int j; + + for (j = 0; j < nhg->num_nh; j++) { + struct nh_grp_entry *nhge = &nhg->nh_entries[j]; + + if (nhge->nh->id == id) { + nh_res_bucket_set_nh(bucket, nhge); + found = true; + break; + } + } + + if (!found) + nh_res_bucket_unset_nh(bucket); + } +} + +static void replace_nexthop_grp_res(struct nh_group *oldg, + struct nh_group *newg) +{ + /* For NH group replacement, the new NHG might only have a stub + * hash table with 0 buckets, because the number of buckets was not + * specified. For NH removal, oldg and newg both reference the same + * res_table. So in any case, in the following, we want to work + * with oldg->res_table. + */ + struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); + unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; + bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); + + nh_res_table_cancel_upkeep(old_res_table); + nh_res_table_migrate_buckets(old_res_table, newg); + nh_res_group_rebalance(newg, old_res_table); + if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) + old_res_table->unbalanced_since = prev_unbalanced_since; + nh_res_table_upkeep(old_res_table); +} + +static void nh_mp_group_rebalance(struct nh_group *nhg) { int total = 0; int w = 0; @@ -969,6 +1369,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; newg->mpath = nhg->mpath; + newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; @@ -996,7 +1397,11 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, j++; } - nh_group_rebalance(newg); + if (newg->mpath) + 
nh_mp_group_rebalance(newg); + else if (newg->resilient) + replace_nexthop_grp_res(nhg, newg); + rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); @@ -1025,6 +1430,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); + struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { @@ -1035,6 +1441,11 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) list_del_init(&nhge->nh_list); } + + if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + nh_res_table_cancel_upkeep(res_table); + } } /* not called for nexthop replace */ @@ -1113,6 +1524,9 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { + struct nh_res_table *tmp_table = NULL; + struct nh_res_table *new_res_table; + struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; @@ -1121,19 +1535,57 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, return -EINVAL; } - err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); - if (err) - return err; - oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); + if (newg->mpath != oldg->mpath) { + NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); + return -EINVAL; + } + + if (newg->mpath) { + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, + extack); + if (err) + return err; + } else if (newg->resilient) { + new_res_table = rtnl_dereference(newg->res_table); + old_res_table = rtnl_dereference(oldg->res_table); + + /* Accept if num_nh_buckets was not given, but if it was + * given, demand that the value be correct. 
+ */ + if (cfg->nh_grp_res_has_num_buckets && + cfg->nh_grp_res_num_buckets != + old_res_table->num_nh_buckets) { + NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); + return -EINVAL; + } + + if (cfg->nh_grp_res_has_idle_timer) + old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; + if (cfg->nh_grp_res_has_unbalanced_timer) + old_res_table->unbalanced_timer = + cfg->nh_grp_res_unbalanced_timer; + + replace_nexthop_grp_res(oldg, newg); + + tmp_table = new_res_table; + rcu_assign_pointer(newg->res_table, old_res_table); + rcu_assign_pointer(newg->spare->res_table, old_res_table); + } + /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); + if (newg->resilient) { + rcu_assign_pointer(oldg->res_table, tmp_table); + rcu_assign_pointer(oldg->spare->res_table, tmp_table); + } + for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; @@ -1383,6 +1835,27 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, goto out; } + if (new_nh->is_group) { + struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); + struct nh_res_table *res_table; + + if (nhg->resilient) { + res_table = rtnl_dereference(nhg->res_table); + + /* Not passing the number of buckets is OK when + * replacing, but not when creating a new group. 
+ */ + if (!cfg->nh_grp_res_has_num_buckets) { + NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); + rc = -EINVAL; + goto out; + } + + nh_res_group_rebalance(nhg, res_table); + nh_res_table_upkeep(res_table); + } + } + rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); @@ -1445,6 +1918,7 @@ static struct nexthop *nexthop_create_group(struct net *net, u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; + int err; int i; if (WARN_ON(!num_nh)) @@ -1476,8 +1950,10 @@ static struct nexthop *nexthop_create_group(struct net *net, struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); - if (!nexthop_get(nhe)) + if (!nexthop_get(nhe)) { + err = -ENOENT; goto out_no_nh; + } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) @@ -1493,13 +1969,28 @@ static struct nexthop *nexthop_create_group(struct net *net, nhg->mpath = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { + struct nh_res_table *res_table; + + /* Bounce resilient groups for now. 
*/ + err = -EINVAL; goto out_no_nh; + + res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); + if (!res_table) { + err = -ENOMEM; + goto out_no_nh; + } + + rcu_assign_pointer(nhg->spare->res_table, res_table); + rcu_assign_pointer(nhg->res_table, res_table); + nhg->resilient = true; + nhg->is_multipath = true; } - WARN_ON_ONCE(nhg->mpath != 1); + WARN_ON_ONCE(nhg->mpath + nhg->resilient != 1); if (nhg->mpath) - nh_group_rebalance(nhg); + nh_mp_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; @@ -1518,7 +2009,7 @@ out_no_nh: kfree(nhg); kfree(nh); - return ERR_PTR(-ENOENT); + return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, -- cgit v1.2.3 From b8f090d0beb185007e5305f7c8aaf3f38fba3dda Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 11 Mar 2021 19:03:17 +0100 Subject: nexthop: Add data structures for resilient group notifications Add data structures that will be used for in-kernel notifications about addition / deletion of a resilient nexthop group and about changes to a hash bucket within a resilient group. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Petr Machata Signed-off-by: David S. 
Miller --- include/net/nexthop.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index b78505c9031e..fd3c0debe8bf 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -155,11 +155,15 @@ struct nexthop { enum nexthop_event_type { NEXTHOP_EVENT_DEL, NEXTHOP_EVENT_REPLACE, + NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, + NEXTHOP_EVENT_BUCKET_REPLACE, }; enum nh_notifier_info_type { NH_NOTIFIER_INFO_TYPE_SINGLE, NH_NOTIFIER_INFO_TYPE_GRP, + NH_NOTIFIER_INFO_TYPE_RES_TABLE, + NH_NOTIFIER_INFO_TYPE_RES_BUCKET, }; struct nh_notifier_single_info { @@ -186,6 +190,19 @@ struct nh_notifier_grp_info { struct nh_notifier_grp_entry_info nh_entries[]; }; +struct nh_notifier_res_bucket_info { + u16 bucket_index; + unsigned int idle_timer_ms; + bool force; + struct nh_notifier_single_info old_nh; + struct nh_notifier_single_info new_nh; +}; + +struct nh_notifier_res_table_info { + u16 num_nh_buckets; + struct nh_notifier_single_info nhs[]; +}; + struct nh_notifier_info { struct net *net; struct netlink_ext_ack *extack; @@ -194,6 +211,8 @@ struct nh_notifier_info { union { struct nh_notifier_single_info *nh; struct nh_notifier_grp_info *nh_grp; + struct nh_notifier_res_table_info *nh_res_table; + struct nh_notifier_res_bucket_info *nh_res_bucket; }; }; -- cgit v1.2.3 From 56ad5ba344dea9c914331da8754f5ba7cede9941 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 11 Mar 2021 19:03:19 +0100 Subject: nexthop: Allow setting "offload" and "trap" indication of nexthop buckets Add a function that can be called by device drivers to set "offload" or "trap" indication on nexthop buckets following nexthop notifications and other changes such as a neighbour becoming invalid. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Petr Machata Signed-off-by: David S. 
Miller --- include/net/nexthop.h | 2 ++ net/ipv4/nexthop.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index fd3c0debe8bf..685f208d26b5 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -220,6 +220,8 @@ int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap); +void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, + bool offload, bool trap); /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 8b06aafc2e9e..1fce4ff39390 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3072,6 +3072,40 @@ out: } EXPORT_SYMBOL(nexthop_set_hw_flags); +void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, + bool offload, bool trap) +{ + struct nh_res_table *res_table; + struct nh_res_bucket *bucket; + struct nexthop *nexthop; + struct nh_group *nhg; + + rcu_read_lock(); + + nexthop = nexthop_find_by_id(net, id); + if (!nexthop || !nexthop->is_group) + goto out; + + nhg = rcu_dereference(nexthop->nh_grp); + if (!nhg->resilient) + goto out; + + if (bucket_index >= nhg->res_table->num_nh_buckets) + goto out; + + res_table = rcu_dereference(nhg->res_table); + bucket = &res_table->nh_buckets[bucket_index]; + bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); + if (offload) + bucket->nh_flags |= RTNH_F_OFFLOAD; + if (trap) + bucket->nh_flags |= RTNH_F_TRAP; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); -- cgit v1.2.3 From cfc15c1dbb0b7574498eef453b8ddb534e408551 Mon Sep 17 
00:00:00 2001 From: Ido Schimmel Date: Thu, 11 Mar 2021 19:03:20 +0100 Subject: nexthop: Allow reporting activity of nexthop buckets The kernel periodically checks the idle time of nexthop buckets to determine if they are idle and can be re-populated with a new nexthop. When the resilient nexthop group is offloaded to hardware, the kernel will not see activity on nexthop buckets unless it is reported from hardware. Add a function that can be periodically called by device drivers to report activity on nexthop buckets after querying it from the underlying device. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- include/net/nexthop.h | 2 ++ net/ipv4/nexthop.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 685f208d26b5..ba94868a21d5 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -222,6 +222,8 @@ int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap); +void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, + unsigned long *activity); /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 1fce4ff39390..495b5e69ffcd 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3106,6 +3106,41 @@ out: } EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); +void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, + unsigned long *activity) +{ + struct nh_res_table *res_table; + struct nexthop *nexthop; + struct nh_group *nhg; + u16 i; + + rcu_read_lock(); + + nexthop = 
nexthop_find_by_id(net, id); + if (!nexthop || !nexthop->is_group) + goto out; + + nhg = rcu_dereference(nexthop->nh_grp); + if (!nhg->resilient) + goto out; + + /* Instead of silently ignoring some buckets, demand that the sizes + * be the same. + */ + res_table = rcu_dereference(nhg->res_table); + if (num_buckets != res_table->num_nh_buckets) + goto out; + + for (i = 0; i < num_buckets; i++) { + if (test_bit(i, activity)) + nh_res_bucket_set_busy(&res_table->nh_buckets[i]); + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(nexthop_res_grp_activity_update); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); -- cgit v1.2.3 From f4dae54e486d528d4dd98df116e7a522bbf12667 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Mar 2021 12:35:04 -0800 Subject: tcp: plug skb_still_in_host_queue() to TSQ Jakub and Neil reported an increase of RTO timers whenever TX completions are delayed a bit more (by increasing NIC TX coalescing parameters) Main issue is that TCP stack has a logic preventing a packet being retransmit if the prior clone has not yet been orphaned or freed. This logic came with commit 1f3279ae0c13 ("tcp: avoid retransmits of TCP packets hanging in host queues") Thankfully, in the case skb_still_in_host_queue() detects the initial clone is still in flight, it can use TSQ logic that will eventually retry later, at the moment the clone is freed or orphaned. Signed-off-by: Eric Dumazet Reported-by: Neil Spring Reported-by: Jakub Kicinski Cc: Neal Cardwell Cc: Yuchung Cheng Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 +- net/ipv4/tcp_output.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0503c917d773..483e89348f78 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1140,7 +1140,7 @@ static inline bool skb_fclone_busy(const struct sock *sk, return skb->fclone == SKB_FCLONE_ORIG && refcount_read(&fclones->fclone_ref) > 1 && - fclones->skb2.sk == sk; + READ_ONCE(fclones->skb2.sk) == sk; } /** diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fbf140a770d8..0dbf208a4f2f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2775,13 +2775,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! */ -static bool skb_still_in_host_queue(const struct sock *sk, +static bool skb_still_in_host_queue(struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { - NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); - return true; + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); + smp_mb__after_atomic(); + if (skb_fclone_busy(sk, skb)) { + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + return true; + } } return false; } -- cgit v1.2.3 From 59079438a664559bb1f6f5fe85e306962ef9286e Mon Sep 17 00:00:00 2001 From: Mikhael Goikhman Date: Wed, 10 Mar 2021 23:09:09 -0800 Subject: net/mlx5: Remove unused mlx5_core_health member recover_work The code related to health->recover_work was removed in commit 63cbc552eebf ("net/mlx5: Handle SW reset of FW in error flow") Fix struct mlx5_core_health accordingly. 
Signed-off-by: Mikhael Goikhman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 53b89631a1d9..8fe51b4a781e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -438,7 +438,6 @@ struct mlx5_core_health { unsigned long flags; struct work_struct fatal_report_work; struct work_struct report_work; - struct delayed_work recover_work; struct devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_fatal_reporter; }; -- cgit v1.2.3 From 59c904c8fffd903c1dae5fc6a402b88fa6dfc874 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 10 Mar 2021 23:09:11 -0800 Subject: net/mlx5: E-Switch, Add eswitch pointer to each representor Store the managing E-Switch of each representor. This will be used when a representor is created on eswitch manager 0 but the vport belongs to eswitch manager 1. 
Signed-off-by: Mark Bloch Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 1 + include/linux/mlx5/eswitch.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index fd5f8b830584..f6c0e7e05ad5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3153,6 +3153,7 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw, esw->offloads.rep_ops[rep_type] = ops; mlx5_esw_for_all_reps(esw, i, rep) { if (likely(mlx5_eswitch_vport_has_rep(esw, i))) { + rep->esw = esw; rep_data = &rep->rep_data[rep_type]; atomic_set(&rep_data->state, REP_REGISTERED); } diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 994c2c8cb4fd..72d480df2a03 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -48,6 +48,7 @@ struct mlx5_eswitch_rep { /* Only IB rep is using vport_index */ u16 vport_index; u32 vlan_refcount; + struct mlx5_eswitch *esw; }; void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw, -- cgit v1.2.3 From 3a46f4fb55ffd46e475e3fc53b1252f722cf647e Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 10 Mar 2021 23:09:13 -0800 Subject: net/mlx5: E-Switch, Refactor send to vport to be more generic Now that each representor stores a pointer to the managing E-Switch use that information when creating the send-to-vport rules. 
Signed-off-by: Mark Bloch Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/ib_rep.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 3 +-- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 20 ++++++++++++-------- include/linux/mlx5/eswitch.h | 4 ++-- 4 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 4eae7131b0ce..db5de720bb12 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -123,8 +123,7 @@ struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev, rep = dev->port[port - 1].rep; - return mlx5_eswitch_add_send_to_vport_rule(esw, rep->vport, - sq->base.mqp.qpn); + return mlx5_eswitch_add_send_to_vport_rule(esw, rep, sq->base.mqp.qpn); } static int mlx5r_rep_probe(struct auxiliary_device *adev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index a132fff7a980..3d6c2bce67d2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -411,8 +411,7 @@ static int mlx5e_sqs2vport_start(struct mlx5_eswitch *esw, } /* Add re-inject rule to the PF/representor sqs */ - flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw, - rep->vport, + flow_rule = mlx5_eswitch_add_send_to_vport_rule(esw, rep, sqns_array[i]); if (IS_ERR(flow_rule)) { err = PTR_ERR(flow_rule); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index f6c0e7e05ad5..6090b2609089 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1036,7 +1036,8 @@ out: } struct mlx5_flow_handle * -mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, u16 vport, +mlx5_eswitch_add_send_to_vport_rule(struct 
mlx5_eswitch *on_esw, + struct mlx5_eswitch_rep *rep, u32 sqn) { struct mlx5_flow_act flow_act = {0}; @@ -1054,27 +1055,30 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, u16 vport, misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); MLX5_SET(fte_match_set_misc, misc, source_sqn, sqn); /* source vport is the esw manager */ - MLX5_SET(fte_match_set_misc, misc, source_port, esw->manager_vport); - if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) + MLX5_SET(fte_match_set_misc, misc, source_port, rep->esw->manager_vport); + if (MLX5_CAP_ESW(on_esw->dev, merged_eswitch)) MLX5_SET(fte_match_set_misc, misc, source_eswitch_owner_vhca_id, - MLX5_CAP_GEN(esw->dev, vhca_id)); + MLX5_CAP_GEN(rep->esw->dev, vhca_id)); misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_sqn); MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port); - if (MLX5_CAP_ESW(esw->dev, merged_eswitch)) + if (MLX5_CAP_ESW(on_esw->dev, merged_eswitch)) MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_eswitch_owner_vhca_id); spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS; dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; - dest.vport.num = vport; + dest.vport.num = rep->vport; + dest.vport.vhca_id = MLX5_CAP_GEN(rep->esw->dev, vhca_id); + dest.vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, + flow_rule = mlx5_add_flow_rules(on_esw->fdb_table.offloads.slow_fdb, spec, &flow_act, &dest, 1); if (IS_ERR(flow_rule)) - esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule)); + esw_warn(on_esw->dev, "FDB: Failed to add send to vport rule err %ld\n", + PTR_ERR(flow_rule)); out: kvfree(spec); return flow_rule; diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 72d480df2a03..2ec0527991c8 100644 --- a/include/linux/mlx5/eswitch.h 
+++ b/include/linux/mlx5/eswitch.h @@ -62,8 +62,8 @@ struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw, u16 vport_num); void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type); struct mlx5_flow_handle * -mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, - u16 vport_num, u32 sqn); +mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, + struct mlx5_eswitch_rep *rep, u32 sqn); u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); -- cgit v1.2.3 From c3e666f1ada9cbfbe5465f122f9a2d63ddfd25ed Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 10 Mar 2021 23:09:14 -0800 Subject: net/mlx5: Add IFC bits needed for single FDB mode Currently we operate in a mode where each eswitch manager has a separate FDB. In order to combine these multiple FDBs we expose new caps to allow this: - Set root flow table which isn't native. - Set FDB a different selection mode when in LAG mode. Signed-off-by: Mark Bloch Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index df5d91c8b2d4..3ee7a86f39e4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -806,9 +806,11 @@ struct mlx5_ifc_e_switch_cap_bits { u8 vport_svlan_insert[0x1]; u8 vport_cvlan_insert_if_not_exist[0x1]; u8 vport_cvlan_insert_overwrite[0x1]; - u8 reserved_at_5[0x3]; + u8 reserved_at_5[0x2]; + u8 esw_shared_ingress_acl[0x1]; u8 esw_uplink_ingress_acl[0x1]; - u8 reserved_at_9[0x10]; + u8 root_ft_on_other_esw[0x1]; + u8 reserved_at_a[0xf]; u8 esw_functions_changed[0x1]; u8 reserved_at_1a[0x1]; u8 ecpf_vport_exists[0x1]; @@ -1502,7 +1504,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_270[0x6]; u8 lag_dct[0x2]; u8 lag_tx_port_affinity[0x1]; - u8 reserved_at_279[0x2]; + u8 
lag_native_fdb_selection[0x1]; + u8 reserved_at_27a[0x1]; u8 lag_master[0x1]; u8 num_lag_ports[0x4]; @@ -10036,14 +10039,19 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x7]; + u8 table_of_other_vport[0x1]; + u8 table_vport_number[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; u8 reserved_at_c0[0x8]; u8 underlay_qpn[0x18]; - u8 reserved_at_e0[0x120]; + u8 table_eswitch_owner_vhca_id_valid[0x1]; + u8 reserved_at_e1[0xf]; + u8 table_eswitch_owner_vhca_id[0x10]; + u8 reserved_at_100[0x100]; }; enum { @@ -10273,7 +10281,8 @@ struct mlx5_ifc_dcbx_param_bits { }; struct mlx5_ifc_lagc_bits { - u8 reserved_at_0[0x1d]; + u8 fdb_selection_mode[0x1]; + u8 reserved_at_1[0x1c]; u8 lag_state[0x3]; u8 reserved_at_20[0x14]; -- cgit v1.2.3 From 26bf30902c10473ba38f220d3401a61c56d8db3b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 10 Mar 2021 23:09:15 -0800 Subject: net/mlx5: Use order-0 allocations for EQs Currently we are allocating high-order page for EQs. In case of fragmented system, VF hot remove/add in VMs for example, there isn't enough contiguous memory for EQs allocation, which results in crashing of the VM. Therefore, use order-0 fragments for the EQ allocations instead. Performance tests: ConnectX-5 100Gbps, CPU: Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz Performance tests show no sensible degradation. 
Signed-off-by: Tariq Toukan Signed-off-by: Shay Drory Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/health.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 27 +++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h | 15 ++++++++---- drivers/net/ethernet/mellanox/mlx5/core/wq.c | 5 ---- include/linux/mlx5/driver.h | 5 ++++ 5 files changed, 32 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c index 84e501e057b4..6f4e6c34b2a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.c @@ -128,7 +128,7 @@ int mlx5e_health_eq_diag_fmsg(struct mlx5_eq_comp *eq, struct devlink_fmsg *fmsg if (err) return err; - err = devlink_fmsg_u32_pair_put(fmsg, "size", eq->core.nent); + err = devlink_fmsg_u32_pair_put(fmsg, "size", eq_get_size(&eq->core)); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 174dfbc996c6..4e8381030d77 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -271,7 +271,7 @@ static void init_eq_buf(struct mlx5_eq *eq) struct mlx5_eqe *eqe; int i; - for (i = 0; i < eq->nent; i++) { + for (i = 0; i < eq_get_size(eq); i++) { eqe = get_eqe(eq, i); eqe->owner = MLX5_EQE_OWNER_INIT_VAL; } @@ -281,8 +281,10 @@ static int create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, struct mlx5_eq_param *param) { + u8 log_eq_size = order_base_2(param->nent + MLX5_NUM_SPARE_EQE); struct mlx5_cq_table *cq_table = &eq->cq_table; u32 out[MLX5_ST_SZ_DW(create_eq_out)] = {0}; + u8 log_eq_stride = ilog2(MLX5_EQE_SIZE); struct mlx5_priv *priv = &dev->priv; u8 vecidx = param->irq_index; __be64 *pas; @@ -297,16 +299,18 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, 
spin_lock_init(&cq_table->lock); INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); - eq->nent = roundup_pow_of_two(param->nent + MLX5_NUM_SPARE_EQE); eq->cons_index = 0; - err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, &eq->buf); + + err = mlx5_frag_buf_alloc_node(dev, wq_get_byte_sz(log_eq_size, log_eq_stride), + &eq->frag_buf, dev->priv.numa_node); if (err) return err; + mlx5_init_fbc(eq->frag_buf.frags, log_eq_stride, log_eq_size, &eq->fbc); init_eq_buf(eq); inlen = MLX5_ST_SZ_BYTES(create_eq_in) + - MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->buf.npages; + MLX5_FLD_SZ_BYTES(create_eq_in, pas[0]) * eq->frag_buf.npages; in = kvzalloc(inlen, GFP_KERNEL); if (!in) { @@ -315,7 +319,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, } pas = (__be64 *)MLX5_ADDR_OF(create_eq_in, in, pas); - mlx5_fill_page_array(&eq->buf, pas); + mlx5_fill_page_frag_array(&eq->frag_buf, pas); MLX5_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ); if (!param->mask[0] && MLX5_CAP_GEN(dev, log_max_uctx)) @@ -326,11 +330,11 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, param->mask[i]); eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry); - MLX5_SET(eqc, eqc, log_eq_size, ilog2(eq->nent)); + MLX5_SET(eqc, eqc, log_eq_size, eq->fbc.log_sz); MLX5_SET(eqc, eqc, uar_page, priv->uar->index); MLX5_SET(eqc, eqc, intr, vecidx); MLX5_SET(eqc, eqc, log_page_size, - eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + eq->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); err = mlx5_cmd_exec(dev, in, inlen, out, sizeof(out)); if (err) @@ -356,7 +360,7 @@ err_in: kvfree(in); err_buf: - mlx5_buf_free(dev, &eq->buf); + mlx5_frag_buf_free(dev, &eq->frag_buf); return err; } @@ -413,7 +417,7 @@ static int destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) eq->eqn); synchronize_irq(eq->irqn); - mlx5_buf_free(dev, &eq->buf); + mlx5_frag_buf_free(dev, &eq->frag_buf); return err; } @@ -764,10 +768,11 @@ EXPORT_SYMBOL(mlx5_eq_destroy_generic); struct mlx5_eqe 
*mlx5_eq_get_eqe(struct mlx5_eq *eq, u32 cc) { u32 ci = eq->cons_index + cc; + u32 nent = eq_get_size(eq); struct mlx5_eqe *eqe; - eqe = get_eqe(eq, ci & (eq->nent - 1)); - eqe = ((eqe->owner & 1) ^ !!(ci & eq->nent)) ? NULL : eqe; + eqe = get_eqe(eq, ci & (nent - 1)); + eqe = ((eqe->owner & 1) ^ !!(ci & nent)) ? NULL : eqe; /* Make sure we read EQ entry contents after we've * checked the ownership bit. */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h index 81f2cc4ca1da..f607a3858ef5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h @@ -22,15 +22,15 @@ struct mlx5_cq_table { }; struct mlx5_eq { + struct mlx5_frag_buf_ctrl fbc; + struct mlx5_frag_buf frag_buf; struct mlx5_core_dev *dev; struct mlx5_cq_table cq_table; __be32 __iomem *doorbell; u32 cons_index; - struct mlx5_frag_buf buf; unsigned int vecidx; unsigned int irqn; u8 eqn; - int nent; struct mlx5_rsc_debug *dbg; }; @@ -47,16 +47,21 @@ struct mlx5_eq_comp { struct list_head list; }; +static inline u32 eq_get_size(struct mlx5_eq *eq) +{ + return eq->fbc.sz_m1 + 1; +} + static inline struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry) { - return mlx5_buf_offset(&eq->buf, entry * MLX5_EQE_SIZE); + return mlx5_frag_buf_get_wqe(&eq->fbc, entry); } static inline struct mlx5_eqe *next_eqe_sw(struct mlx5_eq *eq) { - struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & (eq->nent - 1)); + struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & eq->fbc.sz_m1); - return ((eqe->owner & 1) ^ !!(eq->cons_index & eq->nent)) ? NULL : eqe; + return (eqe->owner ^ (eq->cons_index >> eq->fbc.log_sz)) & 1 ? 
NULL : eqe; } static inline void eq_update_ci(struct mlx5_eq *eq, int arm) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.c b/drivers/net/ethernet/mellanox/mlx5/core/wq.c index 01f075fac276..3091dd014650 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.c @@ -34,11 +34,6 @@ #include "wq.h" #include "mlx5_core.h" -static u32 wq_get_byte_sz(u8 log_sz, u8 log_stride) -{ - return ((u32)1 << log_sz) << log_stride; -} - int mlx5_wq_cyc_create(struct mlx5_core_dev *mdev, struct mlx5_wq_param *param, void *wqc, struct mlx5_wq_cyc *wq, struct mlx5_wq_ctrl *wq_ctrl) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8fe51b4a781e..5c0422930b01 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -873,6 +873,11 @@ static inline u32 mlx5_base_mkey(const u32 key) return key & 0xffffff00u; } +static inline u32 wq_get_byte_sz(u8 log_sz, u8 log_stride) +{ + return ((u32)1 << log_sz) << log_stride; +} + static inline void mlx5_init_fbc_offset(struct mlx5_buf_list *frags, u8 log_stride, u8 log_sz, u16 strides_offset, -- cgit v1.2.3 From a3222a2da0a2d6c7682252d4bfdff05721a82b95 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Sun, 24 Jan 2021 15:56:36 +0200 Subject: net/mlx5e: Allow to match on ICMP parameters Support matching on ICMPv4/6 type and code parameters using misc3 section of match parameters. 
Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 47 +++++++++++++++++++++++++ include/linux/mlx5/device.h | 2 ++ 2 files changed, 49 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 121f0a744e55..54ea0dae7ded 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -1961,6 +1961,10 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, misc_parameters); void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); + void *misc_c_3 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_3); + void *misc_v_3 = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_3); struct flow_rule *rule = flow_cls_offload_flow_rule(f); struct flow_dissector *dissector = rule->match.dissector; u16 addr_type = 0; @@ -1990,6 +1994,7 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, BIT(FLOW_DISSECTOR_KEY_CT) | BIT(FLOW_DISSECTOR_KEY_ENC_IP) | BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) | + BIT(FLOW_DISSECTOR_KEY_ICMP) | BIT(FLOW_DISSECTOR_KEY_MPLS))) { NL_SET_ERR_MSG_MOD(extack, "Unsupported key"); netdev_dbg(priv->netdev, "Unsupported key used: 0x%x\n", @@ -2309,7 +2314,49 @@ static int __parse_cls_flower(struct mlx5e_priv *priv, if (match.mask->flags) *match_level = MLX5_MATCH_L4; } + if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ICMP)) { + struct flow_match_icmp match; + flow_rule_match_icmp(rule, &match); + switch (ip_proto) { + case IPPROTO_ICMP: + if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_ICMP)) + return -EOPNOTSUPP; + MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_type, + match.mask->type); + MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_type, + match.key->type); + MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_code, + match.mask->code); + 
MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_code, + match.key->code); + break; + case IPPROTO_ICMPV6: + if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_ICMPV6)) + return -EOPNOTSUPP; + MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_type, + match.mask->type); + MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_type, + match.key->type); + MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_code, + match.mask->code); + MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_code, + match.key->code); + break; + default: + NL_SET_ERR_MSG_MOD(extack, + "Code and type matching only with ICMP and ICMPv6"); + netdev_err(priv->netdev, + "Code and type matching only with ICMP and ICMPv6\n"); + return -EINVAL; + } + if (match.mask->code || match.mask->type) { + *match_level = MLX5_MATCH_L4; + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3; + } + } return 0; } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index dc3d2508f5c6..92a029a800a0 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1142,6 +1142,8 @@ enum mlx5_flex_parser_protos { MLX5_FLEX_PROTO_GENEVE = 1 << 3, MLX5_FLEX_PROTO_CW_MPLS_GRE = 1 << 4, MLX5_FLEX_PROTO_CW_MPLS_UDP = 1 << 5, + MLX5_FLEX_PROTO_ICMP = 1 << 8, + MLX5_FLEX_PROTO_ICMPV6 = 1 << 9, }; /* MLX5 DEV CAPs */ -- cgit v1.2.3 From f90fc37f289cd0886ef3a12b2ea33b93b8d9d360 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 11:09:08 +0000 Subject: ptp_pch: Move 'pch_*()' prototypes to shared header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following W=1 kernel build warning(s): drivers/ptp/ptp_pch.c:193:6: warning: no previous prototype for ‘pch_ch_control_write’ [-Wmissing-prototypes] drivers/ptp/ptp_pch.c:201:5: warning: no previous prototype for ‘pch_ch_event_read’ [-Wmissing-prototypes] drivers/ptp/ptp_pch.c:212:6: warning: no previous prototype for ‘pch_ch_event_write’ [-Wmissing-prototypes] 
drivers/ptp/ptp_pch.c:220:5: warning: no previous prototype for ‘pch_src_uuid_lo_read’ [-Wmissing-prototypes] drivers/ptp/ptp_pch.c:231:5: warning: no previous prototype for ‘pch_src_uuid_hi_read’ [-Wmissing-prototypes] drivers/ptp/ptp_pch.c:242:5: warning: no previous prototype for ‘pch_rx_snap_read’ [-Wmissing-prototypes] drivers/ptp/ptp_pch.c:259:5: warning: no previous prototype for ‘pch_tx_snap_read’ [-Wmissing-prototypes] drivers/ptp/ptp_pch.c:300:5: warning: no previous prototype for ‘pch_set_station_address’ [-Wmissing-prototypes] Cc: Richard Cochran (maintainer:PTP HARDWARE CLOCK SUPPORT) Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Flavio Suligoi Cc: netdev@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: David S. Miller --- drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h | 8 -------- .../net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c | 1 + drivers/ptp/ptp_pch.c | 1 + include/linux/ptp_pch.h | 22 ++++++++++++++++++++++ 4 files changed, 24 insertions(+), 8 deletions(-) create mode 100644 include/linux/ptp_pch.h (limited to 'include') diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h index 3ce4899a0417..a6823c4d355d 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe.h @@ -612,14 +612,6 @@ void pch_gbe_free_tx_resources(struct pch_gbe_adapter *adapter, void pch_gbe_free_rx_resources(struct pch_gbe_adapter *adapter, struct pch_gbe_rx_ring *rx_ring); void pch_gbe_update_stats(struct pch_gbe_adapter *adapter); -void pch_ch_control_write(struct pci_dev *pdev, u32 val); -u32 pch_ch_event_read(struct pci_dev *pdev); -void pch_ch_event_write(struct pci_dev *pdev, u32 val); -u32 pch_src_uuid_lo_read(struct pci_dev *pdev); -u32 pch_src_uuid_hi_read(struct pci_dev *pdev); -u64 pch_rx_snap_read(struct pci_dev *pdev); -u64 pch_tx_snap_read(struct pci_dev *pdev); -int pch_set_station_address(u8 *addr, struct pci_dev *pdev); /* pch_gbe_param.c */ 
void pch_gbe_check_options(struct pch_gbe_adapter *adapter); diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c index 140cee7c459d..334af49e5add 100644 --- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c +++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #define DRV_VERSION "1.01" diff --git a/drivers/ptp/ptp_pch.c b/drivers/ptp/ptp_pch.c index f7ff7230623e..fa4417ad02e0 100644 --- a/drivers/ptp/ptp_pch.c +++ b/drivers/ptp/ptp_pch.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #define STATION_ADDR_LEN 20 diff --git a/include/linux/ptp_pch.h b/include/linux/ptp_pch.h new file mode 100644 index 000000000000..51818198c292 --- /dev/null +++ b/include/linux/ptp_pch.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * PTP PCH + * + * Copyright 2019 Linaro Ltd. + * + * Author Lee Jones + */ + +#ifndef _PTP_PCH_H_ +#define _PTP_PCH_H_ + +void pch_ch_control_write(struct pci_dev *pdev, u32 val); +u32 pch_ch_event_read(struct pci_dev *pdev); +void pch_ch_event_write(struct pci_dev *pdev, u32 val); +u32 pch_src_uuid_lo_read(struct pci_dev *pdev); +u32 pch_src_uuid_hi_read(struct pci_dev *pdev); +u64 pch_rx_snap_read(struct pci_dev *pdev); +u64 pch_tx_snap_read(struct pci_dev *pdev); +int pch_set_station_address(u8 *addr, struct pci_dev *pdev); + +#endif /* _PTP_PCH_H_ */ -- cgit v1.2.3 From 6445e17af7c58b8a9be8ebf400b04c65202f6497 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 12 Mar 2021 17:16:11 -0800 Subject: mptcp: add rm_list in mptcp_out_options This patch defined a new struct mptcp_rm_list, the ids field was an array of the removing address ids, the nr field was the valid number of removing address ids in the array. The array size was defined as a new macro MPTCP_RM_IDS_MAX. Changed the member rm_id of struct mptcp_out_options to rm_list. 
In mptcp_established_options_rm_addr, invoked mptcp_pm_rm_addr_signal to get the rm_list. According to the number of addresses in it, calculated the padded RM_ADDR suboption length. And saved the ids array in struct mptcp_out_options's rm_list member. In mptcp_write_options, iterated each address id from struct mptcp_out_options's rm_list member, set the invalid ones as TCPOPT_NOP, then filled them into the RM_ADDR suboption. Changed TCPOLEN_MPTCP_RM_ADDR_BASE from 4 to 3. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 9 ++++++++- net/mptcp/options.c | 35 +++++++++++++++++++++++++++-------- net/mptcp/pm.c | 5 +++-- net/mptcp/protocol.h | 12 ++++++++++-- 4 files changed, 48 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 5694370be3d4..cea69c801595 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -34,6 +34,13 @@ struct mptcp_ext { /* one byte hole */ }; +#define MPTCP_RM_IDS_MAX 8 + +struct mptcp_rm_list { + u8 ids[MPTCP_RM_IDS_MAX]; + u8 nr; +}; + struct mptcp_out_options { #if IS_ENABLED(CONFIG_MPTCP) u16 suboptions; @@ -48,7 +55,7 @@ struct mptcp_out_options { u8 addr_id; u16 port; u64 ahmac; - u8 rm_id; + struct mptcp_rm_list rm_list; u8 join_id; u8 backup; u32 nonce; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 444a38681e93..e74d0513187f 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -674,20 +674,25 @@ static bool mptcp_established_options_rm_addr(struct sock *sk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); - u8 rm_id; + struct mptcp_rm_list rm_list; + int i, len; if (!mptcp_pm_should_rm_signal(msk) || - !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id))) + !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list))) return false; - if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE) + len = mptcp_rm_addr_len(&rm_list); + if (len < 0) 
+ return false; + if (remaining < len) return false; - *size = TCPOLEN_MPTCP_RM_ADDR_BASE; + *size = len; opts->suboptions |= OPTION_MPTCP_RM_ADDR; - opts->rm_id = rm_id; + opts->rm_list = rm_list; - pr_debug("rm_id=%d", opts->rm_id); + for (i = 0; i < opts->rm_list.nr; i++) + pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]); return true; } @@ -1217,9 +1222,23 @@ mp_capable_done: } if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { + u8 i = 1; + *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, - TCPOLEN_MPTCP_RM_ADDR_BASE, - 0, opts->rm_id); + TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr, + 0, opts->rm_list.ids[0]); + + while (i < opts->rm_list.nr) { + u8 id1, id2, id3, id4; + + id1 = opts->rm_list.ids[i]; + id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP; + id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP; + id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP; + put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr); + ptr += 1; + i += 4; + } } if (OPTION_MPTCP_PRIO & opts->suboptions) { diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 6fd4b2c1b076..0654c86cd5ff 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -258,7 +258,7 @@ out_unlock: } bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - u8 *rm_id) + struct mptcp_rm_list *rm_list) { int ret = false; @@ -271,7 +271,8 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE) goto out_unlock; - *rm_id = msk->pm.rm_id; + rm_list->ids[0] = msk->pm.rm_id; + rm_list->nr = 1; WRITE_ONCE(msk->pm.addr_signal, 0); ret = true; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e21a5bc36cf0..c896bcf3e70f 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -61,7 +61,7 @@ #define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 #define TCPOLEN_MPTCP_PORT_LEN 2 #define TCPOLEN_MPTCP_PORT_ALIGN 2 -#define TCPOLEN_MPTCP_RM_ADDR_BASE 4 +#define 
TCPOLEN_MPTCP_RM_ADDR_BASE 3 #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 @@ -709,10 +709,18 @@ static inline unsigned int mptcp_add_addr_len(int family, bool echo, bool port) return len; } +static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) +{ + if (rm_list->nr == 0 || rm_list->nr > MPTCP_RM_IDS_MAX) + return -EINVAL; + + return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; +} + bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_addr_info *saddr, bool *echo, bool *port); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - u8 *rm_id); + struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); void __init mptcp_pm_nl_init(void); -- cgit v1.2.3 From 25660156f4cc4cf0cb55deda69f999dab554b750 Mon Sep 17 00:00:00 2001 From: Xingfeng Hu Date: Fri, 12 Mar 2021 15:08:29 +0100 Subject: flow_offload: add support for packet-per-second policing Allow flow_offload API to configure packet-per-second policing using rate and burst parameters. Dummy implementations of tcf_police_rate_pkt_ps() and tcf_police_burst_pkt() are supplied which return 0, the unconfigured state. This is to facilitate splitting the offload, driver, and TC code portion of this feature into separate patches with the aim of providing a logical flow for review. And the implementation of these helpers will be filled out by a follow-up patch. Signed-off-by: Xingfeng Hu Signed-off-by: Simon Horman Signed-off-by: Louis Peens Signed-off-by: David S. 
Miller --- include/net/flow_offload.h | 2 ++ include/net/tc_act/tc_police.h | 12 ++++++++++++ net/sched/cls_api.c | 3 +++ 3 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index e6bd8ebf9ac3..fde025c57b4f 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -234,6 +234,8 @@ struct flow_action_entry { u32 index; u32 burst; u64 rate_bytes_ps; + u64 burst_pkt; + u64 rate_pkt_ps; u32 mtu; } police; struct { /* FLOW_ACTION_CT */ diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index 6d1e26b709b5..ae117f7937d5 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -97,6 +97,18 @@ static inline u32 tcf_police_burst(const struct tc_action *act) return burst; } +static inline u64 tcf_police_rate_pkt_ps(const struct tc_action *act) +{ + /* Not implemented */ + return 0; +} + +static inline u32 tcf_police_burst_pkt(const struct tc_action *act) +{ + /* Not implemented */ + return 0; +} + static inline u32 tcf_police_tcfp_mtu(const struct tc_action *act) { struct tcf_police *police = to_police(act); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e37556cc37ab..ca8e177bf31b 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3661,6 +3661,9 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->police.burst = tcf_police_burst(act); entry->police.rate_bytes_ps = tcf_police_rate_bytes_ps(act); + entry->police.burst_pkt = tcf_police_burst_pkt(act); + entry->police.rate_pkt_ps = + tcf_police_rate_pkt_ps(act); entry->police.mtu = tcf_police_tcfp_mtu(act); entry->police.index = act->tcfa_index; } else if (is_tcf_ct(act)) { -- cgit v1.2.3 From 2ffe0395288aa237ff7e0143366bd1cd57bfc5b7 Mon Sep 17 00:00:00 2001 From: Baowen Zheng Date: Fri, 12 Mar 2021 15:08:31 +0100 Subject: net/sched: act_police: add support for packet-per-second policing Allow a policer action to enforce a rate-limit based on 
packets-per-second, configurable using a packet-per-second rate and burst parameters. e.g. tc filter add dev tap1 parent ffff: u32 match \ u32 0 0 police pkts_rate 3000 pkts_burst 1000 Testing was unable to uncover a performance impact of this change on existing features. Signed-off-by: Baowen Zheng Signed-off-by: Simon Horman Signed-off-by: Louis Peens Signed-off-by: David S. Miller --- include/net/sch_generic.h | 14 ++++++++ include/net/tc_act/tc_police.h | 48 ++++++++++++++++++++++++--- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/act_police.c | 59 +++++++++++++++++++++++++++++---- net/sched/sch_generic.c | 75 ++++++++++++++++++++++++++++-------------- 5 files changed, 162 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2d6eb60c58c8..f7a6e14491fb 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1242,6 +1242,20 @@ static inline void psched_ratecfg_getrate(struct tc_ratespec *res, res->linklayer = (r->linklayer & TC_LINKLAYER_MASK); } +struct psched_pktrate { + u64 rate_pkts_ps; /* packets per second */ + u32 mult; + u8 shift; +}; + +static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r, + unsigned int pkt_num) +{ + return ((u64)pkt_num * r->mult) >> r->shift; +} + +void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); + /* Mini Qdisc serves for specific needs of ingress/clsact Qdisc. 
* The fast path only needs to access filter list and to update stats */ diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index ae117f7937d5..72649512dcdd 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -10,10 +10,13 @@ struct tcf_police_params { s64 tcfp_burst; u32 tcfp_mtu; s64 tcfp_mtu_ptoks; + s64 tcfp_pkt_burst; struct psched_ratecfg rate; bool rate_present; struct psched_ratecfg peak; bool peak_present; + struct psched_pktrate ppsrate; + bool pps_present; struct rcu_head rcu; }; @@ -24,6 +27,7 @@ struct tcf_police { spinlock_t tcfp_lock ____cacheline_aligned_in_smp; s64 tcfp_toks; s64 tcfp_ptoks; + s64 tcfp_pkttoks; s64 tcfp_t_c; }; @@ -99,14 +103,50 @@ static inline u32 tcf_police_burst(const struct tc_action *act) static inline u64 tcf_police_rate_pkt_ps(const struct tc_action *act) { - /* Not implemented */ - return 0; + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->ppsrate.rate_pkts_ps; } static inline u32 tcf_police_burst_pkt(const struct tc_action *act) { - /* Not implemented */ - return 0; + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + u32 burst; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + + /* + * "rate" pkts "burst" nanoseconds + * ------------ * ------------------- + * 1 second 2^6 ticks + * + * ------------------------------------ + * NSEC_PER_SEC nanoseconds + * ------------------------ + * 2^6 ticks + * + * "rate" pkts "burst" nanoseconds 2^6 ticks + * = ------------ * ------------------- * ------------------------ + * 1 second 2^6 ticks NSEC_PER_SEC nanoseconds + * + * "rate" * "burst" + * = ---------------- pkts/nanosecond + * NSEC_PER_SEC^2 + * + * + * "rate" * "burst" + * = ---------------- pkts/second + * NSEC_PER_SEC + */ + burst = 
div_u64(params->tcfp_pkt_burst * params->ppsrate.rate_pkts_ps, + NSEC_PER_SEC); + + return burst; } static inline u32 tcf_police_tcfp_mtu(const struct tc_action *act) diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7ea59cfe1fa7..025c40fef93d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -190,6 +190,8 @@ enum { TCA_POLICE_PAD, TCA_POLICE_RATE64, TCA_POLICE_PEAKRATE64, + TCA_POLICE_PKTRATE64, + TCA_POLICE_PKTBURST64, __TCA_POLICE_MAX #define TCA_POLICE_RESULT TCA_POLICE_RESULT }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 8d8452b1cdd4..0fab8de176d2 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -42,6 +42,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RESULT] = { .type = NLA_U32 }, [TCA_POLICE_RATE64] = { .type = NLA_U64 }, [TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 }, + [TCA_POLICE_PKTRATE64] = { .type = NLA_U64, .min = 1 }, + [TCA_POLICE_PKTBURST64] = { .type = NLA_U64, .min = 1 }, }; static int tcf_police_init(struct net *net, struct nlattr *nla, @@ -61,6 +63,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, bool exists = false; u32 index; u64 rate64, prate64; + u64 pps, ppsburst; if (nla == NULL) return -EINVAL; @@ -142,6 +145,21 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } } + if ((tb[TCA_POLICE_PKTRATE64] && !tb[TCA_POLICE_PKTBURST64]) || + (!tb[TCA_POLICE_PKTRATE64] && tb[TCA_POLICE_PKTBURST64])) { + NL_SET_ERR_MSG(extack, + "Both or neither packet-per-second burst and rate must be provided"); + err = -EINVAL; + goto failure; + } + + if (tb[TCA_POLICE_PKTRATE64] && R_tab) { + NL_SET_ERR_MSG(extack, + "packet-per-second and byte-per-second rate limits not allowed in same action"); + err = -EINVAL; + goto failure; + } + new = kzalloc(sizeof(*new), GFP_KERNEL); if (unlikely(!new)) { err = -ENOMEM; @@ -183,6 +201,14 @@ static int tcf_police_init(struct net *net, struct nlattr 
*nla, if (tb[TCA_POLICE_AVRATE]) new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]); + if (tb[TCA_POLICE_PKTRATE64]) { + pps = nla_get_u64(tb[TCA_POLICE_PKTRATE64]); + ppsburst = nla_get_u64(tb[TCA_POLICE_PKTBURST64]); + new->pps_present = true; + new->tcfp_pkt_burst = PSCHED_TICKS2NS(ppsburst); + psched_ppscfg_precompute(&new->ppsrate, pps); + } + spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); @@ -217,8 +243,8 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_police *police = to_police(a); + s64 now, toks, ppstoks = 0, ptoks = 0; struct tcf_police_params *p; - s64 now, toks, ptoks = 0; int ret; tcf_lastuse_update(&police->tcf_tm); @@ -236,7 +262,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, } if (qdisc_pkt_len(skb) <= p->tcfp_mtu) { - if (!p->rate_present) { + if (!p->rate_present && !p->pps_present) { ret = p->tcfp_result; goto end; } @@ -251,14 +277,23 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, ptoks -= (s64)psched_l2t_ns(&p->peak, qdisc_pkt_len(skb)); } - toks += police->tcfp_toks; - if (toks > p->tcfp_burst) - toks = p->tcfp_burst; - toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); - if ((toks|ptoks) >= 0) { + if (p->rate_present) { + toks += police->tcfp_toks; + if (toks > p->tcfp_burst) + toks = p->tcfp_burst; + toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb)); + } else if (p->pps_present) { + ppstoks = min_t(s64, now - police->tcfp_t_c, p->tcfp_pkt_burst); + ppstoks += police->tcfp_pkttoks; + if (ppstoks > p->tcfp_pkt_burst) + ppstoks = p->tcfp_pkt_burst; + ppstoks -= (s64)psched_pkt2t_ns(&p->ppsrate, 1); + } + if ((toks | ptoks | ppstoks) >= 0) { police->tcfp_t_c = now; police->tcfp_toks = toks; police->tcfp_ptoks = ptoks; + police->tcfp_pkttoks = ppstoks; spin_unlock_bh(&police->tcfp_lock); ret = p->tcfp_result; goto inc_drops; @@ -331,6 +366,16 @@ 
static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, TCA_POLICE_PAD)) goto nla_put_failure; } + if (p->pps_present) { + if (nla_put_u64_64bit(skb, TCA_POLICE_PKTRATE64, + police->params->ppsrate.rate_pkts_ps, + TCA_POLICE_PAD)) + goto nla_put_failure; + if (nla_put_u64_64bit(skb, TCA_POLICE_PKTBURST64, + PSCHED_NS2TICKS(p->tcfp_pkt_burst), + TCA_POLICE_PAD)) + goto nla_put_failure; + } if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt)) goto nla_put_failure; if (p->tcfp_result && diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 49eae93d1489..44991ea726fc 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1325,6 +1325,48 @@ void dev_shutdown(struct net_device *dev) WARN_ON(timer_pending(&dev->watchdog_timer)); } +/** + * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division + * @rate: Rate to compute reciprocal division values of + * @mult: Multiplier for reciprocal division + * @shift: Shift for reciprocal division + * + * The multiplier and shift for reciprocal division by rate are stored + * in mult and shift. + * + * The deal here is to replace a divide by a reciprocal one + * in fast path (a reciprocal divide is a multiply and a shift) + * + * Normal formula would be : + * time_in_ns = (NSEC_PER_SEC * len) / rate_bps + * + * We compute mult/shift to use instead : + * time_in_ns = (len * mult) >> shift; + * + * We try to get the highest possible mult value for accuracy, + * but have to make sure no overflows will ever happen. + * + * reciprocal_value() is not used here it doesn't handle 64-bit values. 
+ */ +static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift) +{ + u64 factor = NSEC_PER_SEC; + + *mult = 1; + *shift = 0; + + if (rate <= 0) + return; + + for (;;) { + *mult = div64_u64(factor, rate); + if (*mult & (1U << 31) || factor & (1ULL << 63)) + break; + factor <<= 1; + (*shift)++; + } +} + void psched_ratecfg_precompute(struct psched_ratecfg *r, const struct tc_ratespec *conf, u64 rate64) @@ -1333,34 +1375,17 @@ void psched_ratecfg_precompute(struct psched_ratecfg *r, r->overhead = conf->overhead; r->rate_bytes_ps = max_t(u64, conf->rate, rate64); r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK); - r->mult = 1; - /* - * The deal here is to replace a divide by a reciprocal one - * in fast path (a reciprocal divide is a multiply and a shift) - * - * Normal formula would be : - * time_in_ns = (NSEC_PER_SEC * len) / rate_bps - * - * We compute mult/shift to use instead : - * time_in_ns = (len * mult) >> shift; - * - * We try to get the highest possible mult value for accuracy, - * but have to make sure no overflows will ever happen. - */ - if (r->rate_bytes_ps > 0) { - u64 factor = NSEC_PER_SEC; - - for (;;) { - r->mult = div64_u64(factor, r->rate_bytes_ps); - if (r->mult & (1U << 31) || factor & (1ULL << 63)) - break; - factor <<= 1; - r->shift++; - } - } + psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift); } EXPORT_SYMBOL(psched_ratecfg_precompute); +void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) +{ + r->rate_pkts_ps = pktrate64; + psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift); +} +EXPORT_SYMBOL(psched_ppscfg_precompute); + static void mini_qdisc_rcu_func(struct rcu_head *head) { } -- cgit v1.2.3 From dac06b32c705dc8824479b03eee826b4f6615ab2 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 14 Mar 2021 11:11:00 +0000 Subject: flow_dissector: constify bpf_flow_dissector's data pointers BPF Flow dissection programs are read-only and don't touch input buffers. 
Mark 'data' and 'data_end' in struct bpf_flow_dissector as const in preparation for global input constifying. Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index cc10b10dc3a1..bf00e71816ed 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -368,8 +368,8 @@ static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissec struct bpf_flow_dissector { struct bpf_flow_keys *flow_keys; const struct sk_buff *skb; - void *data; - void *data_end; + const void *data; + const void *data_end; }; static inline void -- cgit v1.2.3 From e3305138da47f0ae2241e5daa18af276e1e54457 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 14 Mar 2021 11:11:14 +0000 Subject: skbuff: make __skb_header_pointer()'s data argument const The function never modifies the input buffer, so 'data' argument can be marked as const. This implies one harmless cast-away. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 483e89348f78..d6ea3dc3eddb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3678,11 +3678,11 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); static inline void * __must_check -__skb_header_pointer(const struct sk_buff *skb, int offset, - int len, void *data, int hlen, void *buffer) +__skb_header_pointer(const struct sk_buff *skb, int offset, int len, + const void *data, int hlen, void *buffer) { if (hlen - offset >= len) - return data + offset; + return (void *)data + offset; if (!skb || skb_copy_bits(skb, offset, buffer, len) < 0) -- cgit v1.2.3 From f96533cded173b3b019001a505a746c3cd8fc323 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 14 Mar 2021 11:11:23 +0000 Subject: flow_dissector: constify raw input data argument Flow Dissector code never modifies the input buffer, neither skb nor raw data. Make 'data' argument const for all of the Flow dissector's functions. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 15 +++++++-------- include/net/flow_dissector.h | 2 +- net/core/flow_dissector.c | 41 ++++++++++++++++++++++------------------- 3 files changed, 30 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6ea3dc3eddb..46c61e127e9f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1292,10 +1292,10 @@ __skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) void __skb_get_hash(struct sk_buff *skb); u32 __skb_get_hash_symmetric(const struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, +u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen); __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen_proto); + const void *data, int hlen_proto); static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) @@ -1314,9 +1314,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, __be16 proto, int nhoff, int hlen, - unsigned int flags); + void *target_container, const void *data, + __be16 proto, int nhoff, int hlen, unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -1338,9 +1337,9 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, static inline bool skb_flow_dissect_flow_keys_basic(const struct net *net, const struct sk_buff *skb, - struct flow_keys_basic *flow, void *data, - __be16 proto, int nhoff, int hlen, - unsigned int flags) + struct flow_keys_basic *flow, + const void *data, __be16 proto, + int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); 
return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index bf00e71816ed..ffd386ea0dbb 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -350,7 +350,7 @@ static inline bool flow_keys_have_l4(const struct flow_keys *keys) u32 flow_hash_from_keys(struct flow_keys *keys); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, - void *data, int thoff, int hlen); + const void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2ef2224b3bff..2ed380d096ce 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -114,7 +114,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, * is the protocol port offset returned from proto_ports_offset */ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen) + const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); @@ -161,7 +161,7 @@ static bool icmp_has_id(u8 type) */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, - void *data, int thoff, int hlen) + const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; @@ -187,8 +187,8 @@ EXPORT_SYMBOL(skb_flow_get_icmp_tci); */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, int thoff, int hlen) + void *target_container, const void *data, + int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; @@ -409,8 +409,8 @@ EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen, - 
int lse_index, bool *entropy_label) + void *target_container, const void *data, int nhoff, + int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; @@ -467,7 +467,8 @@ __skb_flow_dissect_mpls(const struct sk_buff *skb, static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen) + void *target_container, const void *data, + int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { @@ -523,7 +524,7 @@ static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, - void *target_container, void *data, + void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { @@ -663,8 +664,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, - void *data, __be16 *p_proto, int *p_nhoff, int hlen, - unsigned int flags) + const void *data, __be16 *p_proto, int *p_nhoff, + int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; @@ -695,7 +696,8 @@ __skb_flow_dissect_batadv(const struct sk_buff *skb, static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int thoff, int hlen) + void *target_container, const void *data, + int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; @@ -719,8 +721,8 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb, static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, - u8 ip_proto, int hlen) + void *target_container, const void *data, + int nhoff, u8 ip_proto, int 
hlen) { enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; struct flow_dissector_key_ports *key_ports; @@ -744,7 +746,8 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, const struct iphdr *iph) + void *target_container, const void *data, + const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; @@ -761,7 +764,8 @@ __skb_flow_dissect_ipv4(const struct sk_buff *skb, static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, const struct ipv6hdr *iph) + void *target_container, const void *data, + const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; @@ -908,9 +912,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, __be16 proto, int nhoff, int hlen, - unsigned int flags) + void *target_container, const void *data, + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -1642,7 +1645,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, } EXPORT_SYMBOL(skb_get_hash_perturb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, +u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; -- cgit v1.2.3 From 805a25f3a1bdf4aafd0af412ce1e47d0cb6c7628 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 14 Mar 2021 11:11:32 +0000 Subject: linux/etherdevice.h: misc trailing whitespace cleanup Caught by the text editor. Fix it separately from the actual changes. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- include/linux/etherdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2e5debc0373c..bcb2f81baafb 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -11,7 +11,7 @@ * Authors: Ross Biro * Fred N. van Kempen, * - * Relocated to include/linux where it belongs by Alan Cox + * Relocated to include/linux where it belongs by Alan Cox * */ #ifndef _LINUX_ETHERDEVICE_H -- cgit v1.2.3 From 59753ce8b196de60211a989c75ece8aeb0d9d57c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 14 Mar 2021 11:11:41 +0000 Subject: ethernet: constify eth_get_headlen()'s data argument It's used only for flow dissection, which now takes constant data pointers. Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 2 +- net/ethernet/eth.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index bcb2f81baafb..330345b1be54 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -29,7 +29,7 @@ struct device; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); -u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len); +u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 4106373180c6..e01cf766d2c5 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -122,7 +122,7 @@ EXPORT_SYMBOL(eth_header); * Make a best effort attempt to pull the length for all of the headers for * a given frame in a linear buffer. 
*/ -u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len) +u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len) { const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG; const struct ethhdr *eth = (const struct ethhdr *)data; -- cgit v1.2.3 From d206121faf8bb2239cd970af0bd32f5203780427 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 14 Mar 2021 11:11:50 +0000 Subject: skbuff: micro-optimize {,__}skb_header_pointer() {,__}skb_header_pointer() helpers exist mainly for preventing accesses-beyond-end of the linear data. In the vast majority of cases, they bail out on the first condition. All code going after is mostly a fallback. Mark the most common branch as 'likely' one to move it in-line. Also, skb_copy_bits() can return negative values only when the input arguments are invalid, e.g. offset is greater than skb->len. It can be safely marked as 'unlikely' branch, assuming that hotpath code provides sane input to not fail here. These two bump the throughput with a single Flow Dissector pass on every packet (e.g. with RPS or driver that uses eth_get_headlen()) on 20 Mbps per flow/core. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 46c61e127e9f..ecc029674ae4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3680,11 +3680,10 @@ static inline void * __must_check __skb_header_pointer(const struct sk_buff *skb, int offset, int len, const void *data, int hlen, void *buffer) { - if (hlen - offset >= len) + if (likely(hlen - offset >= len)) return (void *)data + offset; - if (!skb || - skb_copy_bits(skb, offset, buffer, len) < 0) + if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0)) return NULL; return buffer; -- cgit v1.2.3 From a03e99d39f1943ec88f6fd3b0b9f34c20663d401 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 14 Mar 2021 14:19:30 +0200 Subject: psample: Encapsulate packet metadata in a struct Currently, callers of psample_sample_packet() pass three metadata attributes: Ingress port, egress port and truncated size. Subsequent patches are going to add more attributes (e.g., egress queue occupancy), which also need an indication whether they are valid or not. Encapsulate packet metadata in a struct in order to keep the number of arguments reasonable. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 8 ++++---- include/net/psample.h | 14 +++++++++----- net/psample/psample.c | 6 ++++-- net/sched/act_sample.c | 16 ++++++---------- 4 files changed, 23 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 93b15b8c007e..3b15f8d728a3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2217,7 +2217,7 @@ void mlxsw_sp_sample_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, { struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port]; struct mlxsw_sp_port_sample *sample; - u32 size; + struct psample_metadata md = {}; if (unlikely(!mlxsw_sp_port)) { dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received for non-existent port\n", @@ -2229,9 +2229,9 @@ void mlxsw_sp_sample_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb, sample = rcu_dereference(mlxsw_sp_port->sample); if (!sample) goto out_unlock; - size = sample->truncate ? sample->trunc_size : skb->len; - psample_sample_packet(sample->psample_group, skb, size, - mlxsw_sp_port->dev->ifindex, 0, sample->rate); + md.trunc_size = sample->truncate ? 
sample->trunc_size : skb->len; + md.in_ifindex = mlxsw_sp_port->dev->ifindex; + psample_sample_packet(sample->psample_group, skb, sample->rate, &md); out_unlock: rcu_read_unlock(); out: diff --git a/include/net/psample.h b/include/net/psample.h index 68ae16bb0a4a..ac6dbfb3870d 100644 --- a/include/net/psample.h +++ b/include/net/psample.h @@ -14,6 +14,12 @@ struct psample_group { struct rcu_head rcu; }; +struct psample_metadata { + u32 trunc_size; + int in_ifindex; + int out_ifindex; +}; + struct psample_group *psample_group_get(struct net *net, u32 group_num); void psample_group_take(struct psample_group *group); void psample_group_put(struct psample_group *group); @@ -21,15 +27,13 @@ void psample_group_put(struct psample_group *group); #if IS_ENABLED(CONFIG_PSAMPLE) void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, - u32 trunc_size, int in_ifindex, int out_ifindex, - u32 sample_rate); + u32 sample_rate, const struct psample_metadata *md); #else static inline void psample_sample_packet(struct psample_group *group, - struct sk_buff *skb, u32 trunc_size, - int in_ifindex, int out_ifindex, - u32 sample_rate) + struct sk_buff *skb, u32 sample_rate, + const struct psample_metadata *md) { } diff --git a/net/psample/psample.c b/net/psample/psample.c index 482c07f2766b..065bc887d239 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -356,9 +356,11 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) #endif void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, - u32 trunc_size, int in_ifindex, int out_ifindex, - u32 sample_rate) + u32 sample_rate, const struct psample_metadata *md) { + int out_ifindex = md->out_ifindex; + int in_ifindex = md->in_ifindex; + u32 trunc_size = md->trunc_size; #ifdef CONFIG_INET struct ip_tunnel_info *tun_info; #endif diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index db8ee9e5c8c2..6a0c16e4351d 100644 --- a/net/sched/act_sample.c +++ 
b/net/sched/act_sample.c @@ -158,10 +158,8 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; + struct psample_metadata md = {}; int retval; - int size; - int iif; - int oif; tcf_lastuse_update(&s->tcf_tm); bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); @@ -172,20 +170,18 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, /* randomly sample packets according to rate */ if (psample_group && (prandom_u32() % s->rate == 0)) { if (!skb_at_tc_ingress(skb)) { - iif = skb->skb_iif; - oif = skb->dev->ifindex; + md.in_ifindex = skb->skb_iif; + md.out_ifindex = skb->dev->ifindex; } else { - iif = skb->dev->ifindex; - oif = 0; + md.in_ifindex = skb->dev->ifindex; } /* on ingress, the mac header gets popped, so push it back */ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_push(skb, skb->mac_len); - size = s->truncate ? s->trunc_size : skb->len; - psample_sample_packet(psample_group, skb, size, iif, oif, - s->rate); + md.trunc_size = s->truncate ? s->trunc_size : skb->len; + psample_sample_packet(psample_group, skb, s->rate, &md); if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_pull(skb, skb->mac_len); -- cgit v1.2.3 From 07e1a5809b595df6e125504dff6245cb2c8ed3de Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 14 Mar 2021 14:19:31 +0200 Subject: psample: Add additional metadata attributes Extend psample to report the following attributes when available: * Output traffic class as a 16-bit value * Output traffic class occupancy in bytes as a 64-bit value * End-to-end latency of the packet in nanoseconds resolution * Software timestamp in nanoseconds resolution (always available) * Packet's protocol. Needed for packet dissection in user space (always available) Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/psample.h | 7 +++++++ include/uapi/linux/psample.h | 7 +++++++ net/psample/psample.c | 39 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/psample.h b/include/net/psample.h index ac6dbfb3870d..e328c5127757 100644 --- a/include/net/psample.h +++ b/include/net/psample.h @@ -18,6 +18,13 @@ struct psample_metadata { u32 trunc_size; int in_ifindex; int out_ifindex; + u16 out_tc; + u64 out_tc_occ; /* bytes */ + u64 latency; /* nanoseconds */ + u8 out_tc_valid:1, + out_tc_occ_valid:1, + latency_valid:1, + unused:5; }; struct psample_group *psample_group_get(struct net *net, u32 group_num); diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h index aea26ab1431c..c6329f71b939 100644 --- a/include/uapi/linux/psample.h +++ b/include/uapi/linux/psample.h @@ -16,6 +16,13 @@ enum { /* commands attributes */ PSAMPLE_ATTR_GROUP_REFCOUNT, + PSAMPLE_ATTR_PAD, + PSAMPLE_ATTR_OUT_TC, /* u16 */ + PSAMPLE_ATTR_OUT_TC_OCC, /* u64, bytes */ + PSAMPLE_ATTR_LATENCY, /* u64, nanoseconds */ + PSAMPLE_ATTR_TIMESTAMP, /* u64, nanoseconds */ + PSAMPLE_ATTR_PROTO, /* u16 */ + __PSAMPLE_ATTR_MAX }; diff --git a/net/psample/psample.c b/net/psample/psample.c index 065bc887d239..118d5d2a81a0 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -358,6 +359,7 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, u32 sample_rate, const struct psample_metadata *md) { + ktime_t tstamp = ktime_get_real(); int out_ifindex = md->out_ifindex; int in_ifindex = md->in_ifindex; u32 trunc_size = md->trunc_size; @@ -372,10 +374,15 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) + (out_ifindex ? 
nla_total_size(sizeof(u16)) : 0) + + (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) + + (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + + (md->latency_valid ? nla_total_size_64bit(sizeof(u64)) : 0) + nla_total_size(sizeof(u32)) + /* sample_rate */ nla_total_size(sizeof(u32)) + /* orig_size */ nla_total_size(sizeof(u32)) + /* group_num */ - nla_total_size(sizeof(u32)); /* seq */ + nla_total_size(sizeof(u32)) + /* seq */ + nla_total_size_64bit(sizeof(u64)) + /* timestamp */ + nla_total_size(sizeof(u16)); /* protocol */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); @@ -425,6 +432,36 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, if (unlikely(ret < 0)) goto error; + if (md->out_tc_valid) { + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc); + if (unlikely(ret < 0)) + goto error; + } + + if (md->out_tc_occ_valid) { + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC, + md->out_tc_occ, PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + } + + if (md->latency_valid) { + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY, + md->latency, PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + } + + ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP, + ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD); + if (unlikely(ret < 0)) + goto error; + + ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO, + be16_to_cpu(skb->protocol)); + if (unlikely(ret < 0)) + goto error; + if (data_len) { int nla_len = nla_total_size(data_len); struct nlattr *nla; -- cgit v1.2.3 From 9cb24ea051857f2a7ab85c42842c5baa40497e53 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 14 Mar 2021 18:24:02 +0300 Subject: atm: delete include/linux/atm_suni.h This file has been effectively empty since 2.3.99-pre3 ! Signed-off-by: Alexey Dobriyan Signed-off-by: David S. 
Miller --- drivers/atm/fore200e.c | 1 - drivers/atm/suni.c | 1 - include/linux/atm_suni.h | 12 ------------ 3 files changed, 14 deletions(-) delete mode 100644 include/linux/atm_suni.h (limited to 'include') diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index 9a70bee84125..0b9c99c3d218 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/atm/suni.c b/drivers/atm/suni.c index c920a8c52925..21e5acc766b8 100644 --- a/drivers/atm/suni.c +++ b/drivers/atm/suni.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/atm_suni.h b/include/linux/atm_suni.h deleted file mode 100644 index 84f3aab54468..000000000000 --- a/include/linux/atm_suni.h +++ /dev/null @@ -1,12 +0,0 @@ -/* atm_suni.h - Driver-specific declarations of the SUNI driver (for use by - driver-specific utilities) */ - -/* Written 1998,2000 by Werner Almesberger, EPFL ICA */ - - -#ifndef LINUX_ATM_SUNI_H -#define LINUX_ATM_SUNI_H - -/* everything obsoleted */ - -#endif -- cgit v1.2.3 From 07a4bc51fc732b3618fd46dc51609948933064a4 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Mon, 15 Mar 2021 13:27:06 +0800 Subject: net: pcs: rearrange C73 functions to prepare for C37 support later The current implementation for XPCS is validated for C73, so we rename them to have _c73 suffix and introduce a set of functions to use an_mode flag to switch between C73 and C37 AN later. Signed-off-by: Ong Boon Leong Signed-off-by: David S. 
Miller --- drivers/net/pcs/pcs-xpcs.c | 94 +++++++++++++++++++++++++++++++------------- include/linux/pcs/pcs-xpcs.h | 4 ++ 2 files changed, 70 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index 1aa9903d602e..10def2d98696 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -125,22 +125,26 @@ static struct xpcs_id { u32 mask; const int *supported; const phy_interface_t *interface; + int an_mode; } xpcs_id_list[] = { { .id = SYNOPSYS_XPCS_USXGMII_ID, .mask = SYNOPSYS_XPCS_MASK, .supported = xpcs_usxgmii_features, .interface = xpcs_usxgmii_interfaces, + .an_mode = DW_AN_C73, }, { .id = SYNOPSYS_XPCS_10GKR_ID, .mask = SYNOPSYS_XPCS_MASK, .supported = xpcs_10gkr_features, .interface = xpcs_10gkr_interfaces, + .an_mode = DW_AN_C73, }, { .id = SYNOPSYS_XPCS_XLGMII_ID, .mask = SYNOPSYS_XPCS_MASK, .supported = xpcs_xlgmii_features, .interface = xpcs_xlgmii_interfaces, + .an_mode = DW_AN_C73, }, }; @@ -195,9 +199,17 @@ static int xpcs_poll_reset(struct mdio_xpcs_args *xpcs, int dev) return (ret & MDIO_CTRL1_RESET) ? 
-ETIMEDOUT : 0; } -static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs, int dev) +static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs) { - int ret; + int ret, dev; + + switch (xpcs->an_mode) { + case DW_AN_C73: + dev = MDIO_MMD_PCS; + break; + default: + return -1; + } ret = xpcs_write(xpcs, dev, MDIO_CTRL1, MDIO_CTRL1_RESET); if (ret < 0) @@ -212,8 +224,8 @@ static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs, int dev) dev_warn(&(__xpcs)->bus->dev, ##__args); \ }) -static int xpcs_read_fault(struct mdio_xpcs_args *xpcs, - struct phylink_link_state *state) +static int xpcs_read_fault_c73(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) { int ret; @@ -263,7 +275,7 @@ static int xpcs_read_fault(struct mdio_xpcs_args *xpcs, return 0; } -static int xpcs_read_link(struct mdio_xpcs_args *xpcs, bool an) +static int xpcs_read_link_c73(struct mdio_xpcs_args *xpcs, bool an) { bool link = true; int ret; @@ -357,7 +369,7 @@ static int xpcs_config_usxgmii(struct mdio_xpcs_args *xpcs, int speed) return xpcs_write_vpcs(xpcs, MDIO_CTRL1, ret | DW_USXGMII_RST); } -static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs) +static int _xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs) { int ret, adv; @@ -401,11 +413,11 @@ static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs) return xpcs_write(xpcs, MDIO_MMD_AN, DW_SR_AN_ADV1, adv); } -static int xpcs_config_aneg(struct mdio_xpcs_args *xpcs) +static int xpcs_config_aneg_c73(struct mdio_xpcs_args *xpcs) { int ret; - ret = xpcs_config_aneg_c73(xpcs); + ret = _xpcs_config_aneg_c73(xpcs); if (ret < 0) return ret; @@ -418,8 +430,8 @@ static int xpcs_config_aneg(struct mdio_xpcs_args *xpcs) return xpcs_write(xpcs, MDIO_MMD_AN, MDIO_CTRL1, ret); } -static int xpcs_aneg_done(struct mdio_xpcs_args *xpcs, - struct phylink_link_state *state) +static int xpcs_aneg_done_c73(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) { int ret; @@ -434,7 +446,7 @@ static int xpcs_aneg_done(struct 
mdio_xpcs_args *xpcs, /* Check if Aneg outcome is valid */ if (!(ret & DW_C73_AN_ADV_SF)) { - xpcs_config_aneg(xpcs); + xpcs_config_aneg_c73(xpcs); return 0; } @@ -444,8 +456,8 @@ static int xpcs_aneg_done(struct mdio_xpcs_args *xpcs, return 0; } -static int xpcs_read_lpa(struct mdio_xpcs_args *xpcs, - struct phylink_link_state *state) +static int xpcs_read_lpa_c73(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) { int ret; @@ -493,8 +505,8 @@ static int xpcs_read_lpa(struct mdio_xpcs_args *xpcs, return 0; } -static void xpcs_resolve_lpa(struct mdio_xpcs_args *xpcs, - struct phylink_link_state *state) +static void xpcs_resolve_lpa_c73(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) { int max_speed = xpcs_get_max_usxgmii_speed(state->lp_advertising); @@ -590,27 +602,33 @@ static int xpcs_config(struct mdio_xpcs_args *xpcs, { int ret; - if (state->an_enabled) { - ret = xpcs_config_aneg(xpcs); - if (ret) - return ret; + switch (xpcs->an_mode) { + case DW_AN_C73: + if (state->an_enabled) { + ret = xpcs_config_aneg_c73(xpcs); + if (ret) + return ret; + } + break; + default: + return -1; } return 0; } -static int xpcs_get_state(struct mdio_xpcs_args *xpcs, - struct phylink_link_state *state) +static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) { int ret; /* Link needs to be read first ... */ - state->link = xpcs_read_link(xpcs, state->an_enabled) > 0 ? 1 : 0; + state->link = xpcs_read_link_c73(xpcs, state->an_enabled) > 0 ? 1 : 0; /* ... and then we check the faults. 
*/ - ret = xpcs_read_fault(xpcs, state); + ret = xpcs_read_fault_c73(xpcs, state); if (ret) { - ret = xpcs_soft_reset(xpcs, MDIO_MMD_PCS); + ret = xpcs_soft_reset(xpcs); if (ret) return ret; @@ -619,10 +637,10 @@ static int xpcs_get_state(struct mdio_xpcs_args *xpcs, return xpcs_config(xpcs, state); } - if (state->an_enabled && xpcs_aneg_done(xpcs, state)) { + if (state->an_enabled && xpcs_aneg_done_c73(xpcs, state)) { state->an_complete = true; - xpcs_read_lpa(xpcs, state); - xpcs_resolve_lpa(xpcs, state); + xpcs_read_lpa_c73(xpcs, state); + xpcs_resolve_lpa_c73(xpcs, state); } else if (state->an_enabled) { state->link = 0; } else if (state->link) { @@ -632,6 +650,24 @@ static int xpcs_get_state(struct mdio_xpcs_args *xpcs, return 0; } +static int xpcs_get_state(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) +{ + int ret; + + switch (xpcs->an_mode) { + case DW_AN_C73: + ret = xpcs_get_state_c73(xpcs, state); + if (ret) + return ret; + break; + default: + return -1; + } + + return 0; +} + static int xpcs_link_up(struct mdio_xpcs_args *xpcs, int speed, phy_interface_t interface) { @@ -676,6 +712,8 @@ static bool xpcs_check_features(struct mdio_xpcs_args *xpcs, for (i = 0; match->supported[i] != __ETHTOOL_LINK_MODE_MASK_NBITS; i++) set_bit(match->supported[i], xpcs->supported); + xpcs->an_mode = match->an_mode; + return true; } @@ -692,7 +730,7 @@ static int xpcs_probe(struct mdio_xpcs_args *xpcs, phy_interface_t interface) match = entry; if (xpcs_check_features(xpcs, match, interface)) - return xpcs_soft_reset(xpcs, MDIO_MMD_PCS); + return xpcs_soft_reset(xpcs); } } diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 351c1c9aedc5..a04e57c25fea 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -10,10 +10,14 @@ #include #include +/* AN mode */ +#define DW_AN_C73 1 + struct mdio_xpcs_args { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); struct mii_bus *bus; int addr; + int an_mode; }; struct 
mdio_xpcs_ops { -- cgit v1.2.3 From b97b5331b8ab7f60fb880e0c31c9b09b73d2fa4e Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Mon, 15 Mar 2021 13:27:07 +0800 Subject: net: pcs: add C37 SGMII AN support for intel mGbE controller XPCS IP supports C37 SGMII AN process and it is used in intel multi-GbE controller as MAC-side SGMII. Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- drivers/net/pcs/pcs-xpcs.c | 167 ++++++++++++++++++++++++++++++++++++++++++- include/linux/pcs/pcs-xpcs.h | 1 + 2 files changed, 167 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index 10def2d98696..944ba105cac1 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -15,6 +15,7 @@ #define SYNOPSYS_XPCS_USXGMII_ID 0x7996ced0 #define SYNOPSYS_XPCS_10GKR_ID 0x7996ced0 #define SYNOPSYS_XPCS_XLGMII_ID 0x7996ced0 +#define SYNOPSYS_XPCS_SGMII_ID 0x7996ced0 #define SYNOPSYS_XPCS_MASK 0xffffffff /* Vendor regs access */ @@ -57,6 +58,34 @@ #define DW_C73_2500KX BIT(0) #define DW_C73_5000KR BIT(1) +/* Clause 37 Defines */ +/* VR MII MMD registers offsets */ +#define DW_VR_MII_DIG_CTRL1 0x8000 +#define DW_VR_MII_AN_CTRL 0x8001 +#define DW_VR_MII_AN_INTR_STS 0x8002 + +/* VR_MII_DIG_CTRL1 */ +#define DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW BIT(9) + +/* VR_MII_AN_CTRL */ +#define DW_VR_MII_AN_CTRL_TX_CONFIG_SHIFT 3 +#define DW_VR_MII_TX_CONFIG_MASK BIT(3) +#define DW_VR_MII_TX_CONFIG_PHY_SIDE_SGMII 0x1 +#define DW_VR_MII_TX_CONFIG_MAC_SIDE_SGMII 0x0 +#define DW_VR_MII_AN_CTRL_PCS_MODE_SHIFT 1 +#define DW_VR_MII_PCS_MODE_MASK GENMASK(2, 1) +#define DW_VR_MII_PCS_MODE_C37_1000BASEX 0x0 +#define DW_VR_MII_PCS_MODE_C37_SGMII 0x2 + +/* VR_MII_AN_INTR_STS */ +#define DW_VR_MII_AN_STS_C37_ANSGM_FD BIT(1) +#define DW_VR_MII_AN_STS_C37_ANSGM_SP_SHIFT 2 +#define DW_VR_MII_AN_STS_C37_ANSGM_SP GENMASK(3, 2) +#define DW_VR_MII_C37_ANSGM_SP_10 0x0 +#define DW_VR_MII_C37_ANSGM_SP_100 0x1 +#define 
DW_VR_MII_C37_ANSGM_SP_1000 0x2 +#define DW_VR_MII_C37_ANSGM_SP_LNKSTS BIT(4) + static const int xpcs_usxgmii_features[] = { ETHTOOL_LINK_MODE_Pause_BIT, ETHTOOL_LINK_MODE_Asym_Pause_BIT, @@ -105,6 +134,16 @@ static const int xpcs_xlgmii_features[] = { __ETHTOOL_LINK_MODE_MASK_NBITS, }; +static const int xpcs_sgmii_features[] = { + ETHTOOL_LINK_MODE_10baseT_Half_BIT, + ETHTOOL_LINK_MODE_10baseT_Full_BIT, + ETHTOOL_LINK_MODE_100baseT_Half_BIT, + ETHTOOL_LINK_MODE_100baseT_Full_BIT, + ETHTOOL_LINK_MODE_1000baseT_Half_BIT, + ETHTOOL_LINK_MODE_1000baseT_Full_BIT, + __ETHTOOL_LINK_MODE_MASK_NBITS, +}; + static const phy_interface_t xpcs_usxgmii_interfaces[] = { PHY_INTERFACE_MODE_USXGMII, PHY_INTERFACE_MODE_MAX, @@ -120,6 +159,11 @@ static const phy_interface_t xpcs_xlgmii_interfaces[] = { PHY_INTERFACE_MODE_MAX, }; +static const phy_interface_t xpcs_sgmii_interfaces[] = { + PHY_INTERFACE_MODE_SGMII, + PHY_INTERFACE_MODE_MAX, +}; + static struct xpcs_id { u32 id; u32 mask; @@ -145,6 +189,12 @@ static struct xpcs_id { .supported = xpcs_xlgmii_features, .interface = xpcs_xlgmii_interfaces, .an_mode = DW_AN_C73, + }, { + .id = SYNOPSYS_XPCS_SGMII_ID, + .mask = SYNOPSYS_XPCS_MASK, + .supported = xpcs_sgmii_features, + .interface = xpcs_sgmii_interfaces, + .an_mode = DW_AN_C37_SGMII, }, }; @@ -207,6 +257,9 @@ static int xpcs_soft_reset(struct mdio_xpcs_args *xpcs) case DW_AN_C73: dev = MDIO_MMD_PCS; break; + case DW_AN_C37_SGMII: + dev = MDIO_MMD_VEND2; + break; default: return -1; } @@ -597,6 +650,47 @@ static int xpcs_validate(struct mdio_xpcs_args *xpcs, return 0; } +static int xpcs_config_aneg_c37_sgmii(struct mdio_xpcs_args *xpcs) +{ + int ret; + + /* For AN for C37 SGMII mode, the settings are :- + * 1) VR_MII_AN_CTRL Bit(2:1)[PCS_MODE] = 10b (SGMII AN) + * 2) VR_MII_AN_CTRL Bit(3) [TX_CONFIG] = 0b (MAC side SGMII) + * DW xPCS used with DW EQoS MAC is always MAC side SGMII. 
+ * 3) VR_MII_DIG_CTRL1 Bit(9) [MAC_AUTO_SW] = 1b (Automatic + * speed/duplex mode change by HW after SGMII AN complete) + * + * Note: Since it is MAC side SGMII, there is no need to set + * SR_MII_AN_ADV. MAC side SGMII receives AN Tx Config from + * PHY about the link state change after C28 AN is completed + * between PHY and Link Partner. There is also no need to + * trigger AN restart for MAC-side SGMII. + */ + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_CTRL); + if (ret < 0) + return ret; + + ret &= ~(DW_VR_MII_PCS_MODE_MASK | DW_VR_MII_TX_CONFIG_MASK); + ret |= (DW_VR_MII_PCS_MODE_C37_SGMII << + DW_VR_MII_AN_CTRL_PCS_MODE_SHIFT & + DW_VR_MII_PCS_MODE_MASK); + ret |= (DW_VR_MII_TX_CONFIG_MAC_SIDE_SGMII << + DW_VR_MII_AN_CTRL_TX_CONFIG_SHIFT & + DW_VR_MII_TX_CONFIG_MASK); + ret = xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_CTRL, ret); + if (ret < 0) + return ret; + + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1); + if (ret < 0) + return ret; + + ret |= DW_VR_MII_DIG_CTRL1_MAC_AUTO_SW; + + return xpcs_write(xpcs, MDIO_MMD_VEND2, DW_VR_MII_DIG_CTRL1, ret); +} + static int xpcs_config(struct mdio_xpcs_args *xpcs, const struct phylink_link_state *state) { @@ -610,6 +704,11 @@ static int xpcs_config(struct mdio_xpcs_args *xpcs, return ret; } break; + case DW_AN_C37_SGMII: + ret = xpcs_config_aneg_c37_sgmii(xpcs); + if (ret) + return ret; + break; default: return -1; } @@ -650,6 +749,47 @@ static int xpcs_get_state_c73(struct mdio_xpcs_args *xpcs, return 0; } +static int xpcs_get_state_c37_sgmii(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state) +{ + int ret; + + /* Reset link_state */ + state->link = false; + state->speed = SPEED_UNKNOWN; + state->duplex = DUPLEX_UNKNOWN; + state->pause = 0; + + /* For C37 SGMII mode, we check DW_VR_MII_AN_INTR_STS for link + * status, speed and duplex. 
+ */ + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_INTR_STS); + if (ret < 0) + return false; + + if (ret & DW_VR_MII_C37_ANSGM_SP_LNKSTS) { + int speed_value; + + state->link = true; + + speed_value = (ret & DW_VR_MII_AN_STS_C37_ANSGM_SP) >> + DW_VR_MII_AN_STS_C37_ANSGM_SP_SHIFT; + if (speed_value == DW_VR_MII_C37_ANSGM_SP_1000) + state->speed = SPEED_1000; + else if (speed_value == DW_VR_MII_C37_ANSGM_SP_100) + state->speed = SPEED_100; + else + state->speed = SPEED_10; + + if (ret & DW_VR_MII_AN_STS_C37_ANSGM_FD) + state->duplex = DUPLEX_FULL; + else + state->duplex = DUPLEX_HALF; + } + + return 0; +} + static int xpcs_get_state(struct mdio_xpcs_args *xpcs, struct phylink_link_state *state) { @@ -661,6 +801,11 @@ static int xpcs_get_state(struct mdio_xpcs_args *xpcs, if (ret) return ret; break; + case DW_AN_C37_SGMII: + ret = xpcs_get_state_c37_sgmii(xpcs, state); + if (ret) + return ret; + break; default: return -1; } @@ -682,6 +827,7 @@ static u32 xpcs_get_id(struct mdio_xpcs_args *xpcs) int ret; u32 id; + /* First, search C73 PCS using PCS MMD */ ret = xpcs_read(xpcs, MDIO_MMD_PCS, MII_PHYSID1); if (ret < 0) return 0xffffffff; @@ -692,7 +838,26 @@ static u32 xpcs_get_id(struct mdio_xpcs_args *xpcs) if (ret < 0) return 0xffffffff; - return id | ret; + /* If Device IDs are not all zeros, we found C73 AN-type device */ + if (id | ret) + return id | ret; + + /* Next, search C37 PCS using Vendor-Specific MII MMD */ + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, MII_PHYSID1); + if (ret < 0) + return 0xffffffff; + + id = ret << 16; + + ret = xpcs_read(xpcs, MDIO_MMD_VEND2, MII_PHYSID2); + if (ret < 0) + return 0xffffffff; + + /* If Device IDs are not all zeros, we found C37 AN-type device */ + if (id | ret) + return id | ret; + + return 0xffffffff; } static bool xpcs_check_features(struct mdio_xpcs_args *xpcs, diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index a04e57c25fea..2cb5188a7ef1 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ 
b/include/linux/pcs/pcs-xpcs.h @@ -12,6 +12,7 @@ /* AN mode */ #define DW_AN_C73 1 +#define DW_AN_C37_SGMII 2 struct mdio_xpcs_args { __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); -- cgit v1.2.3 From ab39385021d1e0b4cd6cc521dc35c2fe659bbddf Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Mon, 15 Mar 2021 13:27:08 +0800 Subject: net: phylink: make phylink_parse_mode() support non-DT platform Certain platforms do not support DT, so we make phylink_parse_mode() allow non-DT platforms to use it to set up in-band AN advertising. Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 5 +++-- include/linux/phylink.h | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 053c92e02cd8..12a047d47dec 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -271,8 +271,9 @@ static int phylink_parse_mode(struct phylink *pl, struct fwnode_handle *fwnode) pl->cfg_link_an_mode = MLO_AN_FIXED; fwnode_handle_put(dn); - if (fwnode_property_read_string(fwnode, "managed", &managed) == 0 && - strcmp(managed, "in-band-status") == 0) { + if ((fwnode_property_read_string(fwnode, "managed", &managed) == 0 && + strcmp(managed, "in-band-status") == 0) || + pl->config->ovr_an_inband) { if (pl->cfg_link_an_mode == MLO_AN_FIXED) { phylink_err(pl, "can't use both fixed-link and in-band-status\n"); diff --git a/include/linux/phylink.h b/include/linux/phylink.h index d81a714cfbbd..fd2acfd9b597 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -64,6 +64,7 @@ enum phylink_op_type { * @pcs_poll: MAC PCS cannot provide link change interrupt * @poll_fixed_state: if true, starts link_poll, * if MAC link is at %MLO_AN_FIXED mode. + * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND * @get_fixed_state: callback to execute to determine the fixed link state, * if MAC link is at %MLO_AN_FIXED mode.
*/ @@ -72,6 +73,7 @@ struct phylink_config { enum phylink_op_type type; bool pcs_poll; bool poll_fixed_state; + bool ovr_an_inband; void (*get_fixed_state)(struct phylink_config *config, struct phylink_link_state *state); }; -- cgit v1.2.3 From e5e5b771f684c22b25c67df85d2deb43901f7b95 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Mon, 15 Mar 2021 13:27:09 +0800 Subject: net: stmmac: make in-band AN mode parsing is supported for non-DT Not all platform uses DT, so phylink_parse_mode() will skip in-band setup of pl->supported and pl->link_config.advertising entirely. So, we add the setting of ovr_an_inband flag to make it works for non-DT platform. Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 ++ include/linux/stmmac.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 208cae344ffa..b64ee029d41f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1117,6 +1117,8 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) priv->phylink_config.dev = &priv->dev->dev; priv->phylink_config.type = PHYLINK_NETDEV; priv->phylink_config.pcs_poll = true; + priv->phylink_config.ovr_an_inband = + priv->plat->mdio_bus_data->xpcs_an_inband; if (!fwnode) fwnode = dev_fwnode(priv->device); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index a302982de2d7..722dc167b5c9 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -81,6 +81,7 @@ struct stmmac_mdio_bus_data { unsigned int phy_mask; unsigned int has_xpcs; + unsigned int xpcs_an_inband; int *irqs; int probed_phy_irq; bool needs_reset; -- cgit v1.2.3 From 7cf3b1dd6aa603fd80969e9e7160becf1455a0eb Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 15 Mar 2021 13:04:37 -0700 Subject: Bluetooth: L2CAP: Fix not checking 
for maximum number of DCID When receiving L2CAP_CREDIT_BASED_CONNECTION_REQ the remote may request more channels than allowed by the spec (10 octets = 5 CIDs) so this checks if the number of channels is bigger than the maximum allowed and responds with an error. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + net/bluetooth/l2cap_core.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 61800a7b6192..3c4f550e5a8b 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -494,6 +494,7 @@ struct l2cap_le_credits { #define L2CAP_ECRED_MIN_MTU 64 #define L2CAP_ECRED_MIN_MPS 64 +#define L2CAP_ECRED_MAX_CID 5 struct l2cap_ecred_conn_req { __le16 psm; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 72c2f5226d67..374cc32d7138 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5921,7 +5921,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, struct l2cap_ecred_conn_req *req = (void *) data; struct { struct l2cap_ecred_conn_rsp rsp; - __le16 dcid[5]; + __le16 dcid[L2CAP_ECRED_MAX_CID]; } __packed pdu; struct l2cap_chan *chan, *pchan; u16 mtu, mps; @@ -5938,6 +5938,14 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, goto response; } + cmd_len -= sizeof(*req); + num_scid = cmd_len / sizeof(u16); + + if (num_scid > ARRAY_SIZE(pdu.dcid)) { + result = L2CAP_CR_LE_INVALID_PARAMS; + goto response; + } + mtu = __le16_to_cpu(req->mtu); mps = __le16_to_cpu(req->mps); @@ -5970,8 +5978,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, } result = L2CAP_CR_LE_SUCCESS; - cmd_len -= sizeof(*req); - num_scid = cmd_len / sizeof(u16); for (i = 0; i < num_scid; i++) { u16 scid = __le16_to_cpu(req->scid[i]); -- cgit v1.2.3 From b4d45aee6635197d257f3469413837cd94fc11f4 Mon Sep 17 00:00:00
2001 From: Joakim Zhang Date: Mon, 15 Mar 2021 20:16:47 +0800 Subject: net: stmmac: add platform level clocks management This patch intends to add platform level clocks management. Some platforms may have their own special clocks, they also need to be managed dynamically. If you want to manage such clocks, please implement clks_config callback. Reviewed-by: Andrew Lunn Signed-off-by: Joakim Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 10 ++++++++++ include/linux/stmmac.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 3c50846f59cd..a10704d8e3c6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -127,9 +127,19 @@ int stmmac_bus_clks_config(struct stmmac_priv *priv, bool enabled) clk_disable_unprepare(priv->plat->stmmac_clk); return ret; } + if (priv->plat->clks_config) { + ret = priv->plat->clks_config(priv->plat->bsp_priv, enabled); + if (ret) { + clk_disable_unprepare(priv->plat->stmmac_clk); + clk_disable_unprepare(priv->plat->pclk); + return ret; + } + } } else { clk_disable_unprepare(priv->plat->stmmac_clk); clk_disable_unprepare(priv->plat->pclk); + if (priv->plat->clks_config) + priv->plat->clks_config(priv->plat->bsp_priv, enabled); } return ret; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 722dc167b5c9..51004ebd0540 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -184,6 +184,7 @@ struct plat_stmmacenet_data { int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); struct mac_device_info *(*setup)(void *priv); + int (*clks_config)(void *priv, bool enabled); void *bsp_priv; struct clk *stmmac_clk; struct clk *pclk; -- cgit v1.2.3 From 6e3bac3eba448a438840ab8152cb8bbfcb8787b8 Mon Sep 17 00:00:00 2001 From: Ivan 
Bornyakov Date: Mon, 15 Mar 2021 17:19:26 +0300 Subject: net: phy: add Marvell 88X2222 transceiver support Add basic support for the Marvell 88X2222 multi-speed ethernet transceiver. This PHY provides data transmission over fiber-optic as well as Twinax copper links. The 88X2222 supports 2 ports of 10GBase-R and 1000Base-X on the line-side interface. The host-side interface supports 4 ports of 10GBase-R, RXAUI, 1000Base-X and 2 ports of XAUI. This driver, however, supports only XAUI on the host-side and 1000Base-X/10GBase-R on the line-side, for now. The SGMII is also supported over 1000Base-X. Interrupts are not supported. Internal registers access compliant with the Clause 45 specification. Signed-off-by: Ivan Bornyakov Signed-off-by: David S. Miller --- drivers/net/phy/Kconfig | 6 + drivers/net/phy/Makefile | 1 + drivers/net/phy/marvell-88x2222.c | 519 ++++++++++++++++++++++++++++++++++++++ include/linux/marvell_phy.h | 1 + 4 files changed, 527 insertions(+) create mode 100644 drivers/net/phy/marvell-88x2222.c (limited to 'include') diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index 698bea312adc..a615b3660b05 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -201,6 +201,12 @@ config MARVELL_10G_PHY help Support for the Marvell Alaska MV88X3310 and compatible PHYs. +config MARVELL_88X2222_PHY + tristate "Marvell 88X2222 PHY" + help + Support for the Marvell 88X2222 Dual-port Multi-speed Ethernet + Transceiver. 
+ config MICREL_PHY tristate "Micrel PHYs" help diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index a13e402074cf..de683e3abe63 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_LSI_ET1011C_PHY) += et1011c.o obj-$(CONFIG_LXT_PHY) += lxt.o obj-$(CONFIG_MARVELL_10G_PHY) += marvell10g.o obj-$(CONFIG_MARVELL_PHY) += marvell.o +obj-$(CONFIG_MARVELL_88X2222_PHY) += marvell-88x2222.o obj-$(CONFIG_MESON_GXL_PHY) += meson-gxl.o obj-$(CONFIG_MICREL_KS8995MA) += spi_ks8995.o obj-$(CONFIG_MICREL_PHY) += micrel.o diff --git a/drivers/net/phy/marvell-88x2222.c b/drivers/net/phy/marvell-88x2222.c new file mode 100644 index 000000000000..eca8c2f20684 --- /dev/null +++ b/drivers/net/phy/marvell-88x2222.c @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Marvell 88x2222 dual-port multi-speed ethernet transceiver. + * + * Supports: + * XAUI on the host side. + * 1000Base-X or 10GBase-R on the line side. + * SGMII over 1000Base-X. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Port PCS Configuration */ +#define MV_PCS_CONFIG 0xF002 +#define MV_PCS_HOST_XAUI 0x73 +#define MV_PCS_LINE_10GBR (0x71 << 8) +#define MV_PCS_LINE_1GBX_AN (0x7B << 8) +#define MV_PCS_LINE_SGMII_AN (0x7F << 8) + +/* Port Reset and Power Down */ +#define MV_PORT_RST 0xF003 +#define MV_LINE_RST_SW BIT(15) +#define MV_HOST_RST_SW BIT(7) +#define MV_PORT_RST_SW (MV_LINE_RST_SW | MV_HOST_RST_SW) + +/* 1000Base-X/SGMII Control Register */ +#define MV_1GBX_CTRL (0x2000 + MII_BMCR) + +/* 1000BASE-X/SGMII Status Register */ +#define MV_1GBX_STAT (0x2000 + MII_BMSR) + +/* 1000Base-X Auto-Negotiation Advertisement Register */ +#define MV_1GBX_ADVERTISE (0x2000 + MII_ADVERTISE) + +/* 1000Base-X PHY Specific Status Register */ +#define MV_1GBX_PHY_STAT 0xA003 +#define MV_1GBX_PHY_STAT_AN_RESOLVED BIT(11) +#define MV_1GBX_PHY_STAT_DUPLEX BIT(13) +#define MV_1GBX_PHY_STAT_SPEED100 BIT(14) +#define MV_1GBX_PHY_STAT_SPEED1000 BIT(15) + +struct mv2222_data { + phy_interface_t line_interface; + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); +}; + +/* SFI PMA transmit enable */ +static int mv2222_tx_enable(struct phy_device *phydev) +{ + return phy_clear_bits_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_TXDIS, + MDIO_PMD_TXDIS_GLOBAL); +} + +/* SFI PMA transmit disable */ +static int mv2222_tx_disable(struct phy_device *phydev) +{ + return phy_set_bits_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_PMA_TXDIS, + MDIO_PMD_TXDIS_GLOBAL); +} + +static int mv2222_soft_reset(struct phy_device *phydev) +{ + int val, ret; + + ret = phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_PORT_RST, + MV_PORT_RST_SW); + if (ret < 0) + return ret; + + return phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND2, MV_PORT_RST, + val, !(val & MV_PORT_RST_SW), + 5000, 1000000, true); +} + +/* Returns negative on error, 0 if link is down, 1 if link is up */ +static int mv2222_read_status_10g(struct phy_device *phydev) +{ + 
int val, link = 0; + + val = phy_read_mmd(phydev, MDIO_MMD_PCS, MDIO_STAT1); + if (val < 0) + return val; + + if (val & MDIO_STAT1_LSTATUS) { + link = 1; + + /* 10GBASE-R do not support auto-negotiation */ + phydev->autoneg = AUTONEG_DISABLE; + phydev->speed = SPEED_10000; + phydev->duplex = DUPLEX_FULL; + } + + return link; +} + +/* Returns negative on error, 0 if link is down, 1 if link is up */ +static int mv2222_read_status_1g(struct phy_device *phydev) +{ + int val, link = 0; + + val = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_STAT); + if (val < 0) + return val; + + if (!(val & BMSR_LSTATUS) || + (phydev->autoneg == AUTONEG_ENABLE && + !(val & BMSR_ANEGCOMPLETE))) + return 0; + + link = 1; + + val = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_PHY_STAT); + if (val < 0) + return val; + + if (val & MV_1GBX_PHY_STAT_AN_RESOLVED) { + if (val & MV_1GBX_PHY_STAT_DUPLEX) + phydev->duplex = DUPLEX_FULL; + else + phydev->duplex = DUPLEX_HALF; + + if (val & MV_1GBX_PHY_STAT_SPEED1000) + phydev->speed = SPEED_1000; + else if (val & MV_1GBX_PHY_STAT_SPEED100) + phydev->speed = SPEED_100; + else + phydev->speed = SPEED_10; + } + + return link; +} + +static int mv2222_read_status(struct phy_device *phydev) +{ + struct mv2222_data *priv = phydev->priv; + int link; + + phydev->link = 0; + phydev->speed = SPEED_UNKNOWN; + phydev->duplex = DUPLEX_UNKNOWN; + + if (priv->line_interface == PHY_INTERFACE_MODE_10GBASER) + link = mv2222_read_status_10g(phydev); + else + link = mv2222_read_status_1g(phydev); + + if (link < 0) + return link; + + phydev->link = link; + + return 0; +} + +static int mv2222_disable_aneg(struct phy_device *phydev) +{ + int ret = phy_clear_bits_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_CTRL, + BMCR_ANENABLE | BMCR_ANRESTART); + if (ret < 0) + return ret; + + return mv2222_soft_reset(phydev); +} + +static int mv2222_enable_aneg(struct phy_device *phydev) +{ + int ret = phy_set_bits_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_CTRL, + BMCR_ANENABLE | BMCR_RESET); + if (ret < 0) + 
return ret; + + return mv2222_soft_reset(phydev); +} + +static int mv2222_set_sgmii_speed(struct phy_device *phydev) +{ + struct mv2222_data *priv = phydev->priv; + + switch (phydev->speed) { + default: + case SPEED_1000: + if ((linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, + priv->supported) || + linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, + priv->supported))) + return phy_modify_mmd(phydev, MDIO_MMD_PCS, + MV_1GBX_CTRL, + BMCR_SPEED1000 | BMCR_SPEED100, + BMCR_SPEED1000); + + fallthrough; + case SPEED_100: + if ((linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, + priv->supported) || + linkmode_test_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, + priv->supported))) + return phy_modify_mmd(phydev, MDIO_MMD_PCS, + MV_1GBX_CTRL, + BMCR_SPEED1000 | BMCR_SPEED100, + BMCR_SPEED100); + fallthrough; + case SPEED_10: + if ((linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, + priv->supported) || + linkmode_test_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, + priv->supported))) + return phy_modify_mmd(phydev, MDIO_MMD_PCS, + MV_1GBX_CTRL, + BMCR_SPEED1000 | BMCR_SPEED100, + BMCR_SPEED10); + + return -EINVAL; + } +} + +static bool mv2222_is_10g_capable(struct phy_device *phydev) +{ + struct mv2222_data *priv = phydev->priv; + + return (linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT, + priv->supported) || + linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseCR_Full_BIT, + priv->supported) || + linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseSR_Full_BIT, + priv->supported) || + linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseLR_Full_BIT, + priv->supported) || + linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT, + priv->supported) || + linkmode_test_bit(ETHTOOL_LINK_MODE_10000baseER_Full_BIT, + priv->supported)); +} + +static bool mv2222_is_1gbx_capable(struct phy_device *phydev) +{ + struct mv2222_data *priv = phydev->priv; + + return linkmode_test_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT, + priv->supported); +} + +static int mv2222_config_line(struct 
phy_device *phydev) +{ + struct mv2222_data *priv = phydev->priv; + + switch (priv->line_interface) { + case PHY_INTERFACE_MODE_10GBASER: + return phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_PCS_CONFIG, + MV_PCS_HOST_XAUI | MV_PCS_LINE_10GBR); + case PHY_INTERFACE_MODE_1000BASEX: + return phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_PCS_CONFIG, + MV_PCS_HOST_XAUI | MV_PCS_LINE_1GBX_AN); + case PHY_INTERFACE_MODE_SGMII: + return phy_write_mmd(phydev, MDIO_MMD_VEND2, MV_PCS_CONFIG, + MV_PCS_HOST_XAUI | MV_PCS_LINE_SGMII_AN); + default: + return -EINVAL; + } +} + +static int mv2222_setup_forced(struct phy_device *phydev) +{ + struct mv2222_data *priv = phydev->priv; + bool changed = false; + int ret; + + switch (priv->line_interface) { + case PHY_INTERFACE_MODE_10GBASER: + if (phydev->speed == SPEED_1000 && + mv2222_is_1gbx_capable(phydev)) { + priv->line_interface = PHY_INTERFACE_MODE_1000BASEX; + changed = true; + } + + break; + case PHY_INTERFACE_MODE_1000BASEX: + if (phydev->speed == SPEED_10000 && + mv2222_is_10g_capable(phydev)) { + priv->line_interface = PHY_INTERFACE_MODE_10GBASER; + changed = true; + } + + break; + case PHY_INTERFACE_MODE_SGMII: + ret = mv2222_set_sgmii_speed(phydev); + if (ret < 0) + return ret; + + break; + default: + return -EINVAL; + } + + if (changed) { + ret = mv2222_config_line(phydev); + if (ret < 0) + return ret; + } + + return mv2222_disable_aneg(phydev); +} + +static int mv2222_config_aneg(struct phy_device *phydev) +{ + struct mv2222_data *priv = phydev->priv; + int ret, adv; + + /* SFP is not present, do nothing */ + if (priv->line_interface == PHY_INTERFACE_MODE_NA) + return 0; + + if (phydev->autoneg == AUTONEG_DISABLE || + phydev->speed == SPEED_10000) + return mv2222_setup_forced(phydev); + + if (priv->line_interface == PHY_INTERFACE_MODE_10GBASER && + mv2222_is_1gbx_capable(phydev)) { + priv->line_interface = PHY_INTERFACE_MODE_1000BASEX; + ret = mv2222_config_line(phydev); + if (ret < 0) + return ret; + } + + adv = 
linkmode_adv_to_mii_adv_x(priv->supported, + ETHTOOL_LINK_MODE_1000baseX_Full_BIT); + + ret = phy_modify_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_ADVERTISE, + ADVERTISE_1000XFULL | + ADVERTISE_1000XPAUSE | ADVERTISE_1000XPSE_ASYM, + adv); + if (ret < 0) + return ret; + + return mv2222_enable_aneg(phydev); +} + +static int mv2222_aneg_done(struct phy_device *phydev) +{ + int ret; + + if (mv2222_is_10g_capable(phydev)) { + ret = phy_read_mmd(phydev, MDIO_MMD_PCS, MDIO_STAT1); + if (ret < 0) + return ret; + + if (ret & MDIO_STAT1_LSTATUS) + return 1; + } + + ret = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_1GBX_STAT); + if (ret < 0) + return ret; + + return (ret & BMSR_ANEGCOMPLETE); +} + +static int mv2222_resume(struct phy_device *phydev) +{ + return mv2222_tx_enable(phydev); +} + +static int mv2222_suspend(struct phy_device *phydev) +{ + return mv2222_tx_disable(phydev); +} + +static int mv2222_get_features(struct phy_device *phydev) +{ + /* All supported linkmodes are set at probe */ + + return 0; +} + +static int mv2222_config_init(struct phy_device *phydev) +{ + if (phydev->interface != PHY_INTERFACE_MODE_XAUI) + return -EINVAL; + + phydev->autoneg = AUTONEG_DISABLE; + + return 0; +} + +static int mv2222_sfp_insert(void *upstream, const struct sfp_eeprom_id *id) +{ + struct phy_device *phydev = upstream; + phy_interface_t sfp_interface; + struct mv2222_data *priv; + struct device *dev; + int ret; + + __ETHTOOL_DECLARE_LINK_MODE_MASK(sfp_supported) = { 0, }; + + priv = (struct mv2222_data *)phydev->priv; + dev = &phydev->mdio.dev; + + sfp_parse_support(phydev->sfp_bus, id, sfp_supported); + sfp_interface = sfp_select_interface(phydev->sfp_bus, sfp_supported); + + dev_info(dev, "%s SFP module inserted\n", phy_modes(sfp_interface)); + + if (sfp_interface != PHY_INTERFACE_MODE_10GBASER && + sfp_interface != PHY_INTERFACE_MODE_1000BASEX && + sfp_interface != PHY_INTERFACE_MODE_SGMII) { + dev_err(dev, "Incompatible SFP module inserted\n"); + + return -EINVAL; + } + + 
priv->line_interface = sfp_interface; + linkmode_and(priv->supported, phydev->supported, sfp_supported); + + ret = mv2222_config_line(phydev); + if (ret < 0) + return ret; + + if (mutex_trylock(&phydev->lock)) { + if (priv->line_interface == PHY_INTERFACE_MODE_10GBASER) + ret = mv2222_setup_forced(phydev); + else + ret = mv2222_config_aneg(phydev); + + mutex_unlock(&phydev->lock); + } + + return ret; +} + +static void mv2222_sfp_remove(void *upstream) +{ + struct phy_device *phydev = upstream; + struct mv2222_data *priv; + + priv = (struct mv2222_data *)phydev->priv; + + priv->line_interface = PHY_INTERFACE_MODE_NA; + linkmode_zero(priv->supported); +} + +static const struct sfp_upstream_ops sfp_phy_ops = { + .module_insert = mv2222_sfp_insert, + .module_remove = mv2222_sfp_remove, + .attach = phy_sfp_attach, + .detach = phy_sfp_detach, +}; + +static int mv2222_probe(struct phy_device *phydev) +{ + struct device *dev = &phydev->mdio.dev; + struct mv2222_data *priv = NULL; + + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported) = { 0, }; + + linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_Pause_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_FIBRE_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_TP_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseCR_Full_BIT, supported); + 
linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseSR_Full_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseLR_Full_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT, supported); + linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseER_Full_BIT, supported); + + linkmode_copy(phydev->supported, supported); + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->line_interface = PHY_INTERFACE_MODE_NA; + phydev->priv = priv; + + return phy_sfp_probe(phydev, &sfp_phy_ops); +} + +static struct phy_driver mv2222_drivers[] = { + { + .phy_id = MARVELL_PHY_ID_88X2222, + .phy_id_mask = MARVELL_PHY_ID_MASK, + .name = "Marvell 88X2222", + .get_features = mv2222_get_features, + .soft_reset = mv2222_soft_reset, + .config_init = mv2222_config_init, + .config_aneg = mv2222_config_aneg, + .aneg_done = mv2222_aneg_done, + .probe = mv2222_probe, + .suspend = mv2222_suspend, + .resume = mv2222_resume, + .read_status = mv2222_read_status, + }, +}; +module_phy_driver(mv2222_drivers); + +static struct mdio_device_id __maybe_unused mv2222_tbl[] = { + { MARVELL_PHY_ID_88X2222, MARVELL_PHY_ID_MASK }, + { } +}; +MODULE_DEVICE_TABLE(mdio, mv2222_tbl); + +MODULE_DESCRIPTION("Marvell 88x2222 ethernet transceiver driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index 52b1610eae68..274abd5fbac3 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -24,6 +24,7 @@ #define MARVELL_PHY_ID_88E3016 0x01410e60 #define MARVELL_PHY_ID_88X3310 0x002b09a0 #define MARVELL_PHY_ID_88E2110 0x002b09b0 +#define MARVELL_PHY_ID_88X2222 0x01410f10 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */ #define MARVELL_PHY_ID_88E1111_FINISAR 0x01ff0cc0 -- cgit v1.2.3 From 45f3a13c816656c9d3d311880d90286341644d9b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Mar 2021 16:51:46 -0500 Subject: net: qualcomm: rmnet: mark trailer field endianness The fields in 
the checksum trailer structure used for QMAP protocol RX packets are all big-endian format, so define them that way. It turns out these fields are never actually used by the RMNet code. The start offset is always assumed to be zero, and the length is taken from the other packet headers. So making these fields explicitly big endian has no effect on the behavior of the code. Signed-off-by: Alex Elder Reviewed-by: Bjorn Andersson Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/if_rmnet.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h index 9661416a9bb4..8c7845baf383 100644 --- a/include/linux/if_rmnet.h +++ b/include/linux/if_rmnet.h @@ -32,8 +32,8 @@ struct rmnet_map_dl_csum_trailer { #else #error "Please fix <asm/byteorder.h>" #endif - u16 csum_start_offset; - u16 csum_length; + __be16 csum_start_offset; + __be16 csum_length; __be16 csum_value; } __aligned(1); -- cgit v1.2.3 From 16653c16d282e768763b2e8cc78f75df8fd53992 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Mar 2021 16:51:49 -0500 Subject: net: qualcomm: rmnet: use masks instead of C bit-fields The actual layout of bits defined in C bit-fields (e.g. int foo : 3) is implementation-defined. Structures defined in <linux/if_rmnet.h> address this by specifying all bit-fields twice, to cover two possible layouts. I think this pattern is repetitive and noisy, and I find the whole notion of compiler "bitfield endianness" to be non-intuitive. Stop using C bit-fields for the command/data flag and the pad length fields in the rmnet_map structure, and define a single-byte flags field instead. Define a mask for the single-bit "command" flag, and another mask for the encoded pad length. The content of both fields can be accessed using a simple bitwise AND operation. Signed-off-by: Alex Elder Reviewed-by: Bjorn Andersson Reviewed-by: Alexander Duyck Signed-off-by: David S.
Miller --- .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c | 4 ++-- .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 4 +++- include/linux/if_rmnet.h | 23 ++++++++++------------ 3 files changed, 15 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c index 2a6b2a609884..0be5ac7ab261 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c @@ -61,7 +61,7 @@ __rmnet_map_ingress_handler(struct sk_buff *skb, u16 len, pad; u8 mux_id; - if (map_header->cd_bit) { + if (map_header->flags & MAP_CMD_FLAG) { /* Packet contains a MAP command (not data) */ if (port->data_format & RMNET_FLAGS_INGRESS_MAP_COMMANDS) return rmnet_map_command(skb, port); @@ -70,7 +70,7 @@ __rmnet_map_ingress_handler(struct sk_buff *skb, } mux_id = map_header->mux_id; - pad = map_header->pad_len; + pad = map_header->flags & MAP_PAD_LEN_MASK; len = ntohs(map_header->pkt_len) - pad; if (mux_id >= RMNET_MAX_LOGICAL_EP) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c index 3af68368fc31..e7d0394cb297 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -280,6 +280,7 @@ struct rmnet_map_header *rmnet_map_add_map_header(struct sk_buff *skb, return map_header; } + BUILD_BUG_ON(MAP_PAD_LEN_MASK < 3); padding = ALIGN(map_datalen, 4) - map_datalen; if (padding == 0) @@ -293,7 +294,8 @@ struct rmnet_map_header *rmnet_map_add_map_header(struct sk_buff *skb, done: map_header->pkt_len = htons(map_datalen + padding); - map_header->pad_len = padding & 0x3F; + /* This is a data packet, so the CMD bit is 0 */ + map_header->flags = padding & MAP_PAD_LEN_MASK; return map_header; } diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h index 8c7845baf383..a02f0a3df1d9 100644 --- 
a/include/linux/if_rmnet.h +++ b/include/linux/if_rmnet.h @@ -6,21 +6,18 @@ #define _LINUX_IF_RMNET_H_ struct rmnet_map_header { -#if defined(__LITTLE_ENDIAN_BITFIELD) - u8 pad_len:6; - u8 reserved_bit:1; - u8 cd_bit:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - u8 cd_bit:1; - u8 reserved_bit:1; - u8 pad_len:6; -#else -#error "Please fix <asm/byteorder.h>" -#endif - u8 mux_id; - __be16 pkt_len; + u8 flags; /* MAP_CMD_FLAG, MAP_PAD_LEN_MASK */ + u8 mux_id; + __be16 pkt_len; /* Length of packet, including pad */ } __aligned(1); +/* rmnet_map_header flags field: + * PAD_LEN: number of pad bytes following packet data + * CMD: 1 = packet contains a MAP command; 0 = packet contains data + */ +#define MAP_PAD_LEN_MASK GENMASK(5, 0) +#define MAP_CMD_FLAG BIT(7) + struct rmnet_map_dl_csum_trailer { u8 reserved1; #if defined(__LITTLE_ENDIAN_BITFIELD) -- cgit v1.2.3 From cc1b21ba6251c8dd8e4e86018c9fdba85df0d219 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Mar 2021 16:51:50 -0500 Subject: net: qualcomm: rmnet: don't use C bit-fields in rmnet checksum trailer Replace the use of C bit-fields in the rmnet_map_dl_csum_trailer structure with a single one-byte field, using constant field masks to encode or get at embedded values. Signed-off-by: Alex Elder Reviewed-by: Bjorn Andersson Reviewed-by: Alexander Duyck Signed-off-by: David S.
Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 2 +- include/linux/if_rmnet.h | 17 +++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c index e7d0394cb297..c336c17e01fe 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -359,7 +359,7 @@ int rmnet_map_checksum_downlink_packet(struct sk_buff *skb, u16 len) csum_trailer = (struct rmnet_map_dl_csum_trailer *)(skb->data + len); - if (!csum_trailer->valid) { + if (!(csum_trailer->flags & MAP_CSUM_DL_VALID_FLAG)) { priv->stats.csum_valid_unset++; return -EINVAL; } diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h index a02f0a3df1d9..941997df9e08 100644 --- a/include/linux/if_rmnet.h +++ b/include/linux/if_rmnet.h @@ -19,21 +19,18 @@ struct rmnet_map_header { #define MAP_CMD_FLAG BIT(7) struct rmnet_map_dl_csum_trailer { - u8 reserved1; -#if defined(__LITTLE_ENDIAN_BITFIELD) - u8 valid:1; - u8 reserved2:7; -#elif defined (__BIG_ENDIAN_BITFIELD) - u8 reserved2:7; - u8 valid:1; -#else -#error "Please fix " -#endif + u8 reserved1; + u8 flags; /* MAP_CSUM_DL_VALID_FLAG */ __be16 csum_start_offset; __be16 csum_length; __be16 csum_value; } __aligned(1); +/* rmnet_map_dl_csum_trailer flags field: + * VALID: 1 = checksum and length valid; 0 = ignore them + */ +#define MAP_CSUM_DL_VALID_FLAG BIT(0) + struct rmnet_map_ul_csum_header { __be16 csum_start_offset; #if defined(__LITTLE_ENDIAN_BITFIELD) -- cgit v1.2.3 From 86ca860e12ec0feab7d721d3b05e60fb86613540 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Mar 2021 16:51:51 -0500 Subject: net: qualcomm: rmnet: don't use C bit-fields in rmnet checksum header Replace the use of C bit-fields in the rmnet_map_ul_csum_header structure with a single two-byte (big endian) structure member, and use masks to encode or get 
values within it. The content of these fields can be accessed using simple bitwise AND and OR operations on the (host byte order) value of the new structure member. Previously rmnet_map_ipv4_ul_csum_header() would update C bit-field values in host byte order, then forcibly fix their byte order using a combination of byte swap operations and types. Instead, just compute the value that needs to go into the new structure member and save it with a simple byte-order conversion. Make similar simplifications in rmnet_map_ipv6_ul_csum_header(). Finally, in rmnet_map_checksum_uplink_packet() a set of assignments zeroes every field in the upload checksum header. Replace that with a single memset() operation. Signed-off-by: Alex Elder Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 38 ++++++++-------------- include/linux/if_rmnet.h | 21 ++++++------ 2 files changed, 23 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c index c336c17e01fe..0ac2ff828320 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -197,20 +197,16 @@ rmnet_map_ipv4_ul_csum_header(void *iphdr, struct rmnet_map_ul_csum_header *ul_header, struct sk_buff *skb) { - __be16 *hdr = (__be16 *)ul_header; struct iphdr *ip4h = iphdr; + u16 val; - ul_header->csum_start_offset = htons(skb_network_header_len(skb)); - ul_header->csum_insert_offset = skb->csum_offset; - ul_header->csum_enabled = 1; + val = MAP_CSUM_UL_ENABLED_FLAG; if (ip4h->protocol == IPPROTO_UDP) - ul_header->udp_ind = 1; - else - ul_header->udp_ind = 0; + val |= MAP_CSUM_UL_UDP_FLAG; + val |= skb->csum_offset & MAP_CSUM_UL_OFFSET_MASK; - /* Changing remaining fields to network order */ - hdr++; - *hdr = htons((__force u16)*hdr); + ul_header->csum_start_offset = 
htons(skb_network_header_len(skb)); + ul_header->csum_info = htons(val); skb->ip_summed = CHECKSUM_NONE; @@ -237,21 +233,16 @@ rmnet_map_ipv6_ul_csum_header(void *ip6hdr, struct rmnet_map_ul_csum_header *ul_header, struct sk_buff *skb) { - __be16 *hdr = (__be16 *)ul_header; struct ipv6hdr *ip6h = ip6hdr; + u16 val; - ul_header->csum_start_offset = htons(skb_network_header_len(skb)); - ul_header->csum_insert_offset = skb->csum_offset; - ul_header->csum_enabled = 1; - + val = MAP_CSUM_UL_ENABLED_FLAG; if (ip6h->nexthdr == IPPROTO_UDP) - ul_header->udp_ind = 1; - else - ul_header->udp_ind = 0; + val |= MAP_CSUM_UL_UDP_FLAG; + val |= skb->csum_offset & MAP_CSUM_UL_OFFSET_MASK; - /* Changing remaining fields to network order */ - hdr++; - *hdr = htons((__force u16)*hdr); + ul_header->csum_start_offset = htons(skb_network_header_len(skb)); + ul_header->csum_info = htons(val); skb->ip_summed = CHECKSUM_NONE; @@ -419,10 +410,7 @@ void rmnet_map_checksum_uplink_packet(struct sk_buff *skb, } sw_csum: - ul_header->csum_start_offset = 0; - ul_header->csum_insert_offset = 0; - ul_header->csum_enabled = 0; - ul_header->udp_ind = 0; + memset(ul_header, 0, sizeof(*ul_header)); priv->stats.csum_sw++; } diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h index 941997df9e08..4efb537f57f3 100644 --- a/include/linux/if_rmnet.h +++ b/include/linux/if_rmnet.h @@ -33,17 +33,16 @@ struct rmnet_map_dl_csum_trailer { struct rmnet_map_ul_csum_header { __be16 csum_start_offset; -#if defined(__LITTLE_ENDIAN_BITFIELD) - u16 csum_insert_offset:14; - u16 udp_ind:1; - u16 csum_enabled:1; -#elif defined (__BIG_ENDIAN_BITFIELD) - u16 csum_enabled:1; - u16 udp_ind:1; - u16 csum_insert_offset:14; -#else -#error "Please fix <asm/byteorder.h>" -#endif + __be16 csum_info; /* MAP_CSUM_UL_* */ } __aligned(1); +/* csum_info field: + * OFFSET: where (offset in bytes) to insert computed checksum + * UDP: 1 = UDP checksum (zero checkum means no checksum) + * ENABLED: 1 = checksum computation requested + */ +#define
MAP_CSUM_UL_OFFSET_MASK GENMASK(13, 0) +#define MAP_CSUM_UL_UDP_FLAG BIT(14) +#define MAP_CSUM_UL_ENABLED_FLAG BIT(15) + #endif /* !(_LINUX_IF_RMNET_H_) */ -- cgit v1.2.3 From ebb1bb401303ffac0ee994ba8ed9dfd24bb2ac5f Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 16 Mar 2021 21:10:17 +0100 Subject: net: ocelot: Add PGID_BLACKHOLE Add a new PGID that is used not to forward frames anywhere. It is used by MRP to make sure that MRP Test frames will not reach CPU port. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 3 +++ include/soc/mscc/ocelot.h | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 46e5c9136bac..f74d7cf002a5 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -2051,6 +2051,9 @@ int ocelot_init(struct ocelot *ocelot) ocelot_write_rix(ocelot, val, ANA_PGID_PGID, i); } + + ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_BLACKHOLE); + /* Allow broadcast and unknown L2 multicast to the CPU. */ ocelot_rmw_rix(ocelot, ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)), ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)), diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 425ff29d9389..4d10ccc8e7b5 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -51,6 +51,7 @@ */ /* Reserve some destination PGIDs at the end of the range: + * PGID_BLACKHOLE: used for not forwarding the frames * PGID_CPU: used for whitelisting certain MAC addresses, such as the addresses * of the switch port net devices, towards the CPU port module. * PGID_UC: the flooding destinations for unknown unicast traffic. @@ -59,6 +60,7 @@ * PGID_MCIPV6: the flooding destinations for IPv6 multicast traffic. * PGID_BC: the flooding destinations for broadcast traffic. 
*/ +#define PGID_BLACKHOLE 57 #define PGID_CPU 58 #define PGID_UC 59 #define PGID_MC 60 @@ -73,7 +75,7 @@ #define for_each_nonreserved_multicast_dest_pgid(ocelot, pgid) \ for ((pgid) = (ocelot)->num_phys_ports + 1; \ - (pgid) < PGID_CPU; \ + (pgid) < PGID_BLACKHOLE; \ (pgid)++) #define for_each_aggr_pgid(ocelot, pgid) \ -- cgit v1.2.3 From 7c588c3e96e9733a2a8a40caefd26c9189416821 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 16 Mar 2021 21:10:18 +0100 Subject: net: ocelot: Extend MRP This patch extends MRP support for Ocelot. It allows to have multiple rings and when the node has the MRC role it forwards MRP Test frames in HW. For MRM there is no change. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 6 - drivers/net/ethernet/mscc/ocelot_mrp.c | 233 ++++++++++++++++++++++----------- include/soc/mscc/ocelot.h | 8 +- net/dsa/tag_ocelot.c | 6 - 4 files changed, 160 insertions(+), 93 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index f74d7cf002a5..9cc9378157e4 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -772,12 +772,6 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) skb->protocol = eth_type_trans(skb, dev); -#if IS_ENABLED(CONFIG_BRIDGE_MRP) - if (skb->protocol == cpu_to_be16(ETH_P_MRP) && - cpuq & BIT(OCELOT_MRP_CPUQ)) - skb->offload_fwd_mark = 0; -#endif - *nskb = skb; return 0; diff --git a/drivers/net/ethernet/mscc/ocelot_mrp.c b/drivers/net/ethernet/mscc/ocelot_mrp.c index 683da320bfd8..439129a65b71 100644 --- a/drivers/net/ethernet/mscc/ocelot_mrp.c +++ b/drivers/net/ethernet/mscc/ocelot_mrp.c @@ -1,8 +1,5 @@ // SPDX-License-Identifier: (GPL-2.0 OR MIT) /* Microsemi Ocelot Switch driver - * - * This contains glue logic between the switchdev driver operations and the - * mscc_ocelot_switch_lib. 
* * Copyright (c) 2017, 2019 Microsemi Corporation * Copyright 2020-2021 NXP Semiconductors @@ -15,13 +12,34 @@ #include "ocelot.h" #include "ocelot_vcap.h" -static int ocelot_mrp_del_vcap(struct ocelot *ocelot, int port) +static const u8 mrp_test_dmac[] = { 0x01, 0x15, 0x4e, 0x00, 0x00, 0x01 }; +static const u8 mrp_control_dmac[] = { 0x01, 0x15, 0x4e, 0x00, 0x00, 0x02 }; + +static int ocelot_mrp_find_partner_port(struct ocelot *ocelot, + struct ocelot_port *p) +{ + int i; + + for (i = 0; i < ocelot->num_phys_ports; ++i) { + struct ocelot_port *ocelot_port = ocelot->ports[i]; + + if (!ocelot_port || p == ocelot_port) + continue; + + if (ocelot_port->mrp_ring_id == p->mrp_ring_id) + return i; + } + + return -1; +} + +static int ocelot_mrp_del_vcap(struct ocelot *ocelot, int id) { struct ocelot_vcap_block *block_vcap_is2; struct ocelot_vcap_filter *filter; block_vcap_is2 = &ocelot->block[VCAP_IS2]; - filter = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, port, + filter = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, id, false); if (!filter) return 0; @@ -29,6 +47,87 @@ static int ocelot_mrp_del_vcap(struct ocelot *ocelot, int port) return ocelot_vcap_filter_del(ocelot, filter); } +static int ocelot_mrp_redirect_add_vcap(struct ocelot *ocelot, int src_port, + int dst_port) +{ + const u8 mrp_test_mask[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + struct ocelot_vcap_filter *filter; + int err; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return -ENOMEM; + + filter->key_type = OCELOT_VCAP_KEY_ETYPE; + filter->prio = 1; + filter->id.cookie = src_port; + filter->id.tc_offload = false; + filter->block_id = VCAP_IS2; + filter->type = OCELOT_VCAP_FILTER_OFFLOAD; + filter->ingress_port_mask = BIT(src_port); + ether_addr_copy(filter->key.etype.dmac.value, mrp_test_dmac); + ether_addr_copy(filter->key.etype.dmac.mask, mrp_test_mask); + filter->action.mask_mode = OCELOT_MASK_MODE_REDIRECT; + filter->action.port_mask = BIT(dst_port); + + err = 
ocelot_vcap_filter_add(ocelot, filter, NULL); + if (err) + kfree(filter); + + return err; +} + +static int ocelot_mrp_copy_add_vcap(struct ocelot *ocelot, int port, + int prio, unsigned long cookie) +{ + const u8 mrp_mask[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }; + struct ocelot_vcap_filter *filter; + int err; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!filter) + return -ENOMEM; + + filter->key_type = OCELOT_VCAP_KEY_ETYPE; + filter->prio = prio; + filter->id.cookie = cookie; + filter->id.tc_offload = false; + filter->block_id = VCAP_IS2; + filter->type = OCELOT_VCAP_FILTER_OFFLOAD; + filter->ingress_port_mask = BIT(port); + /* Here is possible to use control or test dmac because the mask + * doesn't cover the LSB + */ + ether_addr_copy(filter->key.etype.dmac.value, mrp_test_dmac); + ether_addr_copy(filter->key.etype.dmac.mask, mrp_mask); + filter->action.mask_mode = OCELOT_MASK_MODE_PERMIT_DENY; + filter->action.port_mask = 0x0; + filter->action.cpu_copy_ena = true; + filter->action.cpu_qu_num = OCELOT_MRP_CPUQ; + + err = ocelot_vcap_filter_add(ocelot, filter, NULL); + if (err) + kfree(filter); + + return err; +} + +static void ocelot_mrp_save_mac(struct ocelot *ocelot, + struct ocelot_port *port) +{ + ocelot_mact_learn(ocelot, PGID_BLACKHOLE, mrp_test_dmac, + port->pvid_vlan.vid, ENTRYTYPE_LOCKED); + ocelot_mact_learn(ocelot, PGID_BLACKHOLE, mrp_control_dmac, + port->pvid_vlan.vid, ENTRYTYPE_LOCKED); +} + +static void ocelot_mrp_del_mac(struct ocelot *ocelot, + struct ocelot_port *port) +{ + ocelot_mact_forget(ocelot, mrp_test_dmac, port->pvid_vlan.vid); + ocelot_mact_forget(ocelot, mrp_control_dmac, port->pvid_vlan.vid); +} + int ocelot_mrp_add(struct ocelot *ocelot, int port, const struct switchdev_obj_mrp *mrp) { @@ -45,18 +144,7 @@ int ocelot_mrp_add(struct ocelot *ocelot, int port, if (mrp->p_port != dev && mrp->s_port != dev) return 0; - if (ocelot->mrp_ring_id != 0 && - ocelot->mrp_s_port && - ocelot->mrp_p_port) - return -EINVAL; - - if 
(mrp->p_port == dev) - ocelot->mrp_p_port = dev; - - if (mrp->s_port == dev) - ocelot->mrp_s_port = dev; - - ocelot->mrp_ring_id = mrp->ring_id; + ocelot_port->mrp_ring_id = mrp->ring_id; return 0; } @@ -66,34 +154,31 @@ int ocelot_mrp_del(struct ocelot *ocelot, int port, const struct switchdev_obj_mrp *mrp) { struct ocelot_port *ocelot_port = ocelot->ports[port]; - struct ocelot_port_private *priv; - struct net_device *dev; + int i; if (!ocelot_port) return -EOPNOTSUPP; - priv = container_of(ocelot_port, struct ocelot_port_private, port); - dev = priv->dev; - - if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev) + if (ocelot_port->mrp_ring_id != mrp->ring_id) return 0; - if (ocelot->mrp_ring_id == 0 && - !ocelot->mrp_s_port && - !ocelot->mrp_p_port) - return -EINVAL; + ocelot_mrp_del_vcap(ocelot, port); + ocelot_mrp_del_vcap(ocelot, port + ocelot->num_phys_ports); - if (ocelot_mrp_del_vcap(ocelot, priv->chip_port)) - return -EINVAL; + ocelot_port->mrp_ring_id = 0; - if (ocelot->mrp_p_port == dev) - ocelot->mrp_p_port = NULL; + for (i = 0; i < ocelot->num_phys_ports; ++i) { + ocelot_port = ocelot->ports[i]; - if (ocelot->mrp_s_port == dev) - ocelot->mrp_s_port = NULL; + if (!ocelot_port) + continue; - ocelot->mrp_ring_id = 0; + if (ocelot_port->mrp_ring_id != 0) + goto out; + } + ocelot_mrp_del_mac(ocelot, ocelot_port); +out: return 0; } EXPORT_SYMBOL(ocelot_mrp_del); @@ -102,49 +187,39 @@ int ocelot_mrp_add_ring_role(struct ocelot *ocelot, int port, const struct switchdev_obj_ring_role_mrp *mrp) { struct ocelot_port *ocelot_port = ocelot->ports[port]; - struct ocelot_vcap_filter *filter; - struct ocelot_port_private *priv; - struct net_device *dev; + int dst_port; int err; if (!ocelot_port) return -EOPNOTSUPP; - priv = container_of(ocelot_port, struct ocelot_port_private, port); - dev = priv->dev; - - if (ocelot->mrp_ring_id != mrp->ring_id) - return -EINVAL; - - if (!mrp->sw_backup) + if (mrp->ring_role != BR_MRP_RING_ROLE_MRC && !mrp->sw_backup) return 
-EOPNOTSUPP; - if (ocelot->mrp_p_port != dev && ocelot->mrp_s_port != dev) + if (ocelot_port->mrp_ring_id != mrp->ring_id) return 0; - filter = kzalloc(sizeof(*filter), GFP_ATOMIC); - if (!filter) - return -ENOMEM; + ocelot_mrp_save_mac(ocelot, ocelot_port); - filter->key_type = OCELOT_VCAP_KEY_ETYPE; - filter->prio = 1; - filter->id.cookie = priv->chip_port; - filter->id.tc_offload = false; - filter->block_id = VCAP_IS2; - filter->type = OCELOT_VCAP_FILTER_OFFLOAD; - filter->ingress_port_mask = BIT(priv->chip_port); - *(__be16 *)filter->key.etype.etype.value = htons(ETH_P_MRP); - *(__be16 *)filter->key.etype.etype.mask = htons(0xffff); - filter->action.mask_mode = OCELOT_MASK_MODE_PERMIT_DENY; - filter->action.port_mask = 0x0; - filter->action.cpu_copy_ena = true; - filter->action.cpu_qu_num = OCELOT_MRP_CPUQ; + if (mrp->ring_role != BR_MRP_RING_ROLE_MRC) + return ocelot_mrp_copy_add_vcap(ocelot, port, 1, port); - err = ocelot_vcap_filter_add(ocelot, filter, NULL); + dst_port = ocelot_mrp_find_partner_port(ocelot, ocelot_port); + if (dst_port == -1) + return -EINVAL; + + err = ocelot_mrp_redirect_add_vcap(ocelot, port, dst_port); if (err) - kfree(filter); + return err; - return err; + err = ocelot_mrp_copy_add_vcap(ocelot, port, 2, + port + ocelot->num_phys_ports); + if (err) { + ocelot_mrp_del_vcap(ocelot, port); + return err; + } + + return 0; } EXPORT_SYMBOL(ocelot_mrp_add_ring_role); @@ -152,24 +227,32 @@ int ocelot_mrp_del_ring_role(struct ocelot *ocelot, int port, const struct switchdev_obj_ring_role_mrp *mrp) { struct ocelot_port *ocelot_port = ocelot->ports[port]; - struct ocelot_port_private *priv; - struct net_device *dev; + int i; if (!ocelot_port) return -EOPNOTSUPP; - priv = container_of(ocelot_port, struct ocelot_port_private, port); - dev = priv->dev; - - if (ocelot->mrp_ring_id != mrp->ring_id) - return -EINVAL; - - if (!mrp->sw_backup) + if (mrp->ring_role != BR_MRP_RING_ROLE_MRC && !mrp->sw_backup) return -EOPNOTSUPP; - if (ocelot->mrp_p_port != 
dev && ocelot->mrp_s_port != dev) + if (ocelot_port->mrp_ring_id != mrp->ring_id) return 0; - return ocelot_mrp_del_vcap(ocelot, priv->chip_port); + ocelot_mrp_del_vcap(ocelot, port); + ocelot_mrp_del_vcap(ocelot, port + ocelot->num_phys_ports); + + for (i = 0; i < ocelot->num_phys_ports; ++i) { + ocelot_port = ocelot->ports[i]; + + if (!ocelot_port) + continue; + + if (ocelot_port->mrp_ring_id != 0) + goto out; + } + + ocelot_mrp_del_mac(ocelot, ocelot_port); +out: + return 0; } EXPORT_SYMBOL(ocelot_mrp_del_ring_role); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 4d10ccc8e7b5..0a0751bf97dd 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -613,6 +613,8 @@ struct ocelot_port { struct net_device *bond; bool lag_tx_active; + + u16 mrp_ring_id; }; struct ocelot { @@ -681,12 +683,6 @@ struct ocelot { /* Protects the PTP clock */ spinlock_t ptp_clock_lock; struct ptp_pin_desc ptp_pins[OCELOT_PTP_PINS_NUM]; - -#if IS_ENABLED(CONFIG_BRIDGE_MRP) - u16 mrp_ring_id; - struct net_device *mrp_p_port; - struct net_device *mrp_s_port; -#endif }; struct ocelot_policer { diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 743809b5806b..157f95689d8d 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -128,12 +128,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, skb->offload_fwd_mark = 1; skb->priority = qos_class; -#if IS_ENABLED(CONFIG_BRIDGE_MRP) - if (eth_hdr(skb)->h_proto == cpu_to_be16(ETH_P_MRP) && - cpuq & BIT(OCELOT_MRP_CPUQ)) - skb->offload_fwd_mark = 0; -#endif - /* Ocelot switches copy frames unmodified to the CPU. However, it is * possible for the user to request a VLAN modification through * VCAP_IS1_ACT_VID_REPLACE_ENA. 
In this case, what will happen is that -- cgit v1.2.3 From 2ed2c5f0391106406ead3a74bfa571575eafe8b6 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 16 Mar 2021 21:10:19 +0100 Subject: net: ocelot: Remove ocelot_xfh_get_cpuq Now when extracting frames from CPU the cpuq is not used anymore so remove it. Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 3 +-- include/linux/dsa/ocelot.h | 5 ----- net/dsa/tag_ocelot.c | 2 -- 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 9cc9378157e4..9f0c9bdd9f5d 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -687,7 +687,7 @@ static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh) int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) { struct skb_shared_hwtstamps *shhwtstamps; - u64 tod_in_ns, full_ts_in_ns, cpuq; + u64 tod_in_ns, full_ts_in_ns; u64 timestamp, src_port, len; u32 xfh[OCELOT_TAG_LEN / 4]; struct net_device *dev; @@ -704,7 +704,6 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) ocelot_xfh_get_src_port(xfh, &src_port); ocelot_xfh_get_len(xfh, &len); ocelot_xfh_get_rew_val(xfh, &timestamp); - ocelot_xfh_get_cpuq(xfh, &cpuq); if (WARN_ON(src_port >= ocelot->num_phys_ports)) return -EINVAL; diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h index 4265f328681a..c6bc45ae5e03 100644 --- a/include/linux/dsa/ocelot.h +++ b/include/linux/dsa/ocelot.h @@ -160,11 +160,6 @@ static inline void ocelot_xfh_get_src_port(void *extraction, u64 *src_port) packing(extraction, src_port, 46, 43, OCELOT_TAG_LEN, UNPACK, 0); } -static inline void ocelot_xfh_get_cpuq(void *extraction, u64 *cpuq) -{ - packing(extraction, cpuq, 28, 20, OCELOT_TAG_LEN, UNPACK, 0); -} - static inline void ocelot_xfh_get_qos_class(void *extraction, u64
*qos_class) { packing(extraction, qos_class, 19, 17, OCELOT_TAG_LEN, UNPACK, 0); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 157f95689d8d..f9df9cac81c5 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -83,7 +83,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, struct dsa_port *dp; u8 *extraction; u16 vlan_tpid; - u64 cpuq; /* Revert skb->data by the amount consumed by the DSA master, * so it points to the beginning of the frame. @@ -113,7 +112,6 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, ocelot_xfh_get_qos_class(extraction, &qos_class); ocelot_xfh_get_tag_type(extraction, &tag_type); ocelot_xfh_get_vlan_tci(extraction, &vlan_tci); - ocelot_xfh_get_cpuq(extraction, &cpuq); skb->dev = dsa_master_find_slave(netdev, 0, src_port); if (!skb->dev) -- cgit v1.2.3 From 01035bcc0f9195a19a76c8a006b3c520428acb61 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 16 Mar 2021 15:52:15 -0700 Subject: Revert "net: socket: use BIT() for MSG_*" This reverts commit 0bb3262c0248d44aea3be31076f44beb82a7b120. Breaks things on mips64/qemu Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/socket.h | 71 ++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index e88859f38cd0..385894b4a8bb 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -283,45 +283,42 @@ struct ucred { Added those for 1003.1g not all are supported yet */ -#define MSG_OOB BIT(0) -#define MSG_PEEK BIT(1) -#define MSG_DONTROUTE BIT(2) -#define MSG_TRYHARD BIT(2) /* Synonym for MSG_DONTROUTE for DECnet */ -#define MSG_CTRUNC BIT(3) -#define MSG_PROBE BIT(4) /* Do not send. Only probe path f.e. 
for MTU */ -#define MSG_TRUNC BIT(5) -#define MSG_DONTWAIT BIT(6) /* Nonblocking io */ -#define MSG_EOR BIT(7) /* End of record */ -#define MSG_WAITALL BIT(8) /* Wait for a full request */ -#define MSG_FIN BIT(9) -#define MSG_SYN BIT(10) -#define MSG_CONFIRM BIT(11) /* Confirm path validity */ -#define MSG_RST BIT(12) -#define MSG_ERRQUEUE BIT(13) /* Fetch message from error queue */ -#define MSG_NOSIGNAL BIT(14) /* Do not generate SIGPIPE */ -#define MSG_MORE BIT(15) /* Sender will send more */ -#define MSG_WAITFORONE BIT(16) /* recvmmsg(): block until 1+ packets avail */ -#define MSG_SENDPAGE_NOPOLICY BIT(16) /* sendpage() internal : do no apply policy */ -#define MSG_SENDPAGE_NOTLAST BIT(17) /* sendpage() internal : not the last page */ -#define MSG_BATCH BIT(18) /* sendmmsg(): more messages coming */ -#define MSG_EOF MSG_FIN -#define MSG_NO_SHARED_FRAGS BIT(19) /* sendpage() internal : page frags - * are not shared - */ -#define MSG_SENDPAGE_DECRYPTED BIT(20) /* sendpage() internal : page may carry - * plain text and require encryption - */ - -#define MSG_ZEROCOPY BIT(26) /* Use user data in kernel path */ -#define MSG_FASTOPEN BIT(29) /* Send data in TCP SYN */ -#define MSG_CMSG_CLOEXEC BIT(30) /* Set close_on_exec for file - * descriptor received through - * SCM_RIGHTS - */ +#define MSG_OOB 1 +#define MSG_PEEK 2 +#define MSG_DONTROUTE 4 +#define MSG_TRYHARD 4 /* Synonym for MSG_DONTROUTE for DECnet */ +#define MSG_CTRUNC 8 +#define MSG_PROBE 0x10 /* Do not send. Only probe path f.e. 
for MTU */ +#define MSG_TRUNC 0x20 +#define MSG_DONTWAIT 0x40 /* Nonblocking io */ +#define MSG_EOR 0x80 /* End of record */ +#define MSG_WAITALL 0x100 /* Wait for a full request */ +#define MSG_FIN 0x200 +#define MSG_SYN 0x400 +#define MSG_CONFIRM 0x800 /* Confirm path validity */ +#define MSG_RST 0x1000 +#define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ +#define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ +#define MSG_MORE 0x8000 /* Sender will send more */ +#define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ +#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ +#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ +#define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ +#define MSG_EOF MSG_FIN +#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ +#define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry + * plain text and require encryption + */ + +#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ +#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ +#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file + descriptor received through + SCM_RIGHTS */ #if defined(CONFIG_COMPAT) -#define MSG_CMSG_COMPAT BIT(31) /* This message needs 32 bit fixups */ +#define MSG_CMSG_COMPAT 0x80000000 /* This message needs 32 bit fixups */ #else -#define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ +#define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif -- cgit v1.2.3 From 7a126a43a3dcf0fa6b9f7f2fe3ce82102517afe3 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 16 Sep 2020 10:11:20 +0300 Subject: net: Change dev parameter to const in netif_device_present() Not all ndos check the present bit before calling the ndo and the driver may want to check it. Sometimes the dev parameter passed as const so we pass it to netif_device_present() as const. 
Since netif_device_present() doesn't modify dev parameter anyway, declare it as const. Signed-off-by: Roi Dayan Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b379d08a12ed..97254c089eb2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4175,7 +4175,7 @@ static inline bool netif_oper_up(const struct net_device *dev) * * Check if device has not been removed from system. */ -static inline bool netif_device_present(struct net_device *dev) +static inline bool netif_device_present(const struct net_device *dev) { return test_bit(__LINK_STATE_PRESENT, &dev->state); } -- cgit v1.2.3 From c276aae8c19d65e21a43c2690c7c7dafea0e97fa Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 26 Jan 2021 11:51:04 +0200 Subject: net/mlx5: Move mlx5e hw resources into a sub object This is to separate between resources attributes and other attributes we will want to use. 
Signed-off-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/trap.c | 6 ++--- .../ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c | 2 +- .../net/ethernet/mellanox/mlx5/core/en_common.c | 27 ++++++++++++---------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 22 +++++++++--------- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 2 +- .../net/ethernet/mellanox/mlx5/core/lib/crypto.c | 2 +- include/linux/mlx5/driver.h | 10 ++++---- 8 files changed, 40 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index d57b6f06382f..bb5d108f75d0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -174,7 +174,7 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_port_ptp *c, int txq_ix, sq->mdev = mdev; sq->ch_ix = c->ix; sq->txq_ix = txq_ix; - sq->uar_map = mdev->mlx5e_res.bfreg.map; + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); sq->stats = &c->priv->port_ptp_stats.sq[tc]; @@ -475,7 +475,7 @@ int mlx5e_port_ptp_open(struct mlx5e_priv *priv, struct mlx5e_params *params, c->ix = 0; c->pdev = mlx5_core_dma_dev(priv->mdev); c->netdev = priv->netdev; - c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); + c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key); c->num_tc = params->num_tc; c->stats = &priv->port_ptp_stats.ch; c->lag_port = lag_port; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c index 37fc1d77ded7..41db93883fea 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/trap.c @@ -84,7 +84,7 @@ static int mlx5e_alloc_trap_rq(struct mlx5e_priv 
*priv, struct mlx5e_rq_param *r if (err) goto err_free_frags; - rq->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); + rq->mkey_be = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey.key); mlx5e_rq_set_trap_handlers(rq, params); @@ -213,7 +213,7 @@ static int mlx5e_create_trap_direct_rq_tir(struct mlx5_core_dev *mdev, struct ml return -ENOMEM; tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); - MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.td.tdn); + MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn); MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_NONE); MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); MLX5_SET(tirc, tirc, inline_rqn, rqn); @@ -266,7 +266,7 @@ static struct mlx5e_trap *mlx5e_open_trap(struct mlx5e_priv *priv) t->tstamp = &priv->tstamp; t->pdev = mlx5_core_dma_dev(priv->mdev); t->netdev = priv->netdev; - t->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); + t->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key); t->stats = &priv->trap_stats.ch; netif_napi_add(netdev, &t->napi, mlx5e_trap_napi_poll, 64); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index d06532d0baa4..f7c880edae37 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -84,7 +84,7 @@ static int mlx5e_ktls_create_tir(struct mlx5_core_dev *mdev, u32 *tirn, u32 rqtn tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); - MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.td.tdn); + MLX5_SET(tirc, tirc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn); MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_INVERTED_XOR8); MLX5_SET(tirc, tirc, indirect_table, rqtn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index a6cf008057b5..8c166ee56d8b 
100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -38,15 +38,16 @@ int mlx5e_create_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, u32 *in) { + struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; int err; err = mlx5_core_create_tir(mdev, in, &tir->tirn); if (err) return err; - mutex_lock(&mdev->mlx5e_res.td.list_lock); - list_add(&tir->list, &mdev->mlx5e_res.td.tirs_list); - mutex_unlock(&mdev->mlx5e_res.td.list_lock); + mutex_lock(&res->td.list_lock); + list_add(&tir->list, &res->td.tirs_list); + mutex_unlock(&res->td.list_lock); return 0; } @@ -54,10 +55,12 @@ int mlx5e_create_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, u32 *in) void mlx5e_destroy_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir) { - mutex_lock(&mdev->mlx5e_res.td.list_lock); + struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; + + mutex_lock(&res->td.list_lock); mlx5_core_destroy_tir(mdev, tir->tirn); list_del(&tir->list); - mutex_unlock(&mdev->mlx5e_res.td.list_lock); + mutex_unlock(&res->td.list_lock); } void mlx5e_mkey_set_relaxed_ordering(struct mlx5_core_dev *mdev, void *mkc) @@ -99,7 +102,7 @@ static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn, int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) { - struct mlx5e_resources *res = &mdev->mlx5e_res; + struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; int err; err = mlx5_core_alloc_pd(mdev, &res->pdn); @@ -126,8 +129,8 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev) goto err_destroy_mkey; } - INIT_LIST_HEAD(&mdev->mlx5e_res.td.tirs_list); - mutex_init(&mdev->mlx5e_res.td.list_lock); + INIT_LIST_HEAD(&res->td.tirs_list); + mutex_init(&res->td.list_lock); return 0; @@ -142,7 +145,7 @@ err_dealloc_pd: void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) { - struct mlx5e_resources *res = &mdev->mlx5e_res; + struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; mlx5_free_bfreg(mdev, &res->bfreg); 
mlx5_core_destroy_mkey(mdev, &res->mkey); @@ -180,8 +183,8 @@ int mlx5e_refresh_tirs(struct mlx5e_priv *priv, bool enable_uc_lb, MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1); - mutex_lock(&mdev->mlx5e_res.td.list_lock); - list_for_each_entry(tir, &mdev->mlx5e_res.td.tirs_list, list) { + mutex_lock(&mdev->mlx5e_res.hw_objs.td.list_lock); + list_for_each_entry(tir, &mdev->mlx5e_res.hw_objs.td.tirs_list, list) { tirn = tir->tirn; err = mlx5_core_modify_tir(mdev, tirn, in); if (err) @@ -192,7 +195,7 @@ out: kvfree(in); if (err) netdev_err(priv->netdev, "refresh tir(0x%x) failed, %d\n", tirn, err); - mutex_unlock(&mdev->mlx5e_res.td.list_lock); + mutex_unlock(&mdev->mlx5e_res.hw_objs.td.list_lock); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3e8434dcc1df..2f961bd9e528 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -302,7 +302,7 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev, MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); mlx5e_mkey_set_relaxed_ordering(mdev, mkc); MLX5_SET(mkc, mkc, qpn, 0xffffff); - MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn); + MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn); MLX5_SET64(mkc, mkc, len, npages << page_shift); MLX5_SET(mkc, mkc, translations_octword_size, MLX5_MTT_OCTW(npages)); @@ -1019,7 +1019,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->pdev = c->pdev; sq->mkey_be = c->mkey_be; sq->channel = c; - sq->uar_map = mdev->mlx5e_res.bfreg.map; + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); sq->xsk_pool = xsk_pool; @@ -1090,7 +1090,7 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, int err; sq->channel = c; - sq->uar_map = mdev->mlx5e_res.bfreg.map; + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; 
param->wq.db_numa_node = cpu_to_node(c->cpu); err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl); @@ -1174,7 +1174,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->priv = c->priv; sq->ch_ix = c->ix; sq->txq_ix = txq_ix; - sq->uar_map = mdev->mlx5e_res.bfreg.map; + sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); @@ -1257,7 +1257,7 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, MLX5_SET(sqc, sqc, flush_in_error_en, 1); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, csp->wq_ctrl->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, csp->wq_ctrl->db.dma); @@ -2032,7 +2032,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->cpu = cpu; c->pdev = mlx5_core_dma_dev(priv->mdev); c->netdev = priv->netdev; - c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); + c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key); c->num_tc = params->num_tc; c->xdp = !!params->xdp_prog; c->stats = &priv->channel_stats[ix].ch; @@ -2217,7 +2217,7 @@ void mlx5e_build_rq_param(struct mlx5e_priv *priv, MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); MLX5_SET(wq, wq, log_wq_stride, mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs)); - MLX5_SET(wq, wq, pd, mdev->mlx5e_res.pdn); + MLX5_SET(wq, wq, pd, mdev->mlx5e_res.hw_objs.pdn); MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter); MLX5_SET(rqc, rqc, vsd, params->vlan_strip_disable); MLX5_SET(rqc, rqc, scatter_fcs, params->scatter_fcs_en); @@ -2248,7 +2248,7 @@ void mlx5e_build_sq_param_common(struct mlx5e_priv *priv, void *wq = MLX5_ADDR_OF(sqc, sqc, wq); MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); -
MLX5_SET(wq, wq, pd, priv->mdev->mlx5e_res.pdn); + MLX5_SET(wq, wq, pd, priv->mdev->mlx5e_res.hw_objs.pdn); param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(priv->mdev)); } @@ -3421,10 +3421,10 @@ int mlx5e_create_tis(struct mlx5_core_dev *mdev, void *in, u32 *tisn) { void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); - MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn); + MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn); if (MLX5_GET(tisc, tisc, tls_en)) - MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.pdn); + MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.hw_objs.pdn); if (mlx5_lag_is_lacp_owner(mdev)) MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1); @@ -3494,7 +3494,7 @@ static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv) static void mlx5e_build_indir_tir_ctx_common(struct mlx5e_priv *priv, u32 rqtn, u32 *tirc) { - MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn); + MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.hw_objs.td.tdn); MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); MLX5_SET(tirc, tirc, indirect_table, rqtn); MLX5_SET(tirc, tirc, tunneled_offload_en, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 1eeca45cfcdf..0fc055cdf221 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -708,7 +708,7 @@ static void mlx5_rdma_netdev_free(struct net_device *netdev) static bool mlx5_is_sub_interface(struct mlx5_core_dev *mdev) { - return mdev->mlx5e_res.pdn != 0; + return mdev->mlx5e_res.hw_objs.pdn != 0; } static const struct mlx5e_profile *mlx5_get_profile(struct mlx5_core_dev *mdev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c index 57eb91bcbca7..e995f8378df7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/crypto.c @@ -46,7 +46,7 @@ int mlx5_create_encryption_key(struct mlx5_core_dev *mdev, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY); - MLX5_SET(encryption_key_obj, obj, pd, mdev->mlx5e_res.pdn); + MLX5_SET(encryption_key_obj, obj, pd, mdev->mlx5e_res.hw_objs.pdn); err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); if (!err) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 53b89631a1d9..9887181dea5f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -644,10 +644,12 @@ struct mlx5_td { }; struct mlx5e_resources { - u32 pdn; - struct mlx5_td td; - struct mlx5_core_mkey mkey; - struct mlx5_sq_bfreg bfreg; + struct mlx5e_hw_objs { + u32 pdn; + struct mlx5_td td; + struct mlx5_core_mkey mkey; + struct mlx5_sq_bfreg bfreg; + } hw_objs; }; enum mlx5_sw_icm_type { -- cgit v1.2.3 From c27971d08abecc91f06214dacc66ce3ce2662a44 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 28 Oct 2020 11:21:26 +0200 Subject: net/mlx5: Move devlink port from mlx5e priv to mlx5e resources We re-use the native NIC port net device instance for the Uplink representor, and the devlink port. When changing profiles we reset the mlx5e priv but we should still use the devlink port so move it to mlx5e resources. 
Signed-off-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c | 17 ++++++++++++----- drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h | 6 ++++++ .../net/ethernet/mellanox/mlx5/core/en/reporter_rx.c | 4 +++- .../net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 4 +++- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 5 ++++- include/linux/mlx5/driver.h | 1 + 7 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 7435fe6829b6..4d621d142f76 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -880,7 +880,6 @@ struct mlx5e_priv { #endif struct devlink_health_reporter *tx_reporter; struct devlink_health_reporter *rx_reporter; - struct devlink_port dl_port; struct mlx5e_xsk xsk; #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE) struct mlx5e_hv_vhca_stats_agent stats_agent; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c index 054bc2fc0520..765f3064689d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c @@ -19,6 +19,7 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv) struct devlink *devlink = priv_to_devlink(priv->mdev); struct devlink_port_attrs attrs = {}; struct netdev_phys_item_id ppid = {}; + struct devlink_port *dl_port; unsigned int dl_port_index; if (mlx5_core_is_pf(priv->mdev)) { @@ -36,24 +37,30 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv) dl_port_index = mlx5_esw_vport_to_devlink_port_index(priv->mdev, 0); } - devlink_port_attrs_set(&priv->dl_port, &attrs); + dl_port = mlx5e_devlink_get_dl_port(priv); + memset(dl_port, 0, sizeof(*dl_port)); + devlink_port_attrs_set(dl_port, &attrs); - return 
devlink_port_register(devlink, &priv->dl_port, dl_port_index); + return devlink_port_register(devlink, dl_port, dl_port_index); } void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv) { - devlink_port_type_eth_set(&priv->dl_port, priv->netdev); + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); + + devlink_port_type_eth_set(dl_port, priv->netdev); } void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv) { - devlink_port_unregister(&priv->dl_port); + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); + + devlink_port_unregister(dl_port); } struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev) { struct mlx5e_priv *priv = netdev_priv(dev); - return &priv->dl_port; + return mlx5e_devlink_get_dl_port(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h index 83123a801adc..10b50feb9883 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h @@ -12,4 +12,10 @@ void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv); void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv); struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev); +static inline struct devlink_port * +mlx5e_devlink_get_dl_port(struct mlx5e_priv *priv) +{ + return &priv->mdev->mlx5e_res.dl_port; +} + #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index d80bbd17e5f8..f0a419fc4adf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -4,6 +4,7 @@ #include "health.h" #include "params.h" #include "txrx.h" +#include "devlink.h" static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state) { @@ -615,9 +616,10 @@ static const struct devlink_health_reporter_ops mlx5_rx_reporter_ops = { void 
mlx5e_reporter_rx_create(struct mlx5e_priv *priv) { + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); struct devlink_health_reporter *reporter; - reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_rx_reporter_ops, + reporter = devlink_port_health_reporter_create(dl_port, &mlx5_rx_reporter_ops, MLX5E_REPORTER_RX_GRACEFUL_PERIOD, priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, "Failed to create rx reporter, err = %ld\n", diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index d7275c84313e..db64fa2620c4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -3,6 +3,7 @@ #include "health.h" #include "en/ptp.h" +#include "en/devlink.h" static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) { @@ -572,9 +573,10 @@ static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { void mlx5e_reporter_tx_create(struct mlx5e_priv *priv) { + struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); struct devlink_health_reporter *reporter; - reporter = devlink_port_health_reporter_create(&priv->dl_port, &mlx5_tx_reporter_ops, + reporter = devlink_port_health_reporter_create(dl_port, &mlx5_tx_reporter_ops, MLX5_REPORTER_TX_GRACEFUL_PERIOD, priv); if (IS_ERR(reporter)) { netdev_warn(priv->netdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 1f15c6183dc1..b0604b113530 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -52,6 +52,7 @@ #include "en/health.h" #include "en/params.h" #include "devlink.h" +#include "en/devlink.h" static struct sk_buff * mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, @@ -1823,6 +1824,7 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe 
struct mlx5e_priv *priv = netdev_priv(rq->netdev); struct mlx5_wq_cyc *wq = &rq->wqe.wq; struct mlx5e_wqe_frag_info *wi; + struct devlink_port *dl_port; struct sk_buff *skb; u32 cqe_bcnt; u16 trap_id; @@ -1845,7 +1847,8 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); skb_push(skb, ETH_HLEN); - mlx5_devlink_trap_report(rq->mdev, trap_id, skb, &priv->dl_port); + dl_port = mlx5e_devlink_get_dl_port(priv); + mlx5_devlink_trap_report(rq->mdev, trap_id, skb, dl_port); dev_kfree_skb_any(skb); free_wqe: diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9887181dea5f..f1d0340e46a7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -650,6 +650,7 @@ struct mlx5e_resources { struct mlx5_core_mkey mkey; struct mlx5_sq_bfreg bfreg; } hw_objs; + struct devlink_port dl_port; }; enum mlx5_sw_icm_type { -- cgit v1.2.3 From 7a9fb35e8c3a67145fca262c304de65cb2f83abf Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 16 Sep 2020 10:11:33 +0300 Subject: net/mlx5e: Do not reload ethernet ports when changing eswitch mode When switching modes between legacy and switchdev and back, do not reload ethernet interfaces. Just change the profile from nic profile to uplink rep profile in switchdev mode. 
Signed-off-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/dev.c | 3 - drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + .../ethernet/mellanox/mlx5/core/en/reporter_rx.c | 1 + .../ethernet/mellanox/mlx5/core/en/reporter_tx.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 6 + drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 148 ++++++++++++++------- drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h | 9 ++ include/linux/mlx5/driver.h | 1 + 8 files changed, 116 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/dev.c b/drivers/net/ethernet/mellanox/mlx5/core/dev.c index b051417ede67..4def64d0e669 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/dev.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/dev.c @@ -58,9 +58,6 @@ static bool is_eth_supported(struct mlx5_core_dev *dev) if (!IS_ENABLED(CONFIG_MLX5_CORE_EN)) return false; - if (is_eth_rep_supported(dev)) - return false; - if (MLX5_CAP_GEN(dev, port_type) != MLX5_CAP_PORT_TYPE_ETH) return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 4d621d142f76..1f5bc4d91060 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1173,6 +1173,7 @@ void mlx5e_detach_netdev(struct mlx5e_priv *priv); void mlx5e_destroy_netdev(struct mlx5e_priv *priv); int mlx5e_netdev_change_profile(struct mlx5e_priv *priv, const struct mlx5e_profile *new_profile, void *new_ppriv); +void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv); void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv); void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu); void mlx5e_build_rq_params(struct mlx5_core_dev *mdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index f0a419fc4adf..34b3b316b688 100644 
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -635,4 +635,5 @@ void mlx5e_reporter_rx_destroy(struct mlx5e_priv *priv) return; devlink_port_health_reporter_destroy(priv->rx_reporter); + priv->rx_reporter = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index db64fa2620c4..63ee3b9416de 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -593,4 +593,5 @@ void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv) return; devlink_port_health_reporter_destroy(priv->tx_reporter); + priv->tx_reporter = NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 685cf071a9de..9c08f0bd1fcc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5742,6 +5742,11 @@ rollback: return err; } +void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv) +{ + mlx5e_netdev_change_profile(priv, &mlx5e_nic_profile, NULL); +} + void mlx5e_destroy_netdev(struct mlx5e_priv *priv) { struct net_device *netdev = priv->netdev; @@ -5852,6 +5857,7 @@ static int mlx5e_probe(struct auxiliary_device *adev, mlx5e_devlink_port_type_eth_set(priv); mlx5e_dcbnl_init_app(priv); + mlx5_uplink_netdev_set(mdev, netdev); return 0; err_resume: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 9533085005c3..4cc902e0d71b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -44,6 +44,7 @@ #include "en_tc.h" #include "en/rep/tc.h" #include "en/rep/neigh.h" +#include "en/devlink.h" #include "fs_core.h" #include "lib/mlx5.h" #define CREATE_TRACE_POINTS @@ -588,26 +589,15 @@ static void 
mlx5e_build_rep_params(struct net_device *netdev) } static void mlx5e_build_rep_netdev(struct net_device *netdev, - struct mlx5_core_dev *mdev, - struct mlx5_eswitch_rep *rep) + struct mlx5_core_dev *mdev) { SET_NETDEV_DEV(netdev, mdev->device); - if (rep->vport == MLX5_VPORT_UPLINK) { - netdev->netdev_ops = &mlx5e_netdev_ops; - /* we want a persistent mac for the uplink rep */ - mlx5_query_mac_address(mdev, netdev->dev_addr); - netdev->ethtool_ops = &mlx5e_ethtool_ops; - mlx5e_dcbnl_build_rep_netdev(netdev); - } else { - netdev->netdev_ops = &mlx5e_netdev_ops_rep; - eth_hw_addr_random(netdev); - netdev->ethtool_ops = &mlx5e_rep_ethtool_ops; - } + netdev->netdev_ops = &mlx5e_netdev_ops_rep; + eth_hw_addr_random(netdev); + netdev->ethtool_ops = &mlx5e_rep_ethtool_ops; netdev->watchdog_timeo = 15 * HZ; - netdev->features |= NETIF_F_NETNS_LOCAL; - #if IS_ENABLED(CONFIG_MLX5_CLS_ACT) netdev->hw_features |= NETIF_F_HW_TC; #endif @@ -619,12 +609,9 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev, netdev->hw_features |= NETIF_F_TSO6; netdev->hw_features |= NETIF_F_RXCSUM; - if (rep->vport == MLX5_VPORT_UPLINK) - netdev->hw_features |= NETIF_F_HW_VLAN_CTAG_RX; - else - netdev->features |= NETIF_F_VLAN_CHALLENGED; - netdev->features |= netdev->hw_features; + netdev->features |= NETIF_F_VLAN_CHALLENGED; + netdev->features |= NETIF_F_NETNS_LOCAL; } static int mlx5e_init_rep(struct mlx5_core_dev *mdev, @@ -990,6 +977,14 @@ static void mlx5e_uplink_rep_enable(struct mlx5e_priv *priv) mlx5e_dcbnl_initialize(priv); mlx5e_dcbnl_init_app(priv); mlx5e_rep_neigh_init(rpriv); + + netdev->wanted_features |= NETIF_F_HW_TC; + + rtnl_lock(); + if (netif_running(netdev)) + mlx5e_open(netdev); + netif_device_attach(netdev); + rtnl_unlock(); } static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv) @@ -997,6 +992,12 @@ static void mlx5e_uplink_rep_disable(struct mlx5e_priv *priv) struct mlx5e_rep_priv *rpriv = priv->ppriv; struct mlx5_core_dev *mdev = priv->mdev; + 
rtnl_lock(); + if (netif_running(priv->netdev)) + mlx5e_close(priv->netdev); + netif_device_detach(priv->netdev); + rtnl_unlock(); + mlx5e_rep_neigh_cleanup(rpriv); mlx5e_dcbnl_delete_app(priv); mlx5_notifier_unregister(mdev, &priv->events_nb); @@ -1081,26 +1082,56 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = { /* e-Switch vport representors */ static int -mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct devlink_port *dl_port; + int err; + + rpriv->netdev = priv->netdev; + + err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, + rpriv); + if (err) + return err; + + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_eth_set(dl_port, rpriv->netdev); + + return 0; +} + +static void +mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) +{ + struct net_device *netdev = rpriv->netdev; + struct devlink_port *dl_port; + struct mlx5_core_dev *dev; + struct mlx5e_priv *priv; + + priv = netdev_priv(netdev); + dev = priv->mdev; + + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); + if (dl_port) + devlink_port_type_clear(dl_port); + mlx5e_netdev_attach_nic_profile(priv); +} + +static int +mlx5e_vport_vf_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { + struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); const struct mlx5e_profile *profile; - struct mlx5e_rep_priv *rpriv; struct devlink_port *dl_port; struct net_device *netdev; struct mlx5e_priv *priv; unsigned int txqs, rxqs; int nch, err; - rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL); - if (!rpriv) - return -ENOMEM; - - /* rpriv->rep to be looked up when profile->init() is called */ - rpriv->rep = rep; - - profile 
= (rep->vport == MLX5_VPORT_UPLINK) ? - &mlx5e_uplink_rep_profile : &mlx5e_rep_profile; - + profile = &mlx5e_rep_profile; nch = mlx5e_get_max_num_channels(dev); txqs = nch * profile->max_tc; rxqs = nch * profile->rq_groups; @@ -1109,21 +1140,11 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) mlx5_core_warn(dev, "Failed to create representor netdev for vport %d\n", rep->vport); - kfree(rpriv); return -EINVAL; } - mlx5e_build_rep_netdev(netdev, dev, rep); - + mlx5e_build_rep_netdev(netdev, dev); rpriv->netdev = netdev; - rep->rep_data[REP_ETH].priv = rpriv; - INIT_LIST_HEAD(&rpriv->vport_sqs_list); - - if (rep->vport == MLX5_VPORT_UPLINK) { - err = mlx5e_create_mdev_resources(dev); - if (err) - goto err_destroy_netdev; - } priv = netdev_priv(netdev); priv->profile = profile; @@ -1131,7 +1152,7 @@ mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) err = profile->init(dev, netdev); if (err) { netdev_warn(netdev, "rep profile init failed, %d\n", err); - goto err_destroy_mdev_resources; + goto err_destroy_netdev; } err = mlx5e_attach_netdev(netdev_priv(netdev)); @@ -1161,13 +1182,34 @@ err_detach_netdev: err_cleanup_profile: priv->profile->cleanup(priv); -err_destroy_mdev_resources: - if (rep->vport == MLX5_VPORT_UPLINK) - mlx5e_destroy_mdev_resources(dev); - err_destroy_netdev: mlx5e_destroy_netdev(netdev_priv(netdev)); - kfree(rpriv); + return err; +} + +static int +mlx5e_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) +{ + struct mlx5e_rep_priv *rpriv; + int err; + + rpriv = kzalloc(sizeof(*rpriv), GFP_KERNEL); + if (!rpriv) + return -ENOMEM; + + /* rpriv->rep to be looked up when profile->init() is called */ + rpriv->rep = rep; + rep->rep_data[REP_ETH].priv = rpriv; + INIT_LIST_HEAD(&rpriv->vport_sqs_list); + + if (rep->vport == MLX5_VPORT_UPLINK) + err = mlx5e_vport_uplink_rep_load(dev, rep); + else + err = mlx5e_vport_vf_rep_load(dev, rep); + + if (err) + kfree(rpriv); + return err; 
} @@ -1181,15 +1223,19 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) struct devlink_port *dl_port; void *ppriv = priv->ppriv; + if (rep->vport == MLX5_VPORT_UPLINK) { + mlx5e_vport_uplink_rep_unload(rpriv); + goto free_ppriv; + } + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); if (dl_port) devlink_port_type_clear(dl_port); unregister_netdev(netdev); mlx5e_detach_netdev(priv); priv->profile->cleanup(priv); - if (rep->vport == MLX5_VPORT_UPLINK) - mlx5e_destroy_mdev_resources(priv->mdev); mlx5e_destroy_netdev(priv); +free_ppriv: kfree(ppriv); /* mlx5e_rep_priv */ } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index d046db7bb047..2f536c5d30b1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -95,4 +95,13 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) return devlink_net(priv_to_devlink(dev)); } +static inline void mlx5_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev) +{ + mdev->mlx5e_res.uplink_netdev = netdev; +} + +static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) +{ + return mdev->mlx5e_res.uplink_netdev; +} #endif diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f1d0340e46a7..23bb01d7c9b9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -651,6 +651,7 @@ struct mlx5e_resources { struct mlx5_sq_bfreg bfreg; } hw_objs; struct devlink_port dl_port; + struct net_device *uplink_netdev; }; enum mlx5_sw_icm_type { -- cgit v1.2.3 From 7888fe53b7066c284e172d98d98d1865d6a9e5a0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 16 Mar 2021 17:30:36 -0700 Subject: ethtool: Add common function for filling out strings Add a function to handle the common pattern of printing a string into the ethtool strings interface and incrementing the string pointer 
by the ETH_GSTRING_LEN. Most of the drivers end up doing this and several have implemented their own versions of this function so it would make sense to consolidate on one implementation. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/ethtool.h | 9 +++++++++ net/ethtool/ioctl.c | 12 ++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index ec4cd3921c67..3583f7fc075c 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -571,4 +571,13 @@ struct ethtool_phy_ops { */ void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops); +/** + * ethtool_sprintf - Write formatted string to ethtool string data + * @data: Pointer to start of string to update + * @fmt: Format of string to write + * + * Write formatted string to data. Update data to point at start of + * next string. + */ +extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); #endif /* _LINUX_ETHTOOL_H */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 24783b71c584..0788cc3b3114 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1844,6 +1844,18 @@ out: return ret; } +__printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsnprintf(*data, ETH_GSTRING_LEN, fmt, args); + va_end(args); + + *data += ETH_GSTRING_LEN; +} +EXPORT_SYMBOL(ethtool_sprintf); + static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; -- cgit v1.2.3 From 964dbf186eaa84d409c359ddf09c827a3fbe8228 Mon Sep 17 00:00:00 2001 From: Álvaro Fernández Rojas Date: Wed, 17 Mar 2021 11:29:26 +0100 Subject: net: dsa: tag_brcm: add support for legacy tags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for legacy Broadcom tags, which are similar to DSA_TAG_PROTO_BRCM. 
These tags are used on BCM5325, BCM5365 and BCM63xx switches. Signed-off-by: Álvaro Fernández Rojas Signed-off-by: David S. Miller --- include/net/dsa.h | 2 + net/dsa/Kconfig | 7 ++++ net/dsa/tag_brcm.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 113 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 83a933e563fe..dac303edd33d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -49,10 +49,12 @@ struct phylink_link_state; #define DSA_TAG_PROTO_XRS700X_VALUE 19 #define DSA_TAG_PROTO_OCELOT_8021Q_VALUE 20 #define DSA_TAG_PROTO_SEVILLE_VALUE 21 +#define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, + DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 58b8fc82cd3c..aaf8a452fd5b 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -48,6 +48,13 @@ config NET_DSA_TAG_BRCM Say Y if you want to enable support for tagging frames for the Broadcom switches which place the tag after the MAC source address. +config NET_DSA_TAG_BRCM_LEGACY + tristate "Tag driver for Broadcom legacy switches using in-frame headers" + select NET_DSA_TAG_BRCM_COMMON + help + Say Y if you want to enable support for tagging frames for the + Broadcom legacy switches which place the tag after the MAC source + address. 
config NET_DSA_TAG_BRCM_PREPEND tristate "Tag driver for Broadcom switches using prepended headers" diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index e2577a7dcbca..40e9f3098c8d 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -12,9 +12,26 @@ #include "dsa_priv.h" -/* This tag length is 4 bytes, older ones were 6 bytes, we do not - * handle them - */ +/* Legacy Broadcom tag (6 bytes) */ +#define BRCM_LEG_TAG_LEN 6 + +/* Type fields */ +/* 1st byte in the tag */ +#define BRCM_LEG_TYPE_HI 0x88 +/* 2nd byte in the tag */ +#define BRCM_LEG_TYPE_LO 0x74 + +/* Tag fields */ +/* 3rd byte in the tag */ +#define BRCM_LEG_UNICAST (0 << 5) +#define BRCM_LEG_MULTICAST (1 << 5) +#define BRCM_LEG_EGRESS (2 << 5) +#define BRCM_LEG_INGRESS (3 << 5) + +/* 6th byte in the tag */ +#define BRCM_LEG_PORT_ID (0xf) + +/* Newer Broadcom tag (4 bytes) */ #define BRCM_TAG_LEN 4 /* Tag is constructed and desconstructed using byte by byte access @@ -195,6 +212,87 @@ DSA_TAG_DRIVER(brcm_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM); #endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) +static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + u8 *brcm_tag; + + /* The Ethernet switch we are interfaced with needs packets to be at + * least 64 bytes (including FCS) otherwise they will be discarded when + * they enter the switch port logic. When Broadcom tags are enabled, we + * need to make sure that packets are at least 70 bytes + * (including FCS and tag) because the length verification is done after + * the Broadcom tag is stripped off the ingress packet. 
+ * + * Let dsa_slave_xmit() free the SKB + */ + if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false)) + return NULL; + + skb_push(skb, BRCM_LEG_TAG_LEN); + + memmove(skb->data, skb->data + BRCM_LEG_TAG_LEN, 2 * ETH_ALEN); + + brcm_tag = skb->data + 2 * ETH_ALEN; + + /* Broadcom tag type */ + brcm_tag[0] = BRCM_LEG_TYPE_HI; + brcm_tag[1] = BRCM_LEG_TYPE_LO; + + /* Broadcom tag value */ + brcm_tag[2] = BRCM_LEG_EGRESS; + brcm_tag[3] = 0; + brcm_tag[4] = 0; + brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID; + + return skb; +} + +static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt) +{ + int source_port; + u8 *brcm_tag; + + if (unlikely(!pskb_may_pull(skb, BRCM_LEG_PORT_ID))) + return NULL; + + brcm_tag = skb->data - 2; + + source_port = brcm_tag[5] & BRCM_LEG_PORT_ID; + + skb->dev = dsa_master_find_slave(dev, 0, source_port); + if (!skb->dev) + return NULL; + + /* Remove Broadcom tag and update checksum */ + skb_pull_rcsum(skb, BRCM_LEG_TAG_LEN); + + skb->offload_fwd_mark = 1; + + /* Move the Ethernet DA and SA */ + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - BRCM_LEG_TAG_LEN, + 2 * ETH_ALEN); + + return skb; +} + +static const struct dsa_device_ops brcm_legacy_netdev_ops = { + .name = "brcm-legacy", + .proto = DSA_TAG_PROTO_BRCM_LEGACY, + .xmit = brcm_leg_tag_xmit, + .rcv = brcm_leg_tag_rcv, + .overhead = BRCM_LEG_TAG_LEN, +}; + +DSA_TAG_DRIVER(brcm_legacy_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY); +#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */ + #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, struct net_device *dev) @@ -227,6 +325,9 @@ static struct dsa_tag_driver *dsa_tag_driver_array[] = { #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM) &DSA_TAG_DRIVER_NAME(brcm_netdev_ops), #endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) + &DSA_TAG_DRIVER_NAME(brcm_legacy_netdev_ops), +#endif #if 
IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) &DSA_TAG_DRIVER_NAME(brcm_prepend_netdev_ops), #endif -- cgit v1.2.3 From d4a96be65423296e42091b0b79973b8d446e7798 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 11 Mar 2021 13:55:59 +0800 Subject: netfilter: conntrack: Remove unused variable declaration Commit e97c3e278e95 ("tproxy: split off ipv6 defragmentation to a separate module") left this behind. Signed-off-by: YueHaibing Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv6/nf_conntrack_ipv6.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index 7b3c873f8839..e95483192d1b 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -4,7 +4,4 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6; -#include <linux/sysctl.h> -extern struct ctl_table nf_ct_ipv6_sysctl_table[]; - #endif /* _NF_CONNTRACK_IPV6_H*/ -- cgit v1.2.3 From 4f08f173d08cad4664e447e580dc0c5aa6332db3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Mar 2021 15:55:11 +0100 Subject: netfilter: flowtable: move FLOW_OFFLOAD_DIR_MAX away from enumeration This allows removing the default case, which should never happen and was added only to avoid gcc warnings on the unhandled FLOW_OFFLOAD_DIR_MAX enumeration case. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 +- net/netfilter/nf_flow_table_core.c | 4 ---- net/netfilter/nf_flow_table_ip.c | 8 -------- 3 files changed, 1 insertion(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 54c4d5c908a5..ce507251b3d8 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -86,8 +86,8 @@ static inline bool nf_flowtable_hw_offload(struct nf_flowtable *flowtable) enum flow_offload_tuple_dir { FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL, FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY, - FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX }; +#define FLOW_OFFLOAD_DIR_MAX IP_CT_DIR_MAX struct flow_offload_tuple { union { diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index ff5d94b644ac..3bdbd962a084 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -454,8 +454,6 @@ int nf_flow_snat_port(const struct flow_offload *flow, new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; hdr->dest = new_port; break; - default: - return -1; } return nf_flow_nat_port(skb, thoff, protocol, port, new_port); @@ -482,8 +480,6 @@ int nf_flow_dnat_port(const struct flow_offload *flow, new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; hdr->source = new_port; break; - default: - return -1; } return nf_flow_nat_port(skb, thoff, protocol, port, new_port); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 95adf74515ea..0579e15c4968 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -96,8 +96,6 @@ static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; iph->daddr = new_addr; break; - default: - return -1; } csum_replace4(&iph->check, addr, 
new_addr); @@ -121,8 +119,6 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; iph->saddr = new_addr; break; - default: - return -1; } csum_replace4(&iph->check, addr, new_addr); @@ -371,8 +367,6 @@ static int nf_flow_snat_ipv6(const struct flow_offload *flow, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; ip6h->daddr = new_addr; break; - default: - return -1; } return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); @@ -396,8 +390,6 @@ static int nf_flow_dnat_ipv6(const struct flow_offload *flow, new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; ip6h->saddr = new_addr; break; - default: - return -1; } return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); -- cgit v1.2.3 From f4401262b927b84d2f1861e347627fa0d77d4eb7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 17 Mar 2021 15:55:25 +0100 Subject: netfilter: flowtable: fast NAT functions never fail Simplify existing fast NAT routines by returning void. After the skb_try_make_writable() call consolidation, these routines cannot ever fail. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 12 +-- net/netfilter/nf_flow_table_core.c | 41 ++++------ net/netfilter/nf_flow_table_ip.c | 147 ++++++++++++++-------------------- 3 files changed, 84 insertions(+), 116 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index ce507251b3d8..fb165697c8a1 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -229,12 +229,12 @@ void nf_flow_table_free(struct nf_flowtable *flow_table); void flow_offload_teardown(struct flow_offload *flow); -int nf_flow_snat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir); -int nf_flow_dnat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir); +void nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); +void nf_flow_dnat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir); struct flow_ports { __be16 source, dest; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 3bdbd962a084..8ffd3f3c288c 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -389,20 +389,17 @@ static void nf_flow_offload_work_gc(struct work_struct *work) queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); } - -static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, - __be16 port, __be16 new_port) +static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) { struct tcphdr *tcph; tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); - - 
return 0; } -static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, - __be16 port, __be16 new_port) +static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, + __be16 port, __be16 new_port) { struct udphdr *udph; @@ -413,30 +410,24 @@ static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, if (!udph->check) udph->check = CSUM_MANGLED_0; } - - return 0; } -static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, - u8 protocol, __be16 port, __be16 new_port) +static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, + u8 protocol, __be16 port, __be16 new_port) { switch (protocol) { case IPPROTO_TCP: - if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0) - return NF_DROP; + nf_flow_nat_port_tcp(skb, thoff, port, new_port); break; case IPPROTO_UDP: - if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0) - return NF_DROP; + nf_flow_nat_port_udp(skb, thoff, port, new_port); break; } - - return 0; } -int nf_flow_snat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir) +void nf_flow_snat_port(const struct flow_offload *flow, + struct sk_buff *skb, unsigned int thoff, + u8 protocol, enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; @@ -456,13 +447,13 @@ int nf_flow_snat_port(const struct flow_offload *flow, break; } - return nf_flow_nat_port(skb, thoff, protocol, port, new_port); + nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_snat_port); -int nf_flow_dnat_port(const struct flow_offload *flow, - struct sk_buff *skb, unsigned int thoff, - u8 protocol, enum flow_offload_tuple_dir dir) +void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb, + unsigned int thoff, u8 protocol, + enum flow_offload_tuple_dir dir) { struct flow_ports *hdr; __be16 port, new_port; @@ -482,7 +473,7 @@ int nf_flow_dnat_port(const struct flow_offload 
*flow, break; } - return nf_flow_nat_port(skb, thoff, protocol, port, new_port); + nf_flow_nat_port(skb, thoff, protocol, port, new_port); } EXPORT_SYMBOL_GPL(nf_flow_dnat_port); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 0579e15c4968..714dc083f093 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -34,19 +34,17 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto, return 0; } -static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) +static void nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) { struct tcphdr *tcph; tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); - - return 0; } -static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, - __be32 addr, __be32 new_addr) +static void nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, + __be32 addr, __be32 new_addr) { struct udphdr *udph; @@ -57,31 +55,25 @@ static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, if (!udph->check) udph->check = CSUM_MANGLED_0; } - - return 0; } -static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, - unsigned int thoff, __be32 addr, - __be32 new_addr) +static void nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, __be32 addr, + __be32 new_addr) { switch (iph->protocol) { case IPPROTO_TCP: - if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr); break; case IPPROTO_UDP: - if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ip_udp(skb, thoff, addr, new_addr); break; } - - return 0; } -static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) 
+static void nf_flow_snat_ip(const struct flow_offload *flow, + struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, enum flow_offload_tuple_dir dir) { __be32 addr, new_addr; @@ -99,12 +91,12 @@ static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, } csum_replace4(&iph->check, addr, new_addr); - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); + nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); } -static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, - struct iphdr *iph, unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_dnat_ip(const struct flow_offload *flow, + struct sk_buff *skb, struct iphdr *iph, + unsigned int thoff, enum flow_offload_tuple_dir dir) { __be32 addr, new_addr; @@ -122,24 +114,21 @@ static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, } csum_replace4(&iph->check, addr, new_addr); - return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); + nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); } -static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, +static void nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, enum flow_offload_tuple_dir dir, struct iphdr *iph) { - if (test_bit(NF_FLOW_SNAT, &flow->flags) && - (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0)) - return -1; - - if (test_bit(NF_FLOW_DNAT, &flow->flags) && - (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 || - nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0)) - return -1; - - return 0; + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir); + nf_flow_snat_ip(flow, skb, iph, thoff, dir); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir); + nf_flow_dnat_ip(flow, skb, iph, thoff, dir); + } } 
static bool ip_has_options(unsigned int thoff) @@ -276,8 +265,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return NF_DROP; iph = ip_hdr(skb); - if (nf_flow_nat_ip(flow, skb, thoff, dir, iph) < 0) - return NF_DROP; + nf_flow_nat_ip(flow, skb, thoff, dir, iph); ip_decrease_ttl(iph); skb->tstamp = 0; @@ -301,22 +289,21 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); -static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) +static void nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr, + struct ipv6hdr *ip6h) { struct tcphdr *tcph; tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, new_addr->s6_addr32, true); - - return 0; } -static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, - struct in6_addr *addr, - struct in6_addr *new_addr) +static void nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, + struct in6_addr *addr, + struct in6_addr *new_addr) { struct udphdr *udph; @@ -327,32 +314,26 @@ static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, if (!udph->check) udph->check = CSUM_MANGLED_0; } - - return 0; } -static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, struct in6_addr *addr, - struct in6_addr *new_addr) +static void nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, struct in6_addr *addr, + struct in6_addr *new_addr) { switch (ip6h->nexthdr) { case IPPROTO_TCP: - if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr, ip6h); break; case IPPROTO_UDP: - if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0) - return NF_DROP; + nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr); break; } - - 
return 0; } -static int nf_flow_snat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_snat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) { struct in6_addr addr, new_addr; @@ -369,13 +350,13 @@ static int nf_flow_snat_ipv6(const struct flow_offload *flow, break; } - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); + nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); } -static int nf_flow_dnat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, struct ipv6hdr *ip6h, - unsigned int thoff, - enum flow_offload_tuple_dir dir) +static void nf_flow_dnat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int thoff, + enum flow_offload_tuple_dir dir) { struct in6_addr addr, new_addr; @@ -392,27 +373,24 @@ static int nf_flow_dnat_ipv6(const struct flow_offload *flow, break; } - return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); + nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); } -static int nf_flow_nat_ipv6(const struct flow_offload *flow, - struct sk_buff *skb, - enum flow_offload_tuple_dir dir, - struct ipv6hdr *ip6h) +static void nf_flow_nat_ipv6(const struct flow_offload *flow, + struct sk_buff *skb, + enum flow_offload_tuple_dir dir, + struct ipv6hdr *ip6h) { unsigned int thoff = sizeof(*ip6h); - if (test_bit(NF_FLOW_SNAT, &flow->flags) && - (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) - return -1; - - if (test_bit(NF_FLOW_DNAT, &flow->flags) && - (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 || - nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0)) - return -1; - - return 0; + if (test_bit(NF_FLOW_SNAT, &flow->flags)) { + nf_flow_snat_port(flow, skb, thoff, 
ip6h->nexthdr, dir); + nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir); + } + if (test_bit(NF_FLOW_DNAT, &flow->flags)) { + nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir); + nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir); + } } static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, @@ -507,8 +485,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, return NF_DROP; ip6h = ipv6_hdr(skb); - if (nf_flow_nat_ipv6(flow, skb, dir, ip6h) < 0) - return NF_DROP; + nf_flow_nat_ipv6(flow, skb, dir, ip6h); ip6h->hop_limit--; skb->tstamp = 0; -- cgit v1.2.3 From 0ce7cf4127f14078ca598ba9700d813178a59409 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 18 Mar 2021 01:25:05 +0100 Subject: netfilter: nftables: update table flags from the commit phase Do not update table flags from the preparation phase. Store the flags update into the transaction, then update the flags from the commit phase. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 9 ++++++--- net/netfilter/nf_tables_api.c | 31 ++++++++++++++++--------------- 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index fdec57d862b7..67bc36f7f4fb 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1498,13 +1498,16 @@ struct nft_trans_chain { struct nft_trans_table { bool update; - bool enable; + u8 state; + u32 flags; }; #define nft_trans_table_update(trans) \ (((struct nft_trans_table *)trans->data)->update) -#define nft_trans_table_enable(trans) \ - (((struct nft_trans_table *)trans->data)->enable) +#define nft_trans_table_state(trans) \ + (((struct nft_trans_table *)trans->data)->state) +#define nft_trans_table_flags(trans) \ + (((struct nft_trans_table *)trans->data)->flags) struct nft_trans_elem { struct nft_set *set; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 
083c112bee0b..bd5e8122ea5e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -900,6 +900,12 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table) nft_table_disable(net, table, 0); } +enum { + NFT_TABLE_STATE_UNCHANGED = 0, + NFT_TABLE_STATE_DORMANT, + NFT_TABLE_STATE_WAKEUP +}; + static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; @@ -929,19 +935,17 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if ((flags & NFT_TABLE_F_DORMANT) && !(ctx->table->flags & NFT_TABLE_F_DORMANT)) { - nft_trans_table_enable(trans) = false; + nft_trans_table_state(trans) = NFT_TABLE_STATE_DORMANT; } else if (!(flags & NFT_TABLE_F_DORMANT) && ctx->table->flags & NFT_TABLE_F_DORMANT) { - ctx->table->flags &= ~NFT_TABLE_F_DORMANT; ret = nf_tables_table_enable(ctx->net, ctx->table); if (ret >= 0) - nft_trans_table_enable(trans) = true; - else - ctx->table->flags |= NFT_TABLE_F_DORMANT; + nft_trans_table_state(trans) = NFT_TABLE_STATE_WAKEUP; } if (ret < 0) goto err; + nft_trans_table_flags(trans) = flags; nft_trans_table_update(trans) = true; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; @@ -8068,11 +8072,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (!nft_trans_table_enable(trans)) { - nf_tables_table_disable(net, - trans->ctx.table); - trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; - } + if (nft_trans_table_state(trans) == NFT_TABLE_STATE_DORMANT) + nf_tables_table_disable(net, trans->ctx.table); + + trans->ctx.table->flags = nft_trans_table_flags(trans); } else { nft_clear(net, trans->ctx.table); } @@ -8283,11 +8286,9 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) switch (trans->msg_type) { case NFT_MSG_NEWTABLE: if (nft_trans_table_update(trans)) { - if (nft_trans_table_enable(trans)) { - nf_tables_table_disable(net, - 
trans->ctx.table); - trans->ctx.table->flags |= NFT_TABLE_F_DORMANT; - } + if (nft_trans_table_state(trans) == NFT_TABLE_STATE_WAKEUP) + nf_tables_table_disable(net, trans->ctx.table); + nft_trans_destroy(trans); } else { list_del_rcu(&trans->ctx.table->list); -- cgit v1.2.3 From 97a19caf1b1f6a9d4f620a9d51405a1973bd4641 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 17 Mar 2021 10:41:32 -0700 Subject: bpf: net: Emit anonymous enum with BPF_TCP_CLOSE value explicitly The selftest failed to compile with clang-built bpf-next. Adding LLVM=1 to your vmlinux and selftest build will use clang. The error message is: progs/test_sk_storage_tracing.c:38:18: error: use of undeclared identifier 'BPF_TCP_CLOSE' if (newstate == BPF_TCP_CLOSE) ^ 1 error generated. make: *** [Makefile:423: /bpf-next/tools/testing/selftests/bpf/test_sk_storage_tracing.o] Error 1 The reason for the failure is that BPF_TCP_CLOSE, a value of an anonymous enum defined in uapi bpf.h, is not defined in vmlinux.h. gcc does not have this problem. Since vmlinux.h is derived from BTF which is derived from vmlinux DWARF, that means gcc-produced vmlinux DWARF has BPF_TCP_CLOSE while llvm-produced vmlinux DWARF does not. BPF_TCP_CLOSE is referenced in net/ipv4/tcp.c as BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); The following test mimics the above BUILD_BUG_ON, preprocessed with clang compiler, and shows gcc DWARF contains BPF_TCP_CLOSE while llvm DWARF does not. $ cat t.c enum { BPF_TCP_ESTABLISHED = 1, BPF_TCP_CLOSE = 7, }; enum { TCP_ESTABLISHED = 1, TCP_CLOSE = 7, }; int test() { do { extern void __compiletime_assert_767(void) ; if ((int)BPF_TCP_CLOSE != (int)TCP_CLOSE) __compiletime_assert_767(); } while (0); return 0; } $ clang t.c -O2 -c -g && llvm-dwarfdump t.o | grep BPF_TCP_CLOSE $ gcc t.c -O2 -c -g && llvm-dwarfdump t.o | grep BPF_TCP_CLOSE DW_AT_name ("BPF_TCP_CLOSE") Further checking of the clang code finds that clang actually tries to evaluate the condition at compile time.
If it is definitely true/false, it will perform optimization and the whole if condition will be removed before generating IR/debuginfo. This patch explicitly adds an expression after the above mentioned BUILD_BUG_ON in net/ipv4/tcp.c like (void)BPF_TCP_ESTABLISHED to enable generation of debuginfo for the anonymous enum which also includes BPF_TCP_CLOSE. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210317174132.589276-1-yhs@fb.com --- include/linux/btf.h | 1 + net/ipv4/tcp.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 7fabf1428093..9c1b52738bbe 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -9,6 +9,7 @@ #include #define BTF_TYPE_EMIT(type) ((void)(type *)0) +#define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val) struct btf; struct btf_member; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index de7cc8445ac0..e14fd0c50c10 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -267,6 +267,7 @@ #include #include #include +#include #include #include @@ -2587,6 +2588,17 @@ void tcp_set_state(struct sock *sk, int state) BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); + /* bpf uapi header bpf.h defines an anonymous enum with values + * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux + * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. + * But clang built vmlinux does not have this enum in DWARF + * since clang removes the above code before generating IR/debuginfo. + * Let us explicitly emit the type debuginfo to ensure the + * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF + * regardless of which compiler is used.
+ */ + BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); + if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); -- cgit v1.2.3 From 255c04a87f4381849fce9ed81e5efabf78a71a30 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:43 +0100 Subject: net: embed num_tc in the xps maps The xps cpus/rxqs map is accessed using dev->num_tc, which is used when allocating the map. But later updates of dev->num_tc can lead to having a mismatch between the maps and how they're accessed. In such cases the map values do not make any sense and out of bound accesses can occur (that can be easily seen using KASAN). This patch aims at fixing this by embedding num_tc into the maps, using the value at the time the map is created. This brings two improvements: - The maps can be accessed using the embedded num_tc, so we know for sure we won't have out of bound accesses. - Checks can be made before accessing the maps so we know the values retrieved will make sense. We also update __netif_set_xps_queue to conditionally copy old maps from dev_maps in the new one only if the number of traffic classes from both maps match. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++++ net/core/dev.c | 63 ++++++++++++++++++++++++++++++----------------- net/core/net-sysfs.c | 45 +++++++++++++-------------------- 3 files changed, 64 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 97254c089eb2..c38534c55ea1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -771,9 +771,15 @@ struct xps_map { /* * This structure holds all XPS maps for device. Maps are indexed by CPU. + * + * We keep track of the number of traffic classes used when the struct is + * allocated, in num_tc. 
This will be used to navigate the maps, to ensure we're + * not crossing its upper bound, as the original dev->num_tc can be updated in + * the meantime. */ struct xps_dev_maps { struct rcu_head rcu; + s16 num_tc; struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */ }; diff --git a/net/core/dev.c b/net/core/dev.c index 6bc20eabd2b0..4e29d1994fdd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2491,7 +2491,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { - int num_tc = dev->num_tc ? : 1; + int num_tc = dev_maps->num_tc; bool active = false; int tci; @@ -2634,10 +2634,10 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, { const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - bool active = false; unsigned int nr_ids; if (dev->num_tc) { @@ -2672,19 +2672,29 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; + /* The old dev_maps could be larger or smaller than the one we're + * setting up now, as dev->num_tc could have been updated in between. We + * could try to be smart, but let's be safe instead and only copy + * foreign traffic classes if the two map sizes match. 
+ */ + if (dev_maps && dev_maps->num_tc == num_tc) + copy = true; + /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { - if (!new_dev_maps) - new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { - mutex_unlock(&xps_map_mutex); - return -ENOMEM; + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; - map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : - NULL; + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, is_rxqs_map); if (!map) @@ -2706,7 +2716,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { + for (i = tc, tci = j * num_tc; copy && i--; tci++) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); @@ -2736,14 +2746,14 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, numa_node_id = -1; } #endif - } else if (dev_maps) { + } else if (copy) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* copy maps belonging to foreign traffic classes */ - for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { + for (i = num_tc - tc, tci++; copy && --i; tci++) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); @@ -2761,11 +2771,18 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, for (j = -1; j = 
netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { - for (i = num_tc, tci = j * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); - if (map && map != new_map) - kfree_rcu(map, rcu); + if (!map) + continue; + + if (copy) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + if (map == new_map) + continue; + } + + kfree_rcu(map, rcu); } } @@ -2789,12 +2806,12 @@ out_no_new_maps: /* removes tx-queue from unused CPUs/rx-queues */ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { - for (i = tc, tci = j * num_tc; i--; tci++) + for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); if (!netif_attr_test_mask(j, mask, nr_ids) || !netif_attr_test_online(j, online_mask, nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); - for (i = num_tc - tc, tci++; --i; tci++) + for (i = dev_maps->num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); } @@ -2812,7 +2829,7 @@ error: j < nr_ids;) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); - map = dev_maps ? + map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) @@ -3944,13 +3961,15 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { + int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tc >= dev_maps->num_tc) + return queue_index; + + tci *= dev_maps->num_tc; + tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5f76183ad5bc..1364d0f39cb0 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1364,9 +1364,9 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { - int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + int j, len, ret, tc = 0; unsigned long *mask; unsigned int index; @@ -1378,22 +1378,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, if (!rtnl_trylock()) return restart_syscall(); - if (dev->num_tc) { - /* Do not allow XPS on subordinate device directly */ - num_tc = dev->num_tc; - if (num_tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } - - /* If queue belongs to subordinate dev use its map */ - dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? 
: dev; - tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; } mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL); @@ -1404,12 +1395,12 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); - if (!dev_maps) + if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; for (j = -1; j = netif_attrmask_next(j, NULL, nr_cpu_ids), j < nr_cpu_ids;) { - int i, tci = j * num_tc + tc; + int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; map = rcu_dereference(dev_maps->attr_map[tci]); @@ -1480,9 +1471,9 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { - int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + int j, len, ret, tc = 0; unsigned long *mask; unsigned int index; @@ -1491,14 +1482,12 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) if (!rtnl_trylock()) return restart_syscall(); - if (dev->num_tc) { - num_tc = dev->num_tc; - tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; } + mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); if (!mask) { ret = -ENOMEM; @@ -1507,12 +1496,12 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); - if (!dev_maps) + if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), j < dev->num_rx_queues;) { - int i, tci = j * num_tc + tc; + int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; map = rcu_dereference(dev_maps->attr_map[tci]); -- cgit v1.2.3 From 
5478fcd0f48322e04ae6c173ad3a1959e066dc83 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:44 +0100 Subject: net: embed nr_ids in the xps maps Embed nr_ids (the number of CPUs for the xps cpus map, and the number of rxqs for the xps rxqs map) in dev_maps. That will help avoid accessing out-of-bounds memory if those values change after dev_maps was allocated. Suggested-by: Alexander Duyck Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ net/core/dev.c | 45 +++++++++++++++++++++------------------------ net/core/net-sysfs.c | 38 ++++++++++++++++++++++---------------- 3 files changed, 47 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c38534c55ea1..09e73f5a8c78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -772,6 +772,9 @@ struct xps_map { /* * This structure holds all XPS maps for device. Maps are indexed by CPU. * + * We keep track of the number of cpus/rxqs used when the struct is allocated, + * in nr_ids. This will help not accessing out-of-bound memory. + * * We keep track of the number of traffic classes used when the struct is * allocated, in num_tc.
This will be used to navigate the maps, to ensure we're * not crossing its upper bound, as the original dev->num_tc can be updated in @@ -779,6 +782,7 @@ struct xps_map { */ struct xps_dev_maps { struct rcu_head rcu; + unsigned int nr_ids; s16 num_tc; struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */ }; diff --git a/net/core/dev.c b/net/core/dev.c index 4e29d1994fdd..7530c95970a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2524,14 +2524,14 @@ static void reset_xps_maps(struct net_device *dev, } static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, - struct xps_dev_maps *dev_maps, unsigned int nr_ids, - u16 offset, u16 count, bool is_rxqs_map) + struct xps_dev_maps *dev_maps, u16 offset, u16 count, + bool is_rxqs_map) { + unsigned int nr_ids = dev_maps->nr_ids; bool active = false; int i, j; - for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), - j < nr_ids;) + for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), j < nr_ids;) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) @@ -2551,7 +2551,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, { const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; - unsigned int nr_ids; if (!static_key_false(&xps_needed)) return; @@ -2561,11 +2560,9 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (static_key_false(&xps_rxqs_needed)) { dev_maps = xmap_dereference(dev->xps_rxqs_map); - if (dev_maps) { - nr_ids = dev->num_rx_queues; - clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, - offset, count, true); - } + if (dev_maps) + clean_xps_maps(dev, possible_mask, dev_maps, offset, + count, true); } dev_maps = xmap_dereference(dev->xps_cpus_map); @@ -2574,9 +2571,7 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (num_possible_cpus() > 1) possible_mask = cpumask_bits(cpu_possible_mask); - nr_ids = nr_cpu_ids; - clean_xps_maps(dev, possible_mask, 
dev_maps, nr_ids, offset, count, - false); + clean_xps_maps(dev, possible_mask, dev_maps, offset, count, false); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2673,11 +2668,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, maps_sz = L1_CACHE_BYTES; /* The old dev_maps could be larger or smaller than the one we're - * setting up now, as dev->num_tc could have been updated in between. We - * could try to be smart, but let's be safe instead and only copy - * foreign traffic classes if the two map sizes match. + * setting up now, as dev->num_tc or nr_ids could have been updated in + * between. We could try to be smart, but let's be safe instead and only + * copy foreign traffic classes if the two map sizes match. */ - if (dev_maps && dev_maps->num_tc == num_tc) + if (dev_maps && + dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) copy = true; /* allocate memory for queue storage */ @@ -2690,6 +2686,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, return -ENOMEM; } + new_dev_maps->nr_ids = nr_ids; new_dev_maps->num_tc = num_tc; } @@ -2770,7 +2767,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, goto out_no_old_maps; for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + j < dev_maps->nr_ids;) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) @@ -2804,12 +2801,12 @@ out_no_new_maps: goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + for (j = -1; j = netif_attrmask_next(j, possible_mask, dev_maps->nr_ids), + j < dev_maps->nr_ids;) { for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); - if (!netif_attr_test_mask(j, mask, nr_ids) || - !netif_attr_test_online(j, online_mask, nr_ids)) + if (!netif_attr_test_mask(j, mask, 
dev_maps->nr_ids) || + !netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); for (i = dev_maps->num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); @@ -3965,7 +3962,7 @@ static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_map *map; int queue_index = -1; - if (tc >= dev_maps->num_tc) + if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) return queue_index; tci *= dev_maps->num_tc; @@ -4004,7 +4001,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, if (dev_maps) { int tci = sk_rx_queue_get(sk); - if (tci >= 0 && tci < dev->num_rx_queues) + if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1364d0f39cb0..bb08bdc88fa9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1366,9 +1366,9 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, { struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + unsigned int index, nr_ids; int j, len, ret, tc = 0; unsigned long *mask; - unsigned int index; if (!netif_is_multiqueue(dev)) return -ENOENT; @@ -1387,19 +1387,20 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, goto err_rtnl_unlock; } - mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL); + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_cpus_map); + nr_ids = dev_maps ? 
dev_maps->nr_ids : nr_cpu_ids; + + mask = bitmap_zalloc(nr_ids, GFP_KERNEL); if (!mask) { ret = -ENOMEM; - goto err_rtnl_unlock; + goto err_rcu_unlock; } - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_cpus_map); if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, nr_cpu_ids), - j < nr_cpu_ids;) { + for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; @@ -1419,10 +1420,12 @@ out_no_maps: rtnl_unlock(); - len = bitmap_print_to_pagebuf(false, buf, mask, nr_cpu_ids); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; +err_rcu_unlock: + rcu_read_unlock(); err_rtnl_unlock: rtnl_unlock(); return ret; @@ -1473,9 +1476,9 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + unsigned int index, nr_ids; int j, len, ret, tc = 0; unsigned long *mask; - unsigned int index; index = get_netdev_queue_index(queue); @@ -1488,19 +1491,20 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) goto err_rtnl_unlock; } - mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_rxqs_map); + nr_ids = dev_maps ? 
dev_maps->nr_ids : dev->num_rx_queues; + + mask = bitmap_zalloc(nr_ids, GFP_KERNEL); if (!mask) { ret = -ENOMEM; - goto err_rtnl_unlock; + goto err_rcu_unlock; } - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_rxqs_map); if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), - j < dev->num_rx_queues;) { + for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; @@ -1520,11 +1524,13 @@ out_no_maps: rtnl_unlock(); - len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; +err_rcu_unlock: + rcu_read_unlock(); err_rtnl_unlock: rtnl_unlock(); return ret; -- cgit v1.2.3 From 044ab86d431b59b88966457dbb62679f274ec442 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:46 +0100 Subject: net: move the xps maps to an array Move the xps maps (xps_cpus_map and xps_rxqs_map) to an array in net_device. That will simplify a lot the code removing the need for lots of if/else conditionals as the correct map will be available using its offset in the array. This should not modify the xps maps behaviour in any way. Suggested-by: Alexander Duyck Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 2 +- include/linux/netdevice.h | 17 +++++++---- net/core/dev.c | 73 ++++++++++++++++++++--------------------------- net/core/net-sysfs.c | 6 ++-- 4 files changed, 46 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 77ba8e2fc11c..584a9bd59dda 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2015,7 +2015,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi) } virtqueue_set_affinity(vi->rq[i].vq, mask); virtqueue_set_affinity(vi->sq[i].vq, mask); - __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, false); + __netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS); cpumask_clear(mask); } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 09e73f5a8c78..4940509999be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -754,6 +754,13 @@ struct rx_queue_attribute { const char *buf, size_t len); }; +/* XPS map type and offset of the xps map within net_device->xps_maps[]. */ +enum xps_map_type { + XPS_CPUS = 0, + XPS_RXQS, + XPS_MAPS_MAX, +}; + #ifdef CONFIG_XPS /* * This structure holds an XPS map which can be of variable length. 
The @@ -1773,8 +1780,7 @@ enum netdev_ml_priv_type { * @tx_queue_len: Max frames per queue allowed * @tx_global_lock: XXX: need comments on this one * @xdp_bulkq: XDP device bulk queue - * @xps_cpus_map: all CPUs map for XPS device - * @xps_rxqs_map: all RXQs map for XPS device + * @xps_maps: all CPUs/RXQs maps for XPS device * * @xps_maps: XXX: need comments on this one * @miniq_egress: clsact qdisc specific data for @@ -2070,8 +2076,7 @@ struct net_device { struct xdp_dev_bulk_queue __percpu *xdp_bulkq; #ifdef CONFIG_XPS - struct xps_dev_maps __rcu *xps_cpus_map; - struct xps_dev_maps __rcu *xps_rxqs_map; + struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; #endif #ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc __rcu *miniq_egress; @@ -3701,7 +3706,7 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, - u16 index, bool is_rxqs_map); + u16 index, enum xps_map_type type); /** * netif_attr_test_mask - Test a CPU or Rx queue set in a mask @@ -3796,7 +3801,7 @@ static inline int netif_set_xps_queue(struct net_device *dev, static inline int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, - u16 index, bool is_rxqs_map) + u16 index, enum xps_map_type type) { return 0; } diff --git a/net/core/dev.c b/net/core/dev.c index 3ed8cb3a4061..af57e32bb543 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2511,31 +2511,34 @@ static bool remove_xps_queue_cpu(struct net_device *dev, static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, - bool is_rxqs_map) + enum xps_map_type type) { - if (is_rxqs_map) { - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - } static_key_slow_dec_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + 
static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + + RCU_INIT_POINTER(dev->xps_maps[type], NULL); + kfree_rcu(dev_maps, rcu); } -static void clean_xps_maps(struct net_device *dev, - struct xps_dev_maps *dev_maps, u16 offset, u16 count, - bool is_rxqs_map) +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, + u16 offset, u16 count) { + struct xps_dev_maps *dev_maps; bool active = false; int i, j; + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (!dev_maps) + return; + for (j = 0; j < dev_maps->nr_ids; j++) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); - if (!is_rxqs_map) { + if (type == XPS_CPUS) { for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); @@ -2545,27 +2548,17 @@ static void clean_xps_maps(struct net_device *dev, static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { - struct xps_dev_maps *dev_maps; - if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); - if (static_key_false(&xps_rxqs_needed)) { - dev_maps = xmap_dereference(dev->xps_rxqs_map); - if (dev_maps) - clean_xps_maps(dev, dev_maps, offset, count, true); - } - - dev_maps = xmap_dereference(dev->xps_cpus_map); - if (!dev_maps) - goto out_no_maps; + if (static_key_false(&xps_rxqs_needed)) + clean_xps_maps(dev, XPS_RXQS, offset, count); - clean_xps_maps(dev, dev_maps, offset, count, false); + clean_xps_maps(dev, XPS_CPUS, offset, count); -out_no_maps: mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2617,7 +2610,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, - u16 index, bool is_rxqs_map) + u16 index, enum xps_map_type type) { struct xps_dev_maps *dev_maps, 
*new_dev_maps = NULL; const unsigned long *online_mask = NULL; @@ -2642,15 +2635,15 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } mutex_lock(&xps_map_mutex); - if (is_rxqs_map) { + + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); - dev_maps = xmap_dereference(dev->xps_rxqs_map); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); - dev_maps = xmap_dereference(dev->xps_cpus_map); nr_ids = nr_cpu_ids; } @@ -2683,7 +2676,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, tci = j * num_tc + tc; map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, j, index, is_rxqs_map); + map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; @@ -2696,7 +2689,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) + if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } @@ -2725,7 +2718,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (!is_rxqs_map) { + if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) @@ -2746,10 +2739,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } } - if (is_rxqs_map) - rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); - else - rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); + rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) @@ -2778,12 +2768,11 @@ out_no_old_maps: active = true; out_no_new_maps: - if (!is_rxqs_map) { 
+ if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); - } if (!dev_maps) goto out_no_maps; @@ -2801,7 +2790,7 @@ out_no_new_maps: /* free map if not active */ if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2833,7 +2822,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, int ret; cpus_read_lock(); - ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; @@ -3983,7 +3972,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; - dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); @@ -3994,7 +3983,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, get_cpus_map: if (queue_index < 0) { - dev_maps = rcu_dereference(sb_dev->xps_cpus_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c762c435ff76..ca1f3b63cfad 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1388,7 +1388,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_cpus_map); + dev_maps = rcu_dereference(dev->xps_maps[XPS_CPUS]); nr_ids = dev_maps ? 
dev_maps->nr_ids : nr_cpu_ids; mask = bitmap_zalloc(nr_ids, GFP_KERNEL); @@ -1492,7 +1492,7 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) } rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_rxqs_map); + dev_maps = rcu_dereference(dev->xps_maps[XPS_RXQS]); nr_ids = dev_maps ? dev_maps->nr_ids : dev->num_rx_queues; mask = bitmap_zalloc(nr_ids, GFP_KERNEL); @@ -1566,7 +1566,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, } cpus_read_lock(); - err = __netif_set_xps_queue(dev, mask, index, true); + err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS); cpus_read_unlock(); rtnl_unlock(); -- cgit v1.2.3 From cc76ce9e8dc659561ee62876da2cffc03fb58cc5 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Thu, 18 Mar 2021 20:25:33 +0100 Subject: net: dsa: Add helper to resolve bridge port from DSA port In order for a driver to be able to query a bridge for information about itself, e.g. reading out port flags, it has to use a netdev that is known to the bridge. In the simple case, that is just the netdev representing the port, e.g. swp0 or swp1 in this example: br0 / \ swp0 swp1 But in the case of an offloaded lag, this will be the bond or team interface, e.g. bond0 in this example: br0 / bond0 / \ swp0 swp1 Add a helper that hides some of this complexity from the drivers. Then, redefine dsa_port_offloads_bridge_port using the helper to avoid double accounting of the set of possible offloaded uppers. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- include/net/dsa.h | 14 ++++++++++++++ net/dsa/dsa_priv.h | 14 +------------- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index dac303edd33d..57b2c49f72f4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -493,6 +493,20 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) return dp->vlan_filtering; } +static inline +struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) +{ + if (!dp->bridge_dev) + return NULL; + + if (dp->lag_dev) + return dp->lag_dev; + else if (dp->hsr_dev) + return dp->hsr_dev; + + return dp->slave; +} + typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 9d4b0e9b1aa1..4c43c5406834 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -233,19 +233,7 @@ extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, struct net_device *dev) { - /* Switchdev offloading can be configured on: */ - - if (dev == dp->slave) - /* DSA ports directly connected to a bridge, and event - * was emitted for the ports themselves. - */ - return true; - - if (dp->lag_dev == dev) - /* DSA ports connected to a bridge via a LAG */ - return true; - - return false; + return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, -- cgit v1.2.3 From 76da35dc99afb460b9c335182ba6a3e7ff924186 Mon Sep 17 00:00:00 2001 From: "Wong, Vee Khee" Date: Wed, 17 Mar 2021 09:32:47 +0800 Subject: stmmac: intel: Add PSE and PCH PTP clock source selection Intel mGbE variant implemented in EHL and TGL can be set to select different clock frequency based on GPO bits in MAC_GPIO_STATUS register. 
We introduce a new "void (*ptp_clk_freq_config)(void *priv)" in platform data so that if a platform is required to configure the frequency of clock source, in this case Intel mGBE does, the platform-specific configuration of the PTP clock setting is done when stmmac_ptp_register() is called. Signed-off-by: Wong, Vee Khee Signed-off-by: Voon Weifeng Co-developed-by: Ong Boon Leong Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 46 +++++++++++++++++++++++ drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 7 ++++ drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 3 ++ include/linux/stmmac.h | 1 + 4 files changed, 57 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index c49646773871..763b549e3c2d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -12,8 +12,18 @@ #define INTEL_MGBE_ADHOC_ADDR 0x15 #define INTEL_MGBE_XPCS_ADDR 0x16 +/* Selection for PTP Clock Freq belongs to PSE & PCH GbE */ +#define PSE_PTP_CLK_FREQ_MASK (GMAC_GPO0 | GMAC_GPO3) +#define PSE_PTP_CLK_FREQ_19_2MHZ (GMAC_GPO0) +#define PSE_PTP_CLK_FREQ_200MHZ (GMAC_GPO0 | GMAC_GPO3) +#define PSE_PTP_CLK_FREQ_256MHZ (0) +#define PCH_PTP_CLK_FREQ_MASK (GMAC_GPO0) +#define PCH_PTP_CLK_FREQ_19_2MHZ (GMAC_GPO0) +#define PCH_PTP_CLK_FREQ_200MHZ (0) + struct intel_priv_data { int mdio_adhoc_addr; /* mdio address for serdes & etc */ + bool is_pse; }; /* This struct is used to associate PCI Function of MAC controller on a board, @@ -204,6 +214,32 @@ static void intel_serdes_powerdown(struct net_device *ndev, void *intel_data) } } +/* Program PTP Clock Frequency for different variant of + * Intel mGBE that has slightly different GPO mapping + */ +static void intel_mgbe_ptp_clk_freq_config(void *npriv) +{ + struct stmmac_priv *priv = (struct stmmac_priv *)npriv; + struct 
intel_priv_data *intel_priv; + u32 gpio_value; + + intel_priv = (struct intel_priv_data *)priv->plat->bsp_priv; + + gpio_value = readl(priv->ioaddr + GMAC_GPIO_STATUS); + + if (intel_priv->is_pse) { + /* For PSE GbE, use 200MHz */ + gpio_value &= ~PSE_PTP_CLK_FREQ_MASK; + gpio_value |= PSE_PTP_CLK_FREQ_200MHZ; + } else { + /* For PCH GbE, use 200MHz */ + gpio_value &= ~PCH_PTP_CLK_FREQ_MASK; + gpio_value |= PCH_PTP_CLK_FREQ_200MHZ; + } + + writel(gpio_value, priv->ioaddr + GMAC_GPIO_STATUS); +} + static void common_default_data(struct plat_stmmacenet_data *plat) { plat->clk_csr = 2; /* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ @@ -322,6 +358,8 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, return ret; } + plat->ptp_clk_freq_config = intel_mgbe_ptp_clk_freq_config; + /* Set default value for multicast hash bins */ plat->multicast_filter_bins = HASH_TABLE_SIZE; @@ -391,8 +429,12 @@ static struct stmmac_pci_info ehl_rgmii1g_info = { static int ehl_pse0_common_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { + struct intel_priv_data *intel_priv = plat->bsp_priv; + + intel_priv->is_pse = true; plat->bus_id = 2; plat->addr64 = 32; + return ehl_common_data(pdev, plat); } @@ -423,8 +465,12 @@ static struct stmmac_pci_info ehl_pse0_sgmii1g_info = { static int ehl_pse1_common_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { + struct intel_priv_data *intel_priv = plat->bsp_priv; + + intel_priv->is_pse = true; plat->bus_id = 3; plat->addr64 = 32; + return ehl_common_data(pdev, plat); } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 82df91c130f7..ef8502d2b6e6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -42,6 +42,7 @@ #define GMAC_HW_FEATURE3 0x00000128 #define GMAC_MDIO_ADDR 0x00000200 #define GMAC_MDIO_DATA 0x00000204 +#define GMAC_GPIO_STATUS 0x0000020C #define GMAC_ARP_ADDR 0x00000210 #define 
GMAC_ADDR_HIGH(reg) (0x300 + reg * 8) #define GMAC_ADDR_LOW(reg) (0x304 + reg * 8) @@ -278,6 +279,12 @@ enum power_event { #define GMAC_HW_FEAT_DVLAN BIT(5) #define GMAC_HW_FEAT_NRVF GENMASK(2, 0) +/* GMAC GPIO Status reg */ +#define GMAC_GPO0 BIT(16) +#define GMAC_GPO1 BIT(17) +#define GMAC_GPO2 BIT(18) +#define GMAC_GPO3 BIT(19) + /* MAC HW ADDR regs */ #define GMAC_HI_DCS GENMASK(18, 16) #define GMAC_HI_DCS_SHIFT 16 diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index 0989e2bb6ee3..8b10fd10446f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -192,6 +192,9 @@ void stmmac_ptp_register(struct stmmac_priv *priv) { int i; + if (priv->plat->ptp_clk_freq_config) + priv->plat->ptp_clk_freq_config(priv); + for (i = 0; i < priv->dma_cap.pps_out_num; i++) { if (i >= STMMAC_PPS_MAX) break; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 51004ebd0540..10abc80b601e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -181,6 +181,7 @@ struct plat_stmmacenet_data { void (*fix_mac_speed)(void *priv, unsigned int speed); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); + void (*ptp_clk_freq_config)(void *priv); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); struct mac_device_info *(*setup)(void *priv); -- cgit v1.2.3 From df291e54ccca0ef357f07a7b89263f7918d6ed7a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 19 Mar 2021 01:36:36 +0200 Subject: net: ocelot: support multiple bridges The ocelot switches are a bit odd in that they do not have an STP state to put the ports into. Instead, the forwarding configuration is delayed from the typical port_bridge_join into stp_state_set, when the port enters the BR_STATE_FORWARDING state. 
I can only guess that the implementation of this quirk is the reason that led to the simplification of the driver such that only one bridge could be offloaded at a time. We can simplify the data structures somewhat, and introduce a per-port bridge device pointer and STP state, similar to how the LAG offload works now (there we have a per-port bonding device pointer and TX enabled state). This allows offloading multiple bridges with relative ease, while still keeping in place the quirk to delay the programming of the PGIDs. We actually need this change now because we need to remove the bogus restriction from ocelot_bridge_stp_state_set that ocelot->bridge_mask needs to contain BIT(port), otherwise that function is a no-op. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 72 +++++++++++++++++++------------------- include/soc/mscc/ocelot.h | 7 ++-- 2 files changed, 39 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 9f0c9bdd9f5d..ce57929ba3d1 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -766,7 +766,7 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb) /* Everything we see on an interface that is in the HW bridge * has already been forwarded. 
*/ - if (ocelot->bridge_mask & BIT(src_port)) + if (ocelot->ports[src_port]->bridge) skb->offload_fwd_mark = 1; skb->protocol = eth_type_trans(skb, dev); @@ -1183,6 +1183,26 @@ static u32 ocelot_get_bond_mask(struct ocelot *ocelot, struct net_device *bond, return mask; } +static u32 ocelot_get_bridge_fwd_mask(struct ocelot *ocelot, + struct net_device *bridge) +{ + u32 mask = 0; + int port; + + for (port = 0; port < ocelot->num_phys_ports; port++) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; + + if (!ocelot_port) + continue; + + if (ocelot_port->stp_state == BR_STATE_FORWARDING && + ocelot_port->bridge == bridge) + mask |= BIT(port); + } + + return mask; +} + static u32 ocelot_get_dsa_8021q_cpu_mask(struct ocelot *ocelot) { u32 mask = 0; @@ -1232,10 +1252,12 @@ void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot) */ mask = GENMASK(ocelot->num_phys_ports - 1, 0); mask &= ~cpu_fwd_mask; - } else if (ocelot->bridge_fwd_mask & BIT(port)) { + } else if (ocelot_port->bridge) { + struct net_device *bridge = ocelot_port->bridge; struct net_device *bond = ocelot_port->bond; - mask = ocelot->bridge_fwd_mask & ~BIT(port); + mask = ocelot_get_bridge_fwd_mask(ocelot, bridge); + mask &= ~BIT(port); if (bond) { mask &= ~ocelot_get_bond_mask(ocelot, bond, false); @@ -1256,29 +1278,16 @@ EXPORT_SYMBOL(ocelot_apply_bridge_fwd_mask); void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state) { struct ocelot_port *ocelot_port = ocelot->ports[port]; - u32 port_cfg; - - if (!(BIT(port) & ocelot->bridge_mask)) - return; + u32 learn_ena = 0; - port_cfg = ocelot_read_gix(ocelot, ANA_PORT_PORT_CFG, port); + ocelot_port->stp_state = state; - switch (state) { - case BR_STATE_FORWARDING: - ocelot->bridge_fwd_mask |= BIT(port); - fallthrough; - case BR_STATE_LEARNING: - if (ocelot_port->learn_ena) - port_cfg |= ANA_PORT_PORT_CFG_LEARN_ENA; - break; - - default: - port_cfg &= ~ANA_PORT_PORT_CFG_LEARN_ENA; - ocelot->bridge_fwd_mask &= ~BIT(port); - break; - } + 
if ((state == BR_STATE_LEARNING || state == BR_STATE_FORWARDING) && + ocelot_port->learn_ena) + learn_ena = ANA_PORT_PORT_CFG_LEARN_ENA; - ocelot_write_gix(ocelot, port_cfg, ANA_PORT_PORT_CFG, port); + ocelot_rmw_gix(ocelot, learn_ena, ANA_PORT_PORT_CFG_LEARN_ENA, + ANA_PORT_PORT_CFG, port); ocelot_apply_bridge_fwd_mask(ocelot); } @@ -1508,16 +1517,9 @@ EXPORT_SYMBOL(ocelot_port_mdb_del); int ocelot_port_bridge_join(struct ocelot *ocelot, int port, struct net_device *bridge) { - if (!ocelot->bridge_mask) { - ocelot->hw_bridge_dev = bridge; - } else { - if (ocelot->hw_bridge_dev != bridge) - /* This is adding the port to a second bridge, this is - * unsupported */ - return -ENODEV; - } + struct ocelot_port *ocelot_port = ocelot->ports[port]; - ocelot->bridge_mask |= BIT(port); + ocelot_port->bridge = bridge; return 0; } @@ -1526,13 +1528,11 @@ EXPORT_SYMBOL(ocelot_port_bridge_join); int ocelot_port_bridge_leave(struct ocelot *ocelot, int port, struct net_device *bridge) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; struct ocelot_vlan pvid = {0}, native_vlan = {0}; int ret; - ocelot->bridge_mask &= ~BIT(port); - - if (!ocelot->bridge_mask) - ocelot->hw_bridge_dev = NULL; + ocelot_port->bridge = NULL; ret = ocelot_port_vlan_filtering(ocelot, port, false); if (ret) diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 0a0751bf97dd..ce7e5c1bd90d 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -615,6 +615,9 @@ struct ocelot_port { bool lag_tx_active; u16 mrp_ring_id; + + struct net_device *bridge; + u8 stp_state; }; struct ocelot { @@ -634,10 +637,6 @@ struct ocelot { int num_frame_refs; int num_mact_rows; - struct net_device *hw_bridge_dev; - u16 bridge_mask; - u16 bridge_fwd_mask; - struct ocelot_port **ports; u8 base_mac[ETH_ALEN]; -- cgit v1.2.3 From e75ec151c1088c1aa7a49ff16a2adcaddb24a861 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 18 Mar 2021 18:42:23 +0000 Subject: gro: make net/gro.h 
self-contained MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If some source file includes <net/gro.h>, but doesn't include <linux/indirect_call_wrapper.h>: In file included from net/8021q/vlan_core.c:7: ./include/net/gro.h:6:1: warning: data definition has no type or storage class 6 | INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, | ^~~~~~~~~~~~~~~~~~~~~~~~~ ./include/net/gro.h:6:1: error: type defaults to ‘int’ in declaration of ‘INDIRECT_CALLABLE_DECLARE’ [-Werror=implicit-int] [...] Include <linux/indirect_call_wrapper.h> directly. It's small and won't pull lots of dependencies. Also add some incomplete struct declarations to be fully stacked. Fixes: 04f00ab2275f ("net/core: move gro function declarations to separate header ") Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- include/net/gro.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index 8a6eb5303cc4..27c38b36df16 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -3,6 +3,11 @@ #ifndef _NET_IPV6_GRO_H #define _NET_IPV6_GRO_H +#include <linux/indirect_call_wrapper.h> + +struct list_head; +struct sk_buff; + INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); -- cgit v1.2.3 From 86af2c82c28499b4b509d6b15637a96bd829201b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 18 Mar 2021 18:42:30 +0000 Subject: gro: add combined call_gro_receive() + INDIRECT_CALL_INET() helper call_gro_receive() is used to limit GRO recursion, but it works only with callback pointers. There's a combined version of call_gro_receive() + INDIRECT_CALL_2() in <net/inet_common.h>, but it doesn't check for IPv6 modularity. Add a similar new helper to cover both of these. It can and will be used to avoid retpoline overhead when IP header lies behind another offloaded proto. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- include/net/gro.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index 27c38b36df16..01edaf3fdda0 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -14,4 +14,12 @@ INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int)); + +#define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \ +({ \ + unlikely(gro_recursion_inc_test(skb)) ? \ + NAPI_GRO_CB(skb)->flush |= 1, NULL : \ + INDIRECT_CALL_INET(cb, f2, f1, head, skb); \ +}) + #endif /* _NET_IPV6_GRO_H */ -- cgit v1.2.3 From 919067cc845f323a80b6fe987b64238bd82d309e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Mar 2021 10:39:33 -0700 Subject: net: add CONFIG_PCPU_DEV_REFCNT I was working on a syzbot issue, claiming one device could not be dismantled because its refcount was -1 unregister_netdevice: waiting for sit0 to become free. Usage count = -1 It would be nice if syzbot could trigger a warning at the time this reference count became negative. This patch adds CONFIG_PCPU_DEV_REFCNT options which defaults to per cpu variables (as before this patch) on SMP builds. v2: free_dev label in alloc_netdev_mqs() is moved to avoid a compiler warning (-Wunused-label), as reported by kernel test robot Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 13 +++++++++++++ net/Kconfig | 8 ++++++++ net/core/dev.c | 10 ++++++++++ 3 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4940509999be..8f003955c485 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2092,7 +2092,12 @@ struct net_device { u32 proto_down_reason; struct list_head todo_list; + +#ifdef CONFIG_PCPU_DEV_REFCNT int __percpu *pcpu_refcnt; +#else + refcount_t dev_refcnt; +#endif struct list_head link_watch_list; @@ -4044,7 +4049,11 @@ void netdev_run_todo(void); */ static inline void dev_put(struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_dec(*dev->pcpu_refcnt); +#else + refcount_dec(&dev->dev_refcnt); +#endif } /** @@ -4055,7 +4064,11 @@ static inline void dev_put(struct net_device *dev) */ static inline void dev_hold(struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT this_cpu_inc(*dev->pcpu_refcnt); +#else + refcount_inc(&dev->dev_refcnt); +#endif } /* Carrier loss detection, dial on demand. The functions netif_carrier_on diff --git a/net/Kconfig b/net/Kconfig index 0ead7ec0d2bd..9c456acc379e 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -245,6 +245,14 @@ source "net/l3mdev/Kconfig" source "net/qrtr/Kconfig" source "net/ncsi/Kconfig" +config PCPU_DEV_REFCNT + bool "Use percpu variables to maintain network device refcount" + depends on SMP + default y + help + network device refcount are using per cpu variables if this option is set. + This can be forced to N to detect underflows (with a performance drop). 
+ config RPS bool depends on SMP && SYSFS diff --git a/net/core/dev.c b/net/core/dev.c index 4961fc2e9b19..be941ed754ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10312,11 +10312,15 @@ EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; +#else + return refcount_read(&dev->dev_refcnt); +#endif } EXPORT_SYMBOL(netdev_refcnt_read); @@ -10674,9 +10678,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; +#ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; +#endif if (dev_addr_init(dev)) goto free_pcpu; @@ -10740,8 +10746,10 @@ free_all: return NULL; free_pcpu: +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: +#endif netdev_freemem(dev); return NULL; } @@ -10783,8 +10791,10 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; +#endif free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; -- cgit v1.2.3 From 5c4c8c9544099bb9043a10a5318130a943e32fc3 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Mon, 22 Mar 2021 14:03:11 +0800 Subject: Bluetooth: verify AMP hci_chan before amp_destroy hci_chan can be created in 2 places: hci_loglink_complete_evt() if it is an AMP hci_chan, or l2cap_conn_add() otherwise. In theory, Only AMP hci_chan should be removed by a call to hci_disconn_loglink_complete_evt(). However, the controller might mess up, call that function, and destroy an hci_chan which is not initiated by hci_loglink_complete_evt(). This patch adds a verification that the destroyed hci_chan must have been init'd by hci_loglink_complete_evt(). 
Example crash call trace: Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xe3/0x144 lib/dump_stack.c:118 print_address_description+0x67/0x22a mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report mm/kasan/report.c:412 [inline] kasan_report+0x251/0x28f mm/kasan/report.c:396 hci_send_acl+0x3b/0x56e net/bluetooth/hci_core.c:4072 l2cap_send_cmd+0x5af/0x5c2 net/bluetooth/l2cap_core.c:877 l2cap_send_move_chan_cfm_icid+0x8e/0xb1 net/bluetooth/l2cap_core.c:4661 l2cap_move_fail net/bluetooth/l2cap_core.c:5146 [inline] l2cap_move_channel_rsp net/bluetooth/l2cap_core.c:5185 [inline] l2cap_bredr_sig_cmd net/bluetooth/l2cap_core.c:5464 [inline] l2cap_sig_channel net/bluetooth/l2cap_core.c:5799 [inline] l2cap_recv_frame+0x1d12/0x51aa net/bluetooth/l2cap_core.c:7023 l2cap_recv_acldata+0x2ea/0x693 net/bluetooth/l2cap_core.c:7596 hci_acldata_packet net/bluetooth/hci_core.c:4606 [inline] hci_rx_work+0x2bd/0x45e net/bluetooth/hci_core.c:4796 process_one_work+0x6f8/0xb50 kernel/workqueue.c:2175 worker_thread+0x4fc/0x670 kernel/workqueue.c:2321 kthread+0x2f0/0x304 kernel/kthread.c:253 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415 Allocated by task 38: set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0x8d/0x9a mm/kasan/kasan.c:553 kmem_cache_alloc_trace+0x102/0x129 mm/slub.c:2787 kmalloc include/linux/slab.h:515 [inline] kzalloc include/linux/slab.h:709 [inline] hci_chan_create+0x86/0x26d net/bluetooth/hci_conn.c:1674 l2cap_conn_add.part.0+0x1c/0x814 net/bluetooth/l2cap_core.c:7062 l2cap_conn_add net/bluetooth/l2cap_core.c:7059 [inline] l2cap_connect_cfm+0x134/0x852 net/bluetooth/l2cap_core.c:7381 hci_connect_cfm+0x9d/0x122 include/net/bluetooth/hci_core.h:1404 hci_remote_ext_features_evt net/bluetooth/hci_event.c:4161 [inline] hci_event_packet+0x463f/0x72fa net/bluetooth/hci_event.c:5981 hci_rx_work+0x197/0x45e net/bluetooth/hci_core.c:4791 process_one_work+0x6f8/0xb50 kernel/workqueue.c:2175 worker_thread+0x4fc/0x670 
kernel/workqueue.c:2321 kthread+0x2f0/0x304 kernel/kthread.c:253 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415 Freed by task 1732: set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free mm/kasan/kasan.c:521 [inline] __kasan_slab_free+0x106/0x128 mm/kasan/kasan.c:493 slab_free_hook mm/slub.c:1409 [inline] slab_free_freelist_hook+0xaa/0xf6 mm/slub.c:1436 slab_free mm/slub.c:3009 [inline] kfree+0x182/0x21e mm/slub.c:3972 hci_disconn_loglink_complete_evt net/bluetooth/hci_event.c:4891 [inline] hci_event_packet+0x6a1c/0x72fa net/bluetooth/hci_event.c:6050 hci_rx_work+0x197/0x45e net/bluetooth/hci_core.c:4791 process_one_work+0x6f8/0xb50 kernel/workqueue.c:2175 worker_thread+0x4fc/0x670 kernel/workqueue.c:2321 kthread+0x2f0/0x304 kernel/kthread.c:253 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415 The buggy address belongs to the object at ffff8881d7af9180 which belongs to the cache kmalloc-128 of size 128 The buggy address is located 24 bytes inside of 128-byte region [ffff8881d7af9180, ffff8881d7af9200) The buggy address belongs to the page: page:ffffea00075ebe40 count:1 mapcount:0 mapping:ffff8881da403200 index:0x0 flags: 0x8000000000000200(slab) raw: 8000000000000200 dead000000000100 dead000000000200 ffff8881da403200 raw: 0000000000000000 0000000080150015 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881d7af9080: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ffff8881d7af9100: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff8881d7af9180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8881d7af9200: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8881d7af9280: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc Signed-off-by: Archie Pusaka Reported-by: syzbot+98228e7407314d2d4ba2@syzkaller.appspotmail.com Reviewed-by: Alain Michaud Reviewed-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + 
net/bluetooth/hci_event.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ebdd4afe30d2..ca4ac6603b9a 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -704,6 +704,7 @@ struct hci_chan { struct sk_buff_head data_q; unsigned int sent; __u8 state; + bool amp; }; struct hci_conn_params { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index cf2f4a0abdbd..341c8ce93648 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5032,6 +5032,7 @@ static void hci_loglink_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) return; hchan->handle = le16_to_cpu(ev->handle); + hchan->amp = true; BT_DBG("hcon %p mgr %p hchan %p", hcon, hcon->amp_mgr, hchan); @@ -5064,7 +5065,7 @@ static void hci_disconn_loglink_complete_evt(struct hci_dev *hdev, hci_dev_lock(hdev); hchan = hci_chan_lookup_handle(hdev, le16_to_cpu(ev->handle)); - if (!hchan) + if (!hchan || !hchan->amp) goto unlock; amp_destroy_logical_link(hchan, ev->reason); -- cgit v1.2.3 From 1f7ea1cd6a3748427512ccc9582e18cd9efea966 Mon Sep 17 00:00:00 2001 From: Qi Zhang Date: Tue, 9 Mar 2021 11:08:04 +0800 Subject: ice: Enable FDIR Configure for AVF The virtual channel is going to be extended to support FDIR and RSS configure from AVF. New data structures and OP codes will be added, the patch enable the FDIR part. To support above advanced AVF feature, we need to figure out what kind of data structure should be passed from VF to PF to describe an FDIR rule or RSS config rule. The common part of the requirement is we need a data structure to represent the input set selection of a rule's hash key. An input set selection is a group of fields be selected from one or more network protocol layers that could be identified as a specific flow. 
For example, select dst IP address from an IPv4 header combined with dst port from the TCP header as the input set for an IPv4/TCP flow. The patch adds a new data structure virtchnl_proto_hdrs to abstract a group of network protocol headers, which is composed of layers of network protocol headers (virtchnl_proto_hdr). A protocol header contains a 32-bit mask (field_selector) to describe which fields are selected as input sets, as well as a header type (enum virtchnl_proto_hdr_type). Each bit is mapped to a field in enum virtchnl_proto_hdr_field guided by its header type. +------------+-----------+------------------------------+ | | Proto Hdr | Header Type A | | | +------------------------------+ | | | BIT 31 | ... | BIT 1 | BIT 0 | | |-----------+------------------------------+ |Proto Hdrs | Proto Hdr | Header Type B | | | +------------------------------+ | | | BIT 31 | ... | BIT 1 | BIT 0 | | |-----------+------------------------------+ | | Proto Hdr | Header Type C | | | +------------------------------+ | | | BIT 31 | ... | BIT 1 | BIT 0 | | |-----------+------------------------------+ | | .... | +-------------------------------------------------------+ All fields in enum virtchnl_proto_hdr_field are grouped by header type, and the value of the first field of a header type is always aligned to a multiple of 32. enum proto_hdr_type { header_type_A = 0, header_type_B = 1, .... } enum proto_hdr_field { /* header type A */ header_A_field_0 = 0, header_A_field_1 = 1, header_A_field_2 = 2, header_A_field_3 = 3, /* header type B */ header_B_field_0 = 32, // = header_type_B << 5 header_B_field_1 = 33, header_B_field_2 = 34, header_B_field_3 = 35, .... }; So we have: proto_hdr_type = proto_hdr_field / 32 bit offset = proto_hdr_field % 32 To simplify the protocol header's operations, a couple of helper macros are added. For example, to select src IP and dst port as input set for an IPv4/UDP flow.
we have: struct virtchnl_proto_hdr hdr[2]; VIRTCHNL_SET_PROTO_HDR_TYPE(&hdr[0], IPV4) VIRTCHNL_ADD_PROTO_HDR_FIELD(&hdr[0], IPV4, SRC) VIRTCHNL_SET_PROTO_HDR_TYPE(&hdr[1], UDP) VIRTCHNL_ADD_PROTO_HDR_FIELD(&hdr[1], UDP, DST) The byte array is used to store the protocol header of a training packet. The byte array must be in network byte order. The patch adds virtual channel support for the iAVF FDIR add/validate/delete filter commands. iAVF FDIR is Flow Director for Intel Adaptive Virtual Function, which can direct Ethernet packets to the queues of the Network Interface Card. The add/delete command adds or deletes one rule per virtual channel message, while the validate command only verifies whether the rule is valid, without any other operations. To add or delete one rule, the driver needs to configure TCAM and Profile, build training packets which contain the input set value, and send the training packets through the FDIR Tx queue. In addition, the driver needs to manage the software context to avoid adding duplicated rules, deleting non-existent rules, input set conflicts and other invalid cases. NOTE: Supported patterns/actions and their parse functions are not included in this patch; they will be added in a separate one.
Signed-off-by: Jeff Guo Signed-off-by: Yahui Cao Signed-off-by: Simei Su Signed-off-by: Beilei Xing Signed-off-by: Qi Zhang Tested-by: Chen Bo Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/Makefile | 2 +- drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c | 4 + drivers/net/ethernet/intel/ice/ice_fdir.c | 6 +- drivers/net/ethernet/intel/ice/ice_fdir.h | 5 + drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h | 2 + drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c | 1034 ++++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h | 24 + drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 17 +- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h | 6 + include/linux/avf/virtchnl.h | 278 ++++++ 10 files changed, 1372 insertions(+), 6 deletions(-) create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index 73da4f71f530..f391691e2c7e 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -26,7 +26,7 @@ ice-y := ice_main.o \ ice_fw_update.o \ ice_lag.o \ ice_ethtool.o -ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o +ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o ice_virtchnl_fdir.o ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o ice-$(CONFIG_RFS_ACCEL) += ice_arfs.o ice-$(CONFIG_XDP_SOCKETS) += ice_xsk.o diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c index 192729546bbf..440964defa4a 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c @@ -1679,6 +1679,10 @@ int ice_add_fdir_ethtool(struct ice_vsi *vsi, struct ethtool_rxnfc *cmd) input->flex_offset = userdata.flex_offset; } + input->cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; + input->fdid_prio = 
ICE_FXD_FLTR_QW1_FDID_PRI_THREE; + input->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL; + /* input struct is added to the HW filter list */ ice_fdir_update_list_entry(pf, input, fsp->location); diff --git a/drivers/net/ethernet/intel/ice/ice_fdir.c b/drivers/net/ethernet/intel/ice/ice_fdir.c index 0c2066c0ab1f..8f3e61c6bfd6 100644 --- a/drivers/net/ethernet/intel/ice/ice_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_fdir.c @@ -378,7 +378,7 @@ ice_fdir_get_prgm_desc(struct ice_hw *hw, struct ice_fdir_fltr *input, fdir_fltr_ctx.drop = ICE_FXD_FLTR_QW0_DROP_NO; fdir_fltr_ctx.qindex = input->q_index; } - fdir_fltr_ctx.cnt_ena = ICE_FXD_FLTR_QW0_STAT_ENA_PKTS; + fdir_fltr_ctx.cnt_ena = input->cnt_ena; fdir_fltr_ctx.cnt_index = input->cnt_index; fdir_fltr_ctx.fd_vsi = ice_get_hw_vsi_num(hw, input->dest_vsi); fdir_fltr_ctx.evict_ena = ICE_FXD_FLTR_QW0_EVICT_ENA_FALSE; @@ -387,8 +387,8 @@ ice_fdir_get_prgm_desc(struct ice_hw *hw, struct ice_fdir_fltr *input, ICE_FXD_FLTR_QW1_PCMD_REMOVE; fdir_fltr_ctx.swap = ICE_FXD_FLTR_QW1_SWAP_NOT_SET; fdir_fltr_ctx.comp_q = ICE_FXD_FLTR_QW0_COMP_Q_ZERO; - fdir_fltr_ctx.comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL; - fdir_fltr_ctx.fdid_prio = 3; + fdir_fltr_ctx.comp_report = input->comp_report; + fdir_fltr_ctx.fdid_prio = input->fdid_prio; fdir_fltr_ctx.desc_prof = 1; fdir_fltr_ctx.desc_prof_prio = 3; ice_set_fd_desc_val(&fdir_fltr_ctx, fdesc); diff --git a/drivers/net/ethernet/intel/ice/ice_fdir.h b/drivers/net/ethernet/intel/ice/ice_fdir.h index 84b40298a513..93f3f0d9d37b 100644 --- a/drivers/net/ethernet/intel/ice/ice_fdir.h +++ b/drivers/net/ethernet/intel/ice/ice_fdir.h @@ -31,6 +31,8 @@ #define ICE_IPV6_HLIM_OFFSET 21 #define ICE_IPV6_PROTO_OFFSET 20 +#define ICE_FDIR_MAX_FLTRS 16384 + /* IP v4 has 2 flag bits that enable fragment processing: DF and MF. DF * requests that the packet not be fragmented. MF indicates that a packet has * been fragmented. 
@@ -138,9 +140,12 @@ struct ice_fdir_fltr { u16 q_index; u16 dest_vsi; u8 dest_ctl; + u8 cnt_ena; u8 fltr_status; u16 cnt_index; u32 fltr_id; + u8 fdid_prio; + u8 comp_report; }; /* Dummy packet filter definition structure */ diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index 4ec24c3e813f..b30c22358c0a 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -55,6 +55,7 @@ struct ice_fltr_desc { #define ICE_FXD_FLTR_QW0_COMP_REPORT_M \ (0x3ULL << ICE_FXD_FLTR_QW0_COMP_REPORT_S) #define ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL 0x1ULL +#define ICE_FXD_FLTR_QW0_COMP_REPORT_SW 0x2ULL #define ICE_FXD_FLTR_QW0_FD_SPACE_S 14 #define ICE_FXD_FLTR_QW0_FD_SPACE_M (0x3ULL << ICE_FXD_FLTR_QW0_FD_SPACE_S) @@ -128,6 +129,7 @@ struct ice_fltr_desc { #define ICE_FXD_FLTR_QW1_FDID_PRI_S 25 #define ICE_FXD_FLTR_QW1_FDID_PRI_M (0x7ULL << ICE_FXD_FLTR_QW1_FDID_PRI_S) #define ICE_FXD_FLTR_QW1_FDID_PRI_ONE 0x1ULL +#define ICE_FXD_FLTR_QW1_FDID_PRI_THREE 0x3ULL #define ICE_FXD_FLTR_QW1_FDID_MDID_S 28 #define ICE_FXD_FLTR_QW1_FDID_MDID_M (0xFULL << ICE_FXD_FLTR_QW1_FDID_MDID_S) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c new file mode 100644 index 000000000000..6e7e8531d6ec --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c @@ -0,0 +1,1034 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021, Intel Corporation. 
*/ + +#include "ice.h" +#include "ice_base.h" +#include "ice_lib.h" +#include "ice_flow.h" + +#define to_fltr_conf_from_desc(p) \ + container_of(p, struct virtchnl_fdir_fltr_conf, input) + +#define ICE_FLOW_PROF_TYPE_S 0 +#define ICE_FLOW_PROF_TYPE_M (0xFFFFFFFFULL << ICE_FLOW_PROF_TYPE_S) +#define ICE_FLOW_PROF_VSI_S 32 +#define ICE_FLOW_PROF_VSI_M (0xFFFFFFFFULL << ICE_FLOW_PROF_VSI_S) + +/* Flow profile ID format: + * [0:31] - flow type, flow + tun_offs + * [32:63] - VSI index + */ +#define ICE_FLOW_PROF_FD(vsi, flow, tun_offs) \ + ((u64)(((((flow) + (tun_offs)) & ICE_FLOW_PROF_TYPE_M)) | \ + (((u64)(vsi) << ICE_FLOW_PROF_VSI_S) & ICE_FLOW_PROF_VSI_M))) + +struct virtchnl_fdir_fltr_conf { + struct ice_fdir_fltr input; +}; + +struct virtchnl_fdir_inset_map { + enum virtchnl_proto_hdr_field field; + enum ice_flow_field fld; +}; + +static const struct virtchnl_fdir_inset_map fdir_inset_map[] = { + {VIRTCHNL_PROTO_HDR_IPV4_SRC, ICE_FLOW_FIELD_IDX_IPV4_SA}, + {VIRTCHNL_PROTO_HDR_IPV4_DST, ICE_FLOW_FIELD_IDX_IPV4_DA}, + {VIRTCHNL_PROTO_HDR_IPV4_DSCP, ICE_FLOW_FIELD_IDX_IPV4_DSCP}, + {VIRTCHNL_PROTO_HDR_IPV4_TTL, ICE_FLOW_FIELD_IDX_IPV4_TTL}, + {VIRTCHNL_PROTO_HDR_IPV4_PROT, ICE_FLOW_FIELD_IDX_IPV4_PROT}, + {VIRTCHNL_PROTO_HDR_IPV6_SRC, ICE_FLOW_FIELD_IDX_IPV6_SA}, + {VIRTCHNL_PROTO_HDR_IPV6_DST, ICE_FLOW_FIELD_IDX_IPV6_DA}, + {VIRTCHNL_PROTO_HDR_IPV6_TC, ICE_FLOW_FIELD_IDX_IPV6_DSCP}, + {VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT, ICE_FLOW_FIELD_IDX_IPV6_TTL}, + {VIRTCHNL_PROTO_HDR_IPV6_PROT, ICE_FLOW_FIELD_IDX_IPV6_PROT}, + {VIRTCHNL_PROTO_HDR_UDP_SRC_PORT, ICE_FLOW_FIELD_IDX_UDP_SRC_PORT}, + {VIRTCHNL_PROTO_HDR_UDP_DST_PORT, ICE_FLOW_FIELD_IDX_UDP_DST_PORT}, + {VIRTCHNL_PROTO_HDR_TCP_SRC_PORT, ICE_FLOW_FIELD_IDX_TCP_SRC_PORT}, + {VIRTCHNL_PROTO_HDR_TCP_DST_PORT, ICE_FLOW_FIELD_IDX_TCP_DST_PORT}, + {VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT, ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT}, + {VIRTCHNL_PROTO_HDR_SCTP_DST_PORT, ICE_FLOW_FIELD_IDX_SCTP_DST_PORT}, +}; + +/** + * ice_vc_fdir_param_check 
+ * @vf: pointer to the VF structure + * @vsi_id: VF relative VSI ID + * + * Check for the valid VSI ID, PF's state and VF's state + * + * Return: 0 on success, and -EINVAL on error. + */ +static int +ice_vc_fdir_param_check(struct ice_vf *vf, u16 vsi_id) +{ + struct ice_pf *pf = vf->pf; + + if (!test_bit(ICE_FLAG_FD_ENA, pf->flags)) + return -EINVAL; + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) + return -EINVAL; + + if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_FDIR_PF)) + return -EINVAL; + + if (vsi_id != vf->lan_vsi_num) + return -EINVAL; + + if (!ice_vc_isvalid_vsi_id(vf, vsi_id)) + return -EINVAL; + + if (!pf->vsi[vf->lan_vsi_idx]) + return -EINVAL; + + return 0; +} + +/** + * ice_vf_start_ctrl_vsi + * @vf: pointer to the VF structure + * + * Allocate ctrl_vsi for the first time and open the ctrl_vsi port for VF + * + * Return: 0 on success, and other on error. + */ +static int ice_vf_start_ctrl_vsi(struct ice_vf *vf) +{ + struct ice_pf *pf = vf->pf; + struct ice_vsi *ctrl_vsi; + struct device *dev; + int err; + + dev = ice_pf_to_dev(pf); + if (vf->ctrl_vsi_idx != ICE_NO_VSI) + return -EEXIST; + + ctrl_vsi = ice_vf_ctrl_vsi_setup(vf); + if (!ctrl_vsi) { + dev_dbg(dev, "Could not setup control VSI for VF %d\n", + vf->vf_id); + return -ENOMEM; + } + + err = ice_vsi_open_ctrl(ctrl_vsi); + if (err) { + dev_dbg(dev, "Could not open control VSI for VF %d\n", + vf->vf_id); + goto err_vsi_open; + } + + return 0; + +err_vsi_open: + ice_vsi_release(ctrl_vsi); + if (vf->ctrl_vsi_idx != ICE_NO_VSI) { + pf->vsi[vf->ctrl_vsi_idx] = NULL; + vf->ctrl_vsi_idx = ICE_NO_VSI; + } + return err; +} + +/** + * ice_vc_fdir_alloc_prof - allocate profile for this filter flow type + * @vf: pointer to the VF structure + * @flow: filter flow type + * + * Return: 0 on success, and other on error. 
+ */ +static int +ice_vc_fdir_alloc_prof(struct ice_vf *vf, enum ice_fltr_ptype flow) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + + if (!fdir->fdir_prof) { + fdir->fdir_prof = devm_kcalloc(ice_pf_to_dev(vf->pf), + ICE_FLTR_PTYPE_MAX, + sizeof(*fdir->fdir_prof), + GFP_KERNEL); + if (!fdir->fdir_prof) + return -ENOMEM; + } + + if (!fdir->fdir_prof[flow]) { + fdir->fdir_prof[flow] = devm_kzalloc(ice_pf_to_dev(vf->pf), + sizeof(**fdir->fdir_prof), + GFP_KERNEL); + if (!fdir->fdir_prof[flow]) + return -ENOMEM; + } + + return 0; +} + +/** + * ice_vc_fdir_free_prof - free profile for this filter flow type + * @vf: pointer to the VF structure + * @flow: filter flow type + */ +static void +ice_vc_fdir_free_prof(struct ice_vf *vf, enum ice_fltr_ptype flow) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + + if (!fdir->fdir_prof) + return; + + if (!fdir->fdir_prof[flow]) + return; + + devm_kfree(ice_pf_to_dev(vf->pf), fdir->fdir_prof[flow]); + fdir->fdir_prof[flow] = NULL; +} + +/** + * ice_vc_fdir_free_prof_all - free all the profile for this VF + * @vf: pointer to the VF structure + */ +static void ice_vc_fdir_free_prof_all(struct ice_vf *vf) +{ + struct ice_vf_fdir *fdir = &vf->fdir; + enum ice_fltr_ptype flow; + + if (!fdir->fdir_prof) + return; + + for (flow = ICE_FLTR_PTYPE_NONF_NONE; flow < ICE_FLTR_PTYPE_MAX; flow++) + ice_vc_fdir_free_prof(vf, flow); + + devm_kfree(ice_pf_to_dev(vf->pf), fdir->fdir_prof); + fdir->fdir_prof = NULL; +} + +/** + * ice_vc_fdir_parse_flow_fld + * @proto_hdr: virtual channel protocol filter header + * @conf: FDIR configuration for each filter + * @fld: field type array + * @fld_cnt: field counter + * + * Parse the virtual channel filter header and store them into field type array + * + * Return: 0 on success, and other on error. 
+ */ +static int +ice_vc_fdir_parse_flow_fld(struct virtchnl_proto_hdr *proto_hdr, + struct virtchnl_fdir_fltr_conf *conf, + enum ice_flow_field *fld, int *fld_cnt) +{ + struct virtchnl_proto_hdr hdr; + u32 i; + + memcpy(&hdr, proto_hdr, sizeof(hdr)); + + for (i = 0; (i < ARRAY_SIZE(fdir_inset_map)) && + VIRTCHNL_GET_PROTO_HDR_FIELD(&hdr); i++) + if (VIRTCHNL_TEST_PROTO_HDR(&hdr, fdir_inset_map[i].field)) { + fld[*fld_cnt] = fdir_inset_map[i].fld; + *fld_cnt += 1; + if (*fld_cnt >= ICE_FLOW_FIELD_IDX_MAX) + return -EINVAL; + VIRTCHNL_DEL_PROTO_HDR_FIELD(&hdr, + fdir_inset_map[i].field); + } + + return 0; +} + +/** + * ice_vc_fdir_set_flow_fld + * @vf: pointer to the VF structure + * @fltr: virtual channel add cmd buffer + * @conf: FDIR configuration for each filter + * @seg: array of one or more packet segments that describe the flow + * + * Parse the virtual channel add msg buffer's field vector and store them into + * flow's packet segment field + * + * Return: 0 on success, and other on error. 
+ */ +static int +ice_vc_fdir_set_flow_fld(struct ice_vf *vf, struct virtchnl_fdir_add *fltr, + struct virtchnl_fdir_fltr_conf *conf, + struct ice_flow_seg_info *seg) +{ + struct virtchnl_fdir_rule *rule = &fltr->rule_cfg; + enum ice_flow_field fld[ICE_FLOW_FIELD_IDX_MAX]; + struct device *dev = ice_pf_to_dev(vf->pf); + struct virtchnl_proto_hdrs *proto; + int fld_cnt = 0; + int i; + + proto = &rule->proto_hdrs; + for (i = 0; i < proto->count; i++) { + struct virtchnl_proto_hdr *hdr = &proto->proto_hdr[i]; + int ret; + + ret = ice_vc_fdir_parse_flow_fld(hdr, conf, fld, &fld_cnt); + if (ret) + return ret; + } + + if (fld_cnt == 0) { + dev_dbg(dev, "Empty input set for VF %d\n", vf->vf_id); + return -EINVAL; + } + + for (i = 0; i < fld_cnt; i++) + ice_flow_set_fld(seg, fld[i], + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, + ICE_FLOW_FLD_OFF_INVAL, false); + + return 0; +} + +/** + * ice_vc_fdir_set_flow_hdr - config the flow's packet segment header + * @vf: pointer to the VF structure + * @conf: FDIR configuration for each filter + * @seg: array of one or more packet segments that describe the flow + * + * Return: 0 on success, and other on error. 
+ */
+static int
+ice_vc_fdir_set_flow_hdr(struct ice_vf *vf,
+			 struct virtchnl_fdir_fltr_conf *conf,
+			 struct ice_flow_seg_info *seg)
+{
+	enum ice_fltr_ptype flow = conf->input.flow_type;
+	struct device *dev = ice_pf_to_dev(vf->pf);
+
+	/* translate the filter's flow type into the header flags set on the
+	 * segment; any flow type not listed here is rejected with -EINVAL
+	 */
+	switch (flow) {
+	case ICE_FLTR_PTYPE_NONF_IPV4_OTHER:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV4 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV4_TCP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_TCP |
+				  ICE_FLOW_SEG_HDR_IPV4 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV4_UDP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP |
+				  ICE_FLOW_SEG_HDR_IPV4 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV4_SCTP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_SCTP |
+				  ICE_FLOW_SEG_HDR_IPV4 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_OTHER:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_IPV6 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_TCP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_TCP |
+				  ICE_FLOW_SEG_HDR_IPV6 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_UDP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_UDP |
+				  ICE_FLOW_SEG_HDR_IPV6 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	case ICE_FLTR_PTYPE_NONF_IPV6_SCTP:
+		ICE_FLOW_SET_HDRS(seg, ICE_FLOW_SEG_HDR_SCTP |
+				  ICE_FLOW_SEG_HDR_IPV6 |
+				  ICE_FLOW_SEG_HDR_IPV_OTHER);
+		break;
+	default:
+		dev_dbg(dev, "Invalid flow type 0x%x for VF %d failed\n",
+			flow, vf->vf_id);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_vc_fdir_rem_prof - remove profile for this filter flow type
+ * @vf: pointer to the VF structure
+ * @flow: filter flow type
+ * @tun: 0 implies non-tunnel type filter, 1 implies tunnel type filter
+ *
+ * Silently returns when there is nothing to remove: no profile table, no
+ * profile for @flow, no VF LAN VSI, or no recorded entries for @flow/@tun.
+ */
+static void
+ice_vc_fdir_rem_prof(struct ice_vf *vf, enum ice_fltr_ptype flow, int tun)
+{
+	struct ice_vf_fdir *fdir = &vf->fdir;
+	struct ice_fd_hw_prof *vf_prof;
+	struct ice_pf *pf = vf->pf;
+	struct ice_vsi *vf_vsi;
+	struct device *dev;
+	struct ice_hw *hw;
+	u64 prof_id;
+	int i;
+
+	dev = ice_pf_to_dev(pf);
+	hw = &pf->hw;
+	if (!fdir->fdir_prof || !fdir->fdir_prof[flow])
+		return;
+
+	vf_prof = fdir->fdir_prof[flow];
+
+	vf_vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vf_vsi) {
+		dev_dbg(dev, "NULL vf %d vsi pointer\n", vf->vf_id);
+		return;
+	}
+
+	if (!fdir->prof_entry_cnt[flow][tun])
+		return;
+
+	/* same ID expression ice_vc_fdir_write_flow_prof() uses when it
+	 * adds the profile
+	 */
+	prof_id = ICE_FLOW_PROF_FD(vf_vsi->vsi_num,
+				   flow, tun ? ICE_FLTR_PTYPE_MAX : 0);
+
+	/* for each recorded entry that still has a handle: drop the
+	 * VSI/profile association, remove the flow entry, then clear the
+	 * stored handle
+	 */
+	for (i = 0; i < fdir->prof_entry_cnt[flow][tun]; i++)
+		if (vf_prof->entry_h[i][tun]) {
+			u16 vsi_num = ice_get_hw_vsi_num(hw, vf_prof->vsi_h[i]);
+
+			ice_rem_prof_id_flow(hw, ICE_BLK_FD, vsi_num, prof_id);
+			ice_flow_rem_entry(hw, ICE_BLK_FD,
+					   vf_prof->entry_h[i][tun]);
+			vf_prof->entry_h[i][tun] = 0;
+		}
+
+	/* the profile goes only after its entries; the segment that was
+	 * stored for it is freed with it
+	 */
+	ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id);
+	devm_kfree(dev, vf_prof->fdir_seg[tun]);
+	vf_prof->fdir_seg[tun] = NULL;
+
+	for (i = 0; i < vf_prof->cnt; i++)
+		vf_prof->vsi_h[i] = 0;
+
+	fdir->prof_entry_cnt[flow][tun] = 0;
+}
+
+/**
+ * ice_vc_fdir_rem_prof_all - remove profile for this VF
+ * @vf: pointer to the VF structure
+ */
+static void ice_vc_fdir_rem_prof_all(struct ice_vf *vf)
+{
+	enum ice_fltr_ptype flow;
+
+	/* every flow type, both the non-tunnel (0) and tunnel (1) variant */
+	for (flow = ICE_FLTR_PTYPE_NONF_NONE;
+	     flow < ICE_FLTR_PTYPE_MAX; flow++) {
+		ice_vc_fdir_rem_prof(vf, flow, 0);
+		ice_vc_fdir_rem_prof(vf, flow, 1);
+	}
+}
+
+/**
+ * ice_vc_fdir_write_flow_prof
+ * @vf: pointer to the VF structure
+ * @flow: filter flow type
+ * @seg: array of one or more packet segments that describe the flow
+ * @tun: 0 implies non-tunnel type filter, 1 implies tunnel type filter
+ *
+ * Write the flow's profile config and packet segment into the hardware
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_write_flow_prof(struct ice_vf *vf, enum ice_fltr_ptype flow,
+			    struct ice_flow_seg_info *seg, int tun)
+{
+	struct ice_vf_fdir *fdir = &vf->fdir;
+	struct ice_vsi *vf_vsi, *ctrl_vsi;
+	struct ice_flow_seg_info *old_seg;
+	struct ice_flow_prof *prof = NULL;
+	struct ice_fd_hw_prof *vf_prof;
+	enum ice_status status;
+	struct device *dev;
+	struct ice_pf *pf;
+	struct ice_hw *hw;
+	u64 entry1_h = 0;
+	u64 entry2_h = 0;
+	u64 prof_id;
+	int ret;
+
+	pf = vf->pf;
+	dev = ice_pf_to_dev(pf);
+	hw = &pf->hw;
+	vf_vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vf_vsi)
+		return -EINVAL;
+
+	ctrl_vsi = pf->vsi[vf->ctrl_vsi_idx];
+	if (!ctrl_vsi)
+		return -EINVAL;
+
+	/* NOTE(review): fdir_prof[flow] is dereferenced without a NULL
+	 * check - presumably the caller has already allocated it; confirm
+	 */
+	vf_prof = fdir->fdir_prof[flow];
+	old_seg = vf_prof->fdir_seg[tun];
+	if (old_seg) {
+		/* identical segment already stored: report -EEXIST and leave
+		 * the existing profile untouched (seg is not stored)
+		 */
+		if (!memcmp(old_seg, seg, sizeof(*seg))) {
+			dev_dbg(dev, "Duplicated profile for VF %d!\n",
+				vf->vf_id);
+			return -EEXIST;
+		}
+
+		/* a different segment cannot replace the stored one while
+		 * the filter count for this flow/tun is non-zero
+		 */
+		if (fdir->fdir_fltr_cnt[flow][tun]) {
+			ret = -EINVAL;
+			dev_dbg(dev, "Input set conflicts for VF %d\n",
+				vf->vf_id);
+			goto err_exit;
+		}
+
+		/* remove previously allocated profile */
+		ice_vc_fdir_rem_prof(vf, flow, tun);
+	}
+
+	prof_id = ICE_FLOW_PROF_FD(vf_vsi->vsi_num, flow,
+				   tun ? ICE_FLTR_PTYPE_MAX : 0);
+
+	status = ice_flow_add_prof(hw, ICE_BLK_FD, ICE_FLOW_RX, prof_id, seg,
+				   tun + 1, &prof);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev, "Could not add VSI flow 0x%x for VF %d\n",
+			flow, vf->vf_id);
+		goto err_exit;
+	}
+
+	/* first entry: added with the VF LAN VSI index in both VSI arguments */
+	status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, vf_vsi->idx,
+				    vf_vsi->idx, ICE_FLOW_PRIO_NORMAL,
+				    seg, &entry1_h);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev, "Could not add flow 0x%x VSI entry for VF %d\n",
+			flow, vf->vf_id);
+		goto err_prof;
+	}
+
+	/* second entry: same call, but with the control VSI index as the
+	 * second VSI argument
+	 */
+	status = ice_flow_add_entry(hw, ICE_BLK_FD, prof_id, vf_vsi->idx,
+				    ctrl_vsi->idx, ICE_FLOW_PRIO_NORMAL,
+				    seg, &entry2_h);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev,
+			"Could not add flow 0x%x Ctrl VSI entry for VF %d\n",
+			flow, vf->vf_id);
+		goto err_entry_1;
+	}
+
+	/* from here on the profile holds the seg pointer; it is released by
+	 * ice_vc_fdir_rem_prof()
+	 */
+	vf_prof->fdir_seg[tun] = seg;
+	vf_prof->cnt = 0;
+	fdir->prof_entry_cnt[flow][tun] = 0;
+
+	/* slot 0: entry and VSI handle for the VF LAN VSI */
+	vf_prof->entry_h[vf_prof->cnt][tun] = entry1_h;
+	vf_prof->vsi_h[vf_prof->cnt] = vf_vsi->idx;
+	vf_prof->cnt++;
+	fdir->prof_entry_cnt[flow][tun]++;
+
+	/* slot 1: entry and VSI handle for the control VSI */
+	vf_prof->entry_h[vf_prof->cnt][tun] = entry2_h;
+	vf_prof->vsi_h[vf_prof->cnt] = ctrl_vsi->idx;
+	vf_prof->cnt++;
+	fdir->prof_entry_cnt[flow][tun]++;
+
+	return 0;
+
+	/* unwind in reverse order of setup; seg itself is left to the caller */
+err_entry_1:
+	ice_rem_prof_id_flow(hw, ICE_BLK_FD,
+			     ice_get_hw_vsi_num(hw, vf_vsi->idx), prof_id);
+	ice_flow_rem_entry(hw, ICE_BLK_FD, entry1_h);
+err_prof:
+	ice_flow_rem_prof(hw, ICE_BLK_FD, prof_id);
+err_exit:
+	return ret;
+}
+
+/**
+ * ice_vc_fdir_config_input_set
+ * @vf: pointer to the VF structure
+ * @fltr: virtual channel add cmd buffer
+ * @conf: FDIR configuration for each filter
+ * @tun: 0 implies non-tunnel type filter, 1 implies tunnel type filter
+ *
+ * Config the input set type and value for virtual channel add msg buffer
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_config_input_set(struct ice_vf *vf, struct virtchnl_fdir_add *fltr,
+			     struct virtchnl_fdir_fltr_conf *conf, int tun)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+	struct device *dev = ice_pf_to_dev(vf->pf);
+	struct ice_flow_seg_info *seg;
+	enum ice_fltr_ptype flow;
+	int ret;
+
+	flow = input->flow_type;
+	ret = ice_vc_fdir_alloc_prof(vf, flow);
+	if (ret) {
+		dev_dbg(dev, "Alloc flow prof for VF %d failed\n", vf->vf_id);
+		return ret;
+	}
+
+	/* scratch segment: filled in below, then either handed over to the
+	 * profile by ice_vc_fdir_write_flow_prof() or freed here
+	 */
+	seg = devm_kzalloc(dev, sizeof(*seg), GFP_KERNEL);
+	if (!seg)
+		return -ENOMEM;
+
+	ret = ice_vc_fdir_set_flow_fld(vf, fltr, conf, seg);
+	if (ret) {
+		dev_dbg(dev, "Set flow field for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	ret = ice_vc_fdir_set_flow_hdr(vf, conf, seg);
+	if (ret) {
+		dev_dbg(dev, "Set flow hdr for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	/* -EEXIST means an identical segment is already stored in the
+	 * profile: not an error for the caller, but our copy was not taken
+	 * over and must be freed. On plain success seg now belongs to the
+	 * profile and is deliberately not freed.
+	 */
+	ret = ice_vc_fdir_write_flow_prof(vf, flow, seg, tun);
+	if (ret == -EEXIST) {
+		devm_kfree(dev, seg);
+	} else if (ret) {
+		dev_dbg(dev, "Write flow profile for VF %d failed\n",
+			vf->vf_id);
+		goto err_exit;
+	}
+
+	return 0;
+
+err_exit:
+	devm_kfree(dev, seg);
+	return ret;
+}
+
+/**
+ * ice_vc_validate_fdir_fltr - validate the virtual channel filter
+ * @vf: pointer to the VF info
+ * @fltr: virtual channel add cmd buffer
+ * @conf: FDIR configuration for each filter
+ *
+ * Placeholder: no validation is implemented yet, so every rule is
+ * currently rejected with -EINVAL.
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_validate_fdir_fltr(struct ice_vf *vf, struct virtchnl_fdir_add *fltr,
+			  struct virtchnl_fdir_fltr_conf *conf)
+{
+	/* Todo: rule validation */
+	return -EINVAL;
+}
+
+/**
+ * ice_vc_fdir_comp_rules - compare if two filter rules have the same value
+ * @conf_a: FDIR configuration for filter a
+ * @conf_b: FDIR configuration for filter b
+ *
+ * Return: true if both rules have the same value, false otherwise.
+ */
+static bool
+ice_vc_fdir_comp_rules(struct virtchnl_fdir_fltr_conf *conf_a,
+		       struct virtchnl_fdir_fltr_conf *conf_b)
+{
+	struct ice_fdir_fltr *lhs = &conf_a->input;
+	struct ice_fdir_fltr *rhs = &conf_b->input;
+
+	/* rules match only when the flow type and all four match/mask
+	 * members are bytewise equal; && stops at the first difference
+	 */
+	return lhs->flow_type == rhs->flow_type &&
+	       !memcmp(&lhs->ip, &rhs->ip, sizeof(lhs->ip)) &&
+	       !memcmp(&lhs->mask, &rhs->mask, sizeof(lhs->mask)) &&
+	       !memcmp(&lhs->ext_data, &rhs->ext_data,
+		       sizeof(lhs->ext_data)) &&
+	       !memcmp(&lhs->ext_mask, &rhs->ext_mask,
+		       sizeof(lhs->ext_mask));
+}
+
+/**
+ * ice_vc_fdir_is_dup_fltr
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ *
+ * Walk the VF's FDIR rule list looking for a rule with the same conf value
+ *
+ * Return: true if such a rule is already on the list, false otherwise.
+ */
+static bool
+ice_vc_fdir_is_dup_fltr(struct ice_vf *vf, struct virtchnl_fdir_fltr_conf *conf)
+{
+	struct ice_fdir_fltr *cur;
+
+	list_for_each_entry(cur, &vf->fdir.fdir_rule_list, fltr_node)
+		if (ice_vc_fdir_comp_rules(to_fltr_conf_from_desc(cur), conf))
+			return true;
+
+	return false;
+}
+
+/**
+ * ice_vc_fdir_insert_entry
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ * @id: pointer to ID value allocated by driver
+ *
+ * Insert FDIR conf entry into list and allocate ID for this filter
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int
+ice_vc_fdir_insert_entry(struct ice_vf *vf,
+			 struct virtchnl_fdir_fltr_conf *conf, u32 *id)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+	int i;
+
+	/* alloc ID corresponding with conf */
+	i = idr_alloc(&vf->fdir.fdir_rule_idr, conf, 0,
+		      ICE_FDIR_MAX_FLTRS, GFP_KERNEL);
+	/* any idr_alloc() failure is reported to the caller as -EINVAL */
+	if (i < 0)
+		return -EINVAL;
+	*id = i;
+
+	list_add(&input->fltr_node, &vf->fdir.fdir_rule_list);
+	return 0;
+}
+
+/**
+ * ice_vc_fdir_remove_entry - remove FDIR conf entry by ID value
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ * @id: filter rule's ID
+ *
+ * Only unlinks @conf from the ID map and the rule list; @conf itself is not
+ * freed here.
+ */
+static void
+ice_vc_fdir_remove_entry(struct ice_vf *vf,
+			 struct virtchnl_fdir_fltr_conf *conf, u32 id)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+
+	idr_remove(&vf->fdir.fdir_rule_idr, id);
+	list_del(&input->fltr_node);
+}
+
+/**
+ * ice_vc_fdir_lookup_entry - lookup FDIR conf entry by ID value
+ * @vf: pointer to the VF info
+ * @id: filter rule's ID
+ *
+ * Return: NULL on error, and other on success.
+ */
+static struct virtchnl_fdir_fltr_conf *
+ice_vc_fdir_lookup_entry(struct ice_vf *vf, u32 id)
+{
+	return idr_find(&vf->fdir.fdir_rule_idr, id);
+}
+
+/**
+ * ice_vc_fdir_flush_entry - remove all FDIR conf entry
+ * @vf: pointer to the VF info
+ *
+ * Unlinks and frees every conf on the rule list. The IDs are not removed
+ * from the ID map here; ice_vf_fdir_exit() destroys the whole map right
+ * after calling this function.
+ */
+static void ice_vc_fdir_flush_entry(struct ice_vf *vf)
+{
+	struct virtchnl_fdir_fltr_conf *conf;
+	struct ice_fdir_fltr *desc, *temp;
+
+	/* _safe variant: the current node is unlinked and freed in the loop */
+	list_for_each_entry_safe(desc, temp,
+				 &vf->fdir.fdir_rule_list, fltr_node) {
+		conf = to_fltr_conf_from_desc(desc);
+		list_del(&desc->fltr_node);
+		devm_kfree(ice_pf_to_dev(vf->pf), conf);
+	}
+}
+
+/**
+ * ice_vc_fdir_write_fltr - write filter rule into hardware
+ * @vf: pointer to the VF info
+ * @conf: FDIR configuration for each filter
+ * @add: true implies add rule, false implies del rules
+ * @is_tun: false implies non-tunnel type filter, true implies tunnel filter
+ *
+ * Return: 0 on success, and other on error.
+ */
+static int ice_vc_fdir_write_fltr(struct ice_vf *vf,
+				  struct virtchnl_fdir_fltr_conf *conf,
+				  bool add, bool is_tun)
+{
+	struct ice_fdir_fltr *input = &conf->input;
+	struct ice_vsi *vsi, *ctrl_vsi;
+	struct ice_fltr_desc desc;
+	enum ice_status status;
+	struct device *dev;
+	struct ice_pf *pf;
+	struct ice_hw *hw;
+	int ret;
+	u8 *pkt;
+
+	pf = vf->pf;
+	dev = ice_pf_to_dev(pf);
+	hw = &pf->hw;
+	vsi = pf->vsi[vf->lan_vsi_idx];
+	if (!vsi) {
+		dev_dbg(dev, "Invalid vsi for VF %d\n", vf->vf_id);
+		return -EINVAL;
+	}
+
+	/* the rule targets the VF's LAN VSI ... */
+	input->dest_vsi = vsi->idx;
+	input->comp_report = ICE_FXD_FLTR_QW0_COMP_REPORT_SW_FAIL;
+
+	/* ... but is programmed through the VF's control VSI */
+	ctrl_vsi = pf->vsi[vf->ctrl_vsi_idx];
+	if (!ctrl_vsi) {
+		dev_dbg(dev, "Invalid ctrl_vsi for VF %d\n", vf->vf_id);
+		return -EINVAL;
+	}
+
+	/* zeroed buffer that ice_fdir_get_gen_prgm_pkt() fills in below */
+	pkt = devm_kzalloc(dev, ICE_FDIR_MAX_RAW_PKT_SIZE, GFP_KERNEL);
+	if (!pkt)
+		return -ENOMEM;
+
+	ice_fdir_get_prgm_desc(hw, input, &desc, add);
+	status = ice_fdir_get_gen_prgm_pkt(hw, input, pkt, false, is_tun);
+	ret = ice_status_to_errno(status);
+	if (ret) {
+		dev_dbg(dev, "Gen training pkt for VF %d ptype %d failed\n",
+			vf->vf_id, input->flow_type);
+		goto err_free_pkt;
+	}
+
+	ret = ice_prgm_fdir_fltr(ctrl_vsi, &desc, pkt);
+	if (ret)
+		goto err_free_pkt;
+
+	/* NOTE(review): pkt is not freed on the success path - presumably
+	 * ice_prgm_fdir_fltr() takes ownership of the buffer; confirm
+	 */
+	return 0;
+
+err_free_pkt:
+	devm_kfree(dev, pkt);
+	return ret;
+}
+
+/**
+ * ice_vc_add_fdir_fltr - add a FDIR filter for VF by the msg buffer
+ * @vf: pointer to the VF info
+ * @msg: pointer to the msg buffer
+ *
+ * Return: 0 on success, and other on error.
+ */
+int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg)
+{
+	struct virtchnl_fdir_add *fltr = (struct virtchnl_fdir_add *)msg;
+	struct virtchnl_fdir_add *stat = NULL;
+	struct virtchnl_fdir_fltr_conf *conf;
+	enum virtchnl_status_code v_ret;
+	struct device *dev;
+	struct ice_pf *pf;
+	int is_tun = 0;
+	int len = 0;
+	int ret;
+
+	/* Every path through this function sends exactly one reply to the
+	 * VF. Until stat is allocated and len is set, that reply carries no
+	 * payload (stat == NULL, len == 0) and only v_ret tells the VF what
+	 * went wrong; afterwards v_ret is SUCCESS and the outcome of the
+	 * request is reported in stat->status.
+	 */
+	pf = vf->pf;
+	dev = ice_pf_to_dev(pf);
+	ret = ice_vc_fdir_param_check(vf, fltr->vsi_id);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		dev_dbg(dev, "Parameter check for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	/* -EEXIST is not treated as a failure here */
+	ret = ice_vf_start_ctrl_vsi(vf);
+	if (ret && (ret != -EEXIST)) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		dev_err(dev, "Init FDIR for VF %d failed, ret:%d\n",
+			vf->vf_id, ret);
+		goto err_exit;
+	}
+
+	stat = kzalloc(sizeof(*stat), GFP_KERNEL);
+	if (!stat) {
+		v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+		dev_dbg(dev, "Alloc stat for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	conf = devm_kzalloc(dev, sizeof(*conf), GFP_KERNEL);
+	if (!conf) {
+		v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+		dev_dbg(dev, "Alloc conf for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	len = sizeof(*stat);
+	ret = ice_vc_validate_fdir_fltr(vf, fltr, conf);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_INVALID;
+		dev_dbg(dev, "Invalid FDIR filter from VF %d\n", vf->vf_id);
+		goto err_free_conf;
+	}
+
+	if (fltr->validate_only) {
+		/* validation-only request: nothing is programmed, conf is
+		 * not kept. The reply is sent once, at the exit label; do
+		 * not send it here as well or the VF gets two responses to
+		 * a single request.
+		 */
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_SUCCESS;
+		devm_kfree(dev, conf);
+		goto exit;
+	}
+
+	ret = ice_vc_fdir_config_input_set(vf, fltr, conf, is_tun);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT;
+		dev_err(dev, "VF %d: FDIR input set configure failed, ret:%d\n",
+			vf->vf_id, ret);
+		goto err_free_conf;
+	}
+
+	ret = ice_vc_fdir_is_dup_fltr(vf, conf);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_EXIST;
+		dev_dbg(dev, "VF %d: duplicated FDIR rule detected\n",
+			vf->vf_id);
+		goto err_free_conf;
+	}
+
+	/* the allocated ID is written straight into the reply's flow_id */
+	ret = ice_vc_fdir_insert_entry(vf, conf, &stat->flow_id);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE;
+		dev_dbg(dev, "VF %d: insert FDIR list failed\n", vf->vf_id);
+		goto err_free_conf;
+	}
+
+	ret = ice_vc_fdir_write_fltr(vf, conf, true, is_tun);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE;
+		dev_err(dev, "VF %d: writing FDIR rule failed, ret:%d\n",
+			vf->vf_id, ret);
+		goto err_rem_entry;
+	}
+
+	vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]++;
+
+	v_ret = VIRTCHNL_STATUS_SUCCESS;
+	stat->status = VIRTCHNL_FDIR_SUCCESS;
+exit:
+	ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_FDIR_FILTER, v_ret,
+				    (u8 *)stat, len);
+	kfree(stat);
+	return ret;
+
+	/* error unwind: undo list/ID insertion, free conf, then reply */
+err_rem_entry:
+	ice_vc_fdir_remove_entry(vf, conf, stat->flow_id);
+err_free_conf:
+	devm_kfree(dev, conf);
+err_exit:
+	ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_ADD_FDIR_FILTER, v_ret,
+				    (u8 *)stat, len);
+	kfree(stat);
+	return ret;
+}
+
+/**
+ * ice_vc_del_fdir_fltr - delete a FDIR filter for VF by the msg buffer
+ * @vf: pointer to the VF info
+ * @msg: pointer to the msg buffer
+ *
+ * Return: 0 on success, and other on error.
+ */
+int ice_vc_del_fdir_fltr(struct ice_vf *vf, u8 *msg)
+{
+	struct virtchnl_fdir_del *fltr = (struct virtchnl_fdir_del *)msg;
+	struct virtchnl_fdir_del *stat = NULL;
+	struct virtchnl_fdir_fltr_conf *conf;
+	enum virtchnl_status_code v_ret;
+	struct device *dev;
+	struct ice_pf *pf;
+	int is_tun = 0;
+	int len = 0;
+	int ret;
+
+	/* Exactly one reply is sent, at err_exit, on every path (the label
+	 * is also the normal exit). Before stat is allocated the reply has
+	 * no payload (stat == NULL, len == 0); afterwards v_ret is SUCCESS
+	 * and the outcome is reported in stat->status.
+	 */
+	pf = vf->pf;
+	dev = ice_pf_to_dev(pf);
+	ret = ice_vc_fdir_param_check(vf, fltr->vsi_id);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_ERR_PARAM;
+		dev_dbg(dev, "Parameter check for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	stat = kzalloc(sizeof(*stat), GFP_KERNEL);
+	if (!stat) {
+		v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY;
+		dev_dbg(dev, "Alloc stat for VF %d failed\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	len = sizeof(*stat);
+
+	conf = ice_vc_fdir_lookup_entry(vf, fltr->flow_id);
+	if (!conf) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST;
+		dev_dbg(dev, "VF %d: FDIR invalid flow_id:0x%X\n",
+			vf->vf_id, fltr->flow_id);
+		goto err_exit;
+	}
+
+	/* Just return failure when ctrl_vsi idx is invalid */
+	if (vf->ctrl_vsi_idx == ICE_NO_VSI) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE;
+		dev_err(dev, "Invalid FDIR ctrl_vsi for VF %d\n", vf->vf_id);
+		goto err_exit;
+	}
+
+	ret = ice_vc_fdir_write_fltr(vf, conf, false, is_tun);
+	if (ret) {
+		v_ret = VIRTCHNL_STATUS_SUCCESS;
+		stat->status = VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE;
+		dev_err(dev, "VF %d: writing FDIR rule failed, ret:%d\n",
+			vf->vf_id, ret);
+		goto err_exit;
+	}
+
+	/* drop the per-flow filter count while conf is still valid: the
+	 * index is read from conf, so this must happen before conf is freed
+	 */
+	vf->fdir.fdir_fltr_cnt[conf->input.flow_type][is_tun]--;
+	ice_vc_fdir_remove_entry(vf, conf, fltr->flow_id);
+	devm_kfree(dev, conf);
+
+	v_ret = VIRTCHNL_STATUS_SUCCESS;
+	stat->status = VIRTCHNL_FDIR_SUCCESS;
+
+err_exit:
+	ret = ice_vc_send_msg_to_vf(vf, VIRTCHNL_OP_DEL_FDIR_FILTER, v_ret,
+				    (u8 *)stat, len);
+	kfree(stat);
+	return ret;
+}
+
+/**
+ * ice_vf_fdir_init - init FDIR resource for VF
+ * @vf: pointer
to the VF info
+ */
+void ice_vf_fdir_init(struct ice_vf *vf)
+{
+	struct ice_vf_fdir *fdir = &vf->fdir;
+
+	/* start with an empty flow-ID map and an empty rule list */
+	idr_init(&fdir->fdir_rule_idr);
+	INIT_LIST_HEAD(&fdir->fdir_rule_list);
+}
+
+/**
+ * ice_vf_fdir_exit - destroy FDIR resource for VF
+ * @vf: pointer to the VF info
+ */
+void ice_vf_fdir_exit(struct ice_vf *vf)
+{
+	/* rules first (list entries are freed, then the ID map is
+	 * destroyed), flow profiles afterwards
+	 */
+	ice_vc_fdir_flush_entry(vf);
+	idr_destroy(&vf->fdir.fdir_rule_idr);
+	ice_vc_fdir_rem_prof_all(vf);
+	ice_vc_fdir_free_prof_all(vf);
+}
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h
new file mode 100644
index 000000000000..2a2e0e598559
--- /dev/null
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2021, Intel Corporation. */
+
+#ifndef _ICE_VIRTCHNL_FDIR_H_
+#define _ICE_VIRTCHNL_FDIR_H_
+
+struct ice_vf;
+
+/* VF FDIR information structure */
+struct ice_vf_fdir {
+	/* filters currently installed, indexed [flow type][tunnel]:
+	 * incremented on a successful add, decremented on a successful del
+	 */
+	u16 fdir_fltr_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX];
+	/* flow entries recorded for the profile, indexed [flow type][tunnel] */
+	int prof_entry_cnt[ICE_FLTR_PTYPE_MAX][ICE_FD_HW_SEG_MAX];
+	/* per-flow-type profile table, indexed by flow type */
+	struct ice_fd_hw_prof **fdir_prof;
+
+	/* maps a flow ID to its filter conf */
+	struct idr fdir_rule_idr;
+	/* all filter confs of this VF, linked through input.fltr_node */
+	struct list_head fdir_rule_list;
+};
+
+int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg);
+int ice_vc_del_fdir_fltr(struct ice_vf *vf, u8 *msg);
+void ice_vf_fdir_init(struct ice_vf *vf);
+void ice_vf_fdir_exit(struct ice_vf *vf);
+
+#endif /* _ICE_VIRTCHNL_FDIR_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
index fa72b7e2e433..20343a0fe726 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -233,6 +233,7 @@ static void ice_free_vf_res(struct ice_vf *vf)
 	 * accessing the VF's VSI after it's freed or invalidated.
*/ clear_bit(ICE_VF_STATE_INIT, vf->vf_states); + ice_vf_fdir_exit(vf); /* free VF control VSI */ if (vf->ctrl_vsi_idx != ICE_NO_VSI) ice_vf_ctrl_vsi_release(vf); @@ -1300,6 +1301,7 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) ice_for_each_vf(pf, v) { vf = &pf->vf[v]; + ice_vf_fdir_exit(vf); /* clean VF control VSI when resetting VFs since it should be * setup only when VF creates its first FDIR rule. */ @@ -1424,6 +1426,7 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) dev_err(dev, "disabling promiscuous mode failed\n"); } + ice_vf_fdir_exit(vf); /* clean VF control VSI when resetting VF since it should be setup * only when VF creates its first FDIR rule. */ @@ -1610,6 +1613,7 @@ static void ice_set_dflt_settings_vfs(struct ice_pf *pf) * creates its first fdir rule. */ ice_vf_ctrl_invalidate_vsi(vf); + ice_vf_fdir_init(vf); } } @@ -1909,7 +1913,7 @@ ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event) * * send msg to VF */ -static int +int ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, enum virtchnl_status_code v_retval, u8 *msg, u16 msglen) { @@ -2057,6 +2061,9 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_REG; } + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_FDIR_PF) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_FDIR_PF; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2; @@ -2145,7 +2152,7 @@ static struct ice_vsi *ice_find_vsi_from_id(struct ice_pf *pf, u16 id) * * check for the valid VSI ID */ -static bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id) +bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id) { struct ice_pf *pf = vf->pf; struct ice_vsi *vsi; @@ -3877,6 +3884,12 @@ error_handler: case VIRTCHNL_OP_DISABLE_VLAN_STRIPPING: err = ice_vc_dis_vlan_stripping(vf); break; + case VIRTCHNL_OP_ADD_FDIR_FILTER: + err = ice_vc_add_fdir_fltr(vf, msg); + break; + case 
VIRTCHNL_OP_DEL_FDIR_FILTER: + err = ice_vc_del_fdir_fltr(vf, msg); + break; case VIRTCHNL_OP_UNKNOWN: default: dev_err(dev, "Unsupported opcode %d from VF %d\n", v_opcode, diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index faa879d744a1..46abc5388fc7 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -4,6 +4,7 @@ #ifndef _ICE_VIRTCHNL_PF_H_ #define _ICE_VIRTCHNL_PF_H_ #include "ice.h" +#include "ice_virtchnl_fdir.h" /* Restrict number of MAC Addr and VLAN that non-trusted VF can programmed */ #define ICE_MAX_VLAN_PER_VF 8 @@ -71,6 +72,7 @@ struct ice_vf { u16 vf_id; /* VF ID in the PF space */ u16 lan_vsi_idx; /* index into PF struct */ u16 ctrl_vsi_idx; + struct ice_vf_fdir fdir; /* first vector index of this VF in the PF space */ int first_vector_idx; struct ice_sw *vf_sw_id; /* switch ID the VF VSIs connect to */ @@ -140,6 +142,10 @@ ice_vf_lan_overflow_event(struct ice_pf *pf, struct ice_rq_event_info *event); void ice_print_vfs_mdd_events(struct ice_pf *pf); void ice_print_vf_rx_mdd_event(struct ice_vf *vf); struct ice_vsi *ice_vf_ctrl_vsi_setup(struct ice_vf *vf); +int +ice_vc_send_msg_to_vf(struct ice_vf *vf, u32 v_opcode, + enum virtchnl_status_code v_retval, u8 *msg, u16 msglen); +bool ice_vc_isvalid_vsi_id(struct ice_vf *vf, u16 vsi_id); #else /* CONFIG_PCI_IOV */ #define ice_process_vflr_event(pf) do {} while (0) #define ice_free_vfs(pf) do {} while (0) diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 40bad71865ea..47482049f640 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -136,6 +136,9 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_CHANNELS = 31, VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, + /* opcode 34 - 46 are reserved */ + VIRTCHNL_OP_ADD_FDIR_FILTER = 47, + VIRTCHNL_OP_DEL_FDIR_FILTER = 48, }; /* These macros are used to generate 
compilation errors if a structure/union @@ -247,6 +250,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM 0X00200000 #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM 0X00400000 #define VIRTCHNL_VF_OFFLOAD_ADQ 0X00800000 +#define VIRTCHNL_VF_OFFLOAD_FDIR_PF 0X10000000 /* Define below the capability flags that are not offloads */ #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED 0x00000080 @@ -559,6 +563,11 @@ enum virtchnl_action { /* action types */ VIRTCHNL_ACTION_DROP = 0, VIRTCHNL_ACTION_TC_REDIRECT, + VIRTCHNL_ACTION_PASSTHRU, + VIRTCHNL_ACTION_QUEUE, + VIRTCHNL_ACTION_Q_REGION, + VIRTCHNL_ACTION_MARK, + VIRTCHNL_ACTION_COUNT, }; enum virtchnl_flow_type { @@ -668,6 +677,269 @@ enum virtchnl_vfr_states { VIRTCHNL_VFR_VFACTIVE, }; +#define VIRTCHNL_MAX_NUM_PROTO_HDRS 32 +#define PROTO_HDR_SHIFT 5 +#define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT) +#define PROTO_HDR_FIELD_MASK ((1UL << PROTO_HDR_SHIFT) - 1) + +/* VF use these macros to configure each protocol header. + * Specify which protocol headers and protocol header fields base on + * virtchnl_proto_hdr_type and virtchnl_proto_hdr_field. 
+ * @param hdr: a struct of virtchnl_proto_hdr + * @param hdr_type: ETH/IPV4/TCP, etc + * @param field: SRC/DST/TEID/SPI, etc + */ +#define VIRTCHNL_ADD_PROTO_HDR_FIELD(hdr, field) \ + ((hdr)->field_selector |= BIT((field) & PROTO_HDR_FIELD_MASK)) +#define VIRTCHNL_DEL_PROTO_HDR_FIELD(hdr, field) \ + ((hdr)->field_selector &= ~BIT((field) & PROTO_HDR_FIELD_MASK)) +#define VIRTCHNL_TEST_PROTO_HDR_FIELD(hdr, val) \ + ((hdr)->field_selector & BIT((val) & PROTO_HDR_FIELD_MASK)) +#define VIRTCHNL_GET_PROTO_HDR_FIELD(hdr) ((hdr)->field_selector) + +#define VIRTCHNL_ADD_PROTO_HDR_FIELD_BIT(hdr, hdr_type, field) \ + (VIRTCHNL_ADD_PROTO_HDR_FIELD(hdr, \ + VIRTCHNL_PROTO_HDR_ ## hdr_type ## _ ## field)) +#define VIRTCHNL_DEL_PROTO_HDR_FIELD_BIT(hdr, hdr_type, field) \ + (VIRTCHNL_DEL_PROTO_HDR_FIELD(hdr, \ + VIRTCHNL_PROTO_HDR_ ## hdr_type ## _ ## field)) + +#define VIRTCHNL_SET_PROTO_HDR_TYPE(hdr, hdr_type) \ + ((hdr)->type = VIRTCHNL_PROTO_HDR_ ## hdr_type) +#define VIRTCHNL_GET_PROTO_HDR_TYPE(hdr) \ + (((hdr)->type) >> PROTO_HDR_SHIFT) +#define VIRTCHNL_TEST_PROTO_HDR_TYPE(hdr, val) \ + ((hdr)->type == ((val) >> PROTO_HDR_SHIFT)) +#define VIRTCHNL_TEST_PROTO_HDR(hdr, val) \ + (VIRTCHNL_TEST_PROTO_HDR_TYPE((hdr), (val)) && \ + VIRTCHNL_TEST_PROTO_HDR_FIELD((hdr), (val))) + +/* Protocol header type within a packet segment. A segment consists of one or + * more protocol headers that make up a logical group of protocol headers. Each + * logical group of protocol headers encapsulates or is encapsulated using/by + * tunneling or encapsulation protocols for network virtualization. 
+ */ +enum virtchnl_proto_hdr_type { + VIRTCHNL_PROTO_HDR_NONE, + VIRTCHNL_PROTO_HDR_ETH, + VIRTCHNL_PROTO_HDR_S_VLAN, + VIRTCHNL_PROTO_HDR_C_VLAN, + VIRTCHNL_PROTO_HDR_IPV4, + VIRTCHNL_PROTO_HDR_IPV6, + VIRTCHNL_PROTO_HDR_TCP, + VIRTCHNL_PROTO_HDR_UDP, + VIRTCHNL_PROTO_HDR_SCTP, + VIRTCHNL_PROTO_HDR_GTPU_IP, + VIRTCHNL_PROTO_HDR_GTPU_EH, + VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN, + VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP, + VIRTCHNL_PROTO_HDR_PPPOE, + VIRTCHNL_PROTO_HDR_L2TPV3, + VIRTCHNL_PROTO_HDR_ESP, + VIRTCHNL_PROTO_HDR_AH, + VIRTCHNL_PROTO_HDR_PFCP, +}; + +/* Protocol header field within a protocol header. */ +enum virtchnl_proto_hdr_field { + /* ETHER */ + VIRTCHNL_PROTO_HDR_ETH_SRC = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ETH), + VIRTCHNL_PROTO_HDR_ETH_DST, + VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE, + /* S-VLAN */ + VIRTCHNL_PROTO_HDR_S_VLAN_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_S_VLAN), + /* C-VLAN */ + VIRTCHNL_PROTO_HDR_C_VLAN_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_C_VLAN), + /* IPV4 */ + VIRTCHNL_PROTO_HDR_IPV4_SRC = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV4), + VIRTCHNL_PROTO_HDR_IPV4_DST, + VIRTCHNL_PROTO_HDR_IPV4_DSCP, + VIRTCHNL_PROTO_HDR_IPV4_TTL, + VIRTCHNL_PROTO_HDR_IPV4_PROT, + /* IPV6 */ + VIRTCHNL_PROTO_HDR_IPV6_SRC = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6), + VIRTCHNL_PROTO_HDR_IPV6_DST, + VIRTCHNL_PROTO_HDR_IPV6_TC, + VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT, + VIRTCHNL_PROTO_HDR_IPV6_PROT, + /* TCP */ + VIRTCHNL_PROTO_HDR_TCP_SRC_PORT = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_TCP), + VIRTCHNL_PROTO_HDR_TCP_DST_PORT, + /* UDP */ + VIRTCHNL_PROTO_HDR_UDP_SRC_PORT = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_UDP), + VIRTCHNL_PROTO_HDR_UDP_DST_PORT, + /* SCTP */ + VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_SCTP), + VIRTCHNL_PROTO_HDR_SCTP_DST_PORT, + /* GTPU_IP */ + VIRTCHNL_PROTO_HDR_GTPU_IP_TEID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_IP), + /* GTPU_EH */ + 
VIRTCHNL_PROTO_HDR_GTPU_EH_PDU = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH), + VIRTCHNL_PROTO_HDR_GTPU_EH_QFI, + /* PPPOE */ + VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PPPOE), + /* L2TPV3 */ + VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_L2TPV3), + /* ESP */ + VIRTCHNL_PROTO_HDR_ESP_SPI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ESP), + /* AH */ + VIRTCHNL_PROTO_HDR_AH_SPI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_AH), + /* PFCP */ + VIRTCHNL_PROTO_HDR_PFCP_S_FIELD = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PFCP), + VIRTCHNL_PROTO_HDR_PFCP_SEID, +}; + +struct virtchnl_proto_hdr { + enum virtchnl_proto_hdr_type type; + u32 field_selector; /* a bit mask to select field for header type */ + u8 buffer[64]; + /** + * binary buffer in network order for specific header type. + * For example, if type = VIRTCHNL_PROTO_HDR_IPV4, a IPv4 + * header is expected to be copied into the buffer. + */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_proto_hdr); + +struct virtchnl_proto_hdrs { + u8 tunnel_level; + /** + * specify where protocol header start from. + * 0 - from the outer layer + * 1 - from the first inner layer + * 2 - from the second inner layer + * .... 
+ **/ + int count; /* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */ + struct virtchnl_proto_hdr proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2312, virtchnl_proto_hdrs); + +/* action configuration for FDIR */ +struct virtchnl_filter_action { + enum virtchnl_action type; + union { + /* used for queue and qgroup action */ + struct { + u16 index; + u8 region; + } queue; + /* used for count action */ + struct { + /* share counter ID with other flow rules */ + u8 shared; + u32 id; /* counter ID */ + } count; + /* used for mark action */ + u32 mark_id; + u8 reserve[32]; + } act_conf; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(36, virtchnl_filter_action); + +#define VIRTCHNL_MAX_NUM_ACTIONS 8 + +struct virtchnl_filter_action_set { + /* action number must be less then VIRTCHNL_MAX_NUM_ACTIONS */ + int count; + struct virtchnl_filter_action actions[VIRTCHNL_MAX_NUM_ACTIONS]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(292, virtchnl_filter_action_set); + +/* pattern and action for FDIR rule */ +struct virtchnl_fdir_rule { + struct virtchnl_proto_hdrs proto_hdrs; + struct virtchnl_filter_action_set action_set; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2604, virtchnl_fdir_rule); + +/* Status returned to VF after VF requests FDIR commands + * VIRTCHNL_FDIR_SUCCESS + * VF FDIR related request is successfully done by PF + * The request can be OP_ADD/DEL. + * + * VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE + * OP_ADD_FDIR_FILTER request is failed due to no Hardware resource. + * + * VIRTCHNL_FDIR_FAILURE_RULE_EXIST + * OP_ADD_FDIR_FILTER request is failed due to the rule is already existed. + * + * VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT + * OP_ADD_FDIR_FILTER request is failed due to conflict with existing rule. + * + * VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST + * OP_DEL_FDIR_FILTER request is failed due to this rule doesn't exist. + * + * VIRTCHNL_FDIR_FAILURE_RULE_INVALID + * OP_ADD_FDIR_FILTER request is failed due to parameters validation + * or HW doesn't support. 
+ * + * VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT + * OP_ADD/DEL_FDIR_FILTER request is failed due to timing out + * for programming. + */ +enum virtchnl_fdir_prgm_status { + VIRTCHNL_FDIR_SUCCESS = 0, + VIRTCHNL_FDIR_FAILURE_RULE_NORESOURCE, + VIRTCHNL_FDIR_FAILURE_RULE_EXIST, + VIRTCHNL_FDIR_FAILURE_RULE_CONFLICT, + VIRTCHNL_FDIR_FAILURE_RULE_NONEXIST, + VIRTCHNL_FDIR_FAILURE_RULE_INVALID, + VIRTCHNL_FDIR_FAILURE_RULE_TIMEOUT, +}; + +/* VIRTCHNL_OP_ADD_FDIR_FILTER + * VF sends this request to PF by filling out vsi_id, + * validate_only and rule_cfg. PF will return flow_id + * if the request is successfully done and return add_status to VF. + */ +struct virtchnl_fdir_add { + u16 vsi_id; /* INPUT */ + /* + * 1 for validating a fdir rule, 0 for creating a fdir rule. + * Validate and create share one ops: VIRTCHNL_OP_ADD_FDIR_FILTER. + */ + u16 validate_only; /* INPUT */ + u32 flow_id; /* OUTPUT */ + struct virtchnl_fdir_rule rule_cfg; /* INPUT */ + enum virtchnl_fdir_prgm_status status; /* OUTPUT */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2616, virtchnl_fdir_add); + +/* VIRTCHNL_OP_DEL_FDIR_FILTER + * VF sends this request to PF by filling out vsi_id + * and flow_id. PF will return del_status to VF. + */ +struct virtchnl_fdir_del { + u16 vsi_id; /* INPUT */ + u16 pad; + u32 flow_id; /* INPUT */ + enum virtchnl_fdir_prgm_status status; /* OUTPUT */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del); + /** * virtchnl_vc_validate_vf_msg * @ver: Virtchnl version info @@ -828,6 +1100,12 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_DEL_CLOUD_FILTER: valid_len = sizeof(struct virtchnl_filter); break; + case VIRTCHNL_OP_ADD_FDIR_FILTER: + valid_len = sizeof(struct virtchnl_fdir_add); + break; + case VIRTCHNL_OP_DEL_FDIR_FILTER: + valid_len = sizeof(struct virtchnl_fdir_del); + break; /* These are always errors coming from the VF. 
*/ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From f57bac3c33e761fdd78fef159fdc677056c706d0 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sun, 21 Mar 2021 22:48:49 +0900 Subject: netdev: add netdev_queue_set_dql_min_limit() Add a function to set the dynamic queue limit minimum value. Some specific drivers might have legitimate reasons to configure dql.min_limit to a given value. Typically, this is the case when the PDU of the protocol is smaller than the packet size used to carry those frames to the device. Concrete example: a CAN (Controller Area Network) device with a USB 2.0 interface. The PDUs of the classical CAN protocol are roughly 16 bytes but the USB packet size (which is used to carry the CAN frames to the device) might be up to 512 bytes. When a small traffic burst occurs, the BQL algorithm is not able to immediately adjust and this would result in having to send many small USB packets (i.e. one packet of 16 bytes for each CAN frame). Filling up the USB packet with CAN frames is relatively fast (small latency issue) but the gain of not having to send several small USB packets is huge (big throughput increase). In this case, forcing dql.min_limit to a given value that would allow the USB packet to be filled is always a win. This function is to be used by network drivers which are able to prove through a rationale and through empirical tests in several environments (with other applications, heavy context switching, virtualization...), that they consistently reach better performance with a specific predefined dql.min_limit value with no noticeable latency impact. Signed-off-by: Vincent Mailhol Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8f003955c485..33b8ea08996e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3446,6 +3446,24 @@ netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue) return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN; } +/** + * netdev_queue_set_dql_min_limit - set dql minimum limit + * @dev_queue: pointer to transmit queue + * @min_limit: dql minimum limit + * + * Forces xmit_more() to return true until the minimum threshold + * defined by @min_limit is reached (or until the tx queue is + * empty). Warning: to be use with care, misuse will impact the + * latency. + */ +static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue, + unsigned int min_limit) +{ +#ifdef CONFIG_BQL + dev_queue->dql.min_limit = min_limit; +#endif +} + /** * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue -- cgit v1.2.3 From 405a129f59384c474343d6261a2e0a75650d29a8 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Mon, 22 Mar 2021 08:25:16 +0530 Subject: linux/qed: Mundane spelling fixes throughout the file s/unrequired/"not required"/ s/consme/consume/ .....two different places s/accros/across/ Signed-off-by: Bhaskar Chowdhury Acked-by: Igor Russkikh Signed-off-by: David S. 
Miller --- include/linux/qed/qed_chain.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index e339b48de32d..f34dbd0db795 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -19,7 +19,7 @@ enum qed_chain_mode { /* Each Page contains a next pointer at its end */ QED_CHAIN_MODE_NEXT_PTR, - /* Chain is a single page (next ptr) is unrequired */ + /* Chain is a single page (next ptr) is not required */ QED_CHAIN_MODE_SINGLE, /* Page pointers are located in a side list */ @@ -56,13 +56,13 @@ struct qed_chain_pbl_u32 { }; struct qed_chain_u16 { - /* Cyclic index of next element to produce/consme */ + /* Cyclic index of next element to produce/consume */ u16 prod_idx; u16 cons_idx; }; struct qed_chain_u32 { - /* Cyclic index of next element to produce/consme */ + /* Cyclic index of next element to produce/consume */ u32 prod_idx; u32 cons_idx; }; @@ -270,7 +270,7 @@ static inline dma_addr_t qed_chain_get_pbl_phys(const struct qed_chain *chain) /** * @brief qed_chain_advance_page - * - * Advance the next element accros pages for a linked chain + * Advance the next element across pages for a linked chain * * @param p_chain * @param p_next_elem -- cgit v1.2.3 From 744b8376632208137fe4acc9967b93e2970732a3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 22 Mar 2021 13:31:48 +0200 Subject: net: move the ptype_all and ptype_base declarations to include/linux/netdevice.h ptype_all and ptype_base are declared in net/core/dev.c as non-static, because they are used by net-procfs.c too. However, a "make W=1" build complains that there was no previous declaration of ptype_all and ptype_base in a header file, so this way of declaring things constitutes a violation of coding style. Let's move the extern declarations of ptype_all and ptype_base to the linux/netdevice.h file, which is included by net-procfs.c too. 
Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ net/core/net-procfs.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 33b8ea08996e..e4a503288d9b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5336,6 +5336,9 @@ do { \ #define PTYPE_HASH_SIZE (16) #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) +extern struct list_head ptype_all __read_mostly; +extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; + extern struct net_device *blackhole_netdev; #endif /* _LINUX_NETDEVICE_H */ diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index c714e6a9dad4..d8b9dbabd4a4 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -10,9 +10,6 @@ #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) -extern struct list_head ptype_all __read_mostly; -extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; - static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); -- cgit v1.2.3 From add2d73631070c951b0de81a01d1463a15cfbd47 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Mar 2021 11:21:45 -0700 Subject: net: set initial device refcount to 1 When adding CONFIG_PCPU_DEV_REFCNT, I forgot that the initial net device refcount was 0. When CONFIG_PCPU_DEV_REFCNT is not set, this means the first dev_hold() triggers an illegal refcount operation (addition on 0) refcount_t: addition on 0; use-after-free. WARNING: CPU: 0 PID: 1 at lib/refcount.c:25 refcount_warn_saturate+0x128/0x1a4 Fix is to change initial (and final) refcount to be 1. Also add a missing kerneldoc piece, as reported by Stephen Rothwell. 
Fixes: 919067cc845f ("net: add CONFIG_PCPU_DEV_REFCNT") Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e4a503288d9b..7005ad80e8d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1792,6 +1792,7 @@ enum netdev_ml_priv_type { * * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device + * @dev_refcnt: Number of references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one * diff --git a/net/core/dev.c b/net/core/dev.c index ffab3928eeeb..c9a496f5e687 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10346,7 +10346,7 @@ static void netdev_wait_allrefs(struct net_device *dev) rebroadcast_time = warning_time = jiffies; refcnt = netdev_refcnt_read(dev); - while (refcnt != 0) { + while (refcnt != 1) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -10383,7 +10383,7 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) { + if (refcnt != 1 && time_after(jiffies, warning_time + 10 * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. 
Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; @@ -10459,7 +10459,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); @@ -10680,6 +10680,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; + dev_hold(dev); +#else + refcount_set(&dev->dev_refcnt, 1); #endif if (dev_addr_init(dev)) -- cgit v1.2.3 From 1ab568e92bf8f6a359c977869dc546a23a6b5f13 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Mon, 22 Mar 2021 19:51:13 +0100 Subject: net: dsa: hellcreek: Report switch name and ID Report the driver name, ASIC ID and the switch name via devlink. This is useful information for user space tooling. Signed-off-by: Kurt Kanzenbach Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Reviewed-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/dsa/hirschmann/hellcreek.c | 18 ++++++++++++++++++ include/linux/platform_data/hirschmann-hellcreek.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index 64a73dd045c0..918be7eb626f 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -1082,6 +1082,22 @@ out: return ret; } +static int hellcreek_devlink_info_get(struct dsa_switch *ds, + struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + struct hellcreek *hellcreek = ds->priv; + int ret; + + ret = devlink_info_driver_name_put(req, "hellcreek"); + if (ret) + return ret; + + return devlink_info_version_fixed_put(req, + DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, + hellcreek->pdata->name); +} + static u64 hellcreek_devlink_vlan_table_get(void *priv) { struct hellcreek *hellcreek = priv; @@ -1732,6 +1748,7 @@ static int hellcreek_port_setup_tc(struct dsa_switch *ds, int port, } static const struct dsa_switch_ops hellcreek_ds_ops = { + .devlink_info_get = hellcreek_devlink_info_get, .get_ethtool_stats = hellcreek_get_ethtool_stats, .get_sset_count = hellcreek_get_sset_count, .get_strings = hellcreek_get_strings, @@ -1909,6 +1926,7 @@ static int hellcreek_remove(struct platform_device *pdev) } static const struct hellcreek_platform_data de1soc_r1_pdata = { + .name = "r4c30", .num_ports = 4, .is_100_mbits = 1, .qbv_support = 1, diff --git a/include/linux/platform_data/hirschmann-hellcreek.h b/include/linux/platform_data/hirschmann-hellcreek.h index 388846766bb2..6a000df5541f 100644 --- a/include/linux/platform_data/hirschmann-hellcreek.h +++ b/include/linux/platform_data/hirschmann-hellcreek.h @@ -12,6 +12,7 @@ #include struct hellcreek_platform_data { + const char *name; /* Switch name */ int num_ports; /* Amount of switch ports */ int is_100_mbits; /* Is it configured to 100 or 1000 mbit/s */ int qbv_support; /* Qbv support on front 
TSN ports */ -- cgit v1.2.3 From 65d2dbb300197839eafc4171cfeb57a14c452724 Mon Sep 17 00:00:00 2001 From: Xie He Date: Sun, 21 Mar 2021 02:39:35 -0700 Subject: net: lapb: Make "lapb_t1timer_running" able to detect an already running timer Problem: The "lapb_t1timer_running" function in "lapb_timer.c" is used in only one place: in the "lapb_kick" function in "lapb_out.c". "lapb_kick" calls "lapb_t1timer_running" to check if the timer is already pending, and if it is not, schedule it to run. However, if the timer has already fired and is running, and is waiting to get the "lapb->lock" lock, "lapb_t1timer_running" will not detect this, and "lapb_kick" will then schedule a new timer. The old timer will then abort when it sees a new timer pending. I think this is not right. The purpose of "lapb_kick" should be ensuring that the actual work of the timer function is scheduled to be done. If the timer function is already running but waiting for the lock, "lapb_kick" should not abort and reschedule it. Changes made: I added a new field "t1timer_running" in "struct lapb_cb" for "lapb_t1timer_running" to use. "t1timer_running" will accurately reflect whether the actual work of the timer is pending. If the timer has fired but is still waiting for the lock, "t1timer_running" will still correctly reflect whether the actual work is waiting to be done. The old "t1timer_stop" field, whose only responsibility is to ask a timer (that is already running but waiting for the lock) to abort, is no longer needed, because the new "t1timer_running" field can fully take over its responsibility. Therefore "t1timer_stop" is deleted. "t1timer_running" is not simply a negation of the old "t1timer_stop". At the end of the timer function, if it does not reschedule itself, "t1timer_running" is set to false to indicate that the timer is stopped. For consistency of the code, I also added "t2timer_running" and deleted "t2timer_stop". Signed-off-by: Xie He Signed-off-by: David S. 
Miller --- include/net/lapb.h | 2 +- net/lapb/lapb_iface.c | 4 ++-- net/lapb/lapb_timer.c | 19 ++++++++++++------- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/lapb.h b/include/net/lapb.h index eee73442a1ba..124ee122f2c8 100644 --- a/include/net/lapb.h +++ b/include/net/lapb.h @@ -92,7 +92,7 @@ struct lapb_cb { unsigned short n2, n2count; unsigned short t1, t2; struct timer_list t1timer, t2timer; - bool t1timer_stop, t2timer_stop; + bool t1timer_running, t2timer_running; /* Internal control information */ struct sk_buff_head write_queue; diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 0511bbe4af7b..1078e14f1acf 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -122,8 +122,8 @@ static struct lapb_cb *lapb_create_cb(void) timer_setup(&lapb->t1timer, NULL, 0); timer_setup(&lapb->t2timer, NULL, 0); - lapb->t1timer_stop = true; - lapb->t2timer_stop = true; + lapb->t1timer_running = false; + lapb->t2timer_running = false; lapb->t1 = LAPB_DEFAULT_T1; lapb->t2 = LAPB_DEFAULT_T2; diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 0230b272b7d1..5be68869064d 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -40,7 +40,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb) lapb->t1timer.function = lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; - lapb->t1timer_stop = false; + lapb->t1timer_running = true; add_timer(&lapb->t1timer); } @@ -51,25 +51,25 @@ void lapb_start_t2timer(struct lapb_cb *lapb) lapb->t2timer.function = lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; - lapb->t2timer_stop = false; + lapb->t2timer_running = true; add_timer(&lapb->t2timer); } void lapb_stop_t1timer(struct lapb_cb *lapb) { - lapb->t1timer_stop = true; + lapb->t1timer_running = false; del_timer(&lapb->t1timer); } void lapb_stop_t2timer(struct lapb_cb *lapb) { - lapb->t2timer_stop = true; + lapb->t2timer_running = false; del_timer(&lapb->t2timer); } int 
lapb_t1timer_running(struct lapb_cb *lapb) { - return timer_pending(&lapb->t1timer); + return lapb->t1timer_running; } static void lapb_t2timer_expiry(struct timer_list *t) @@ -79,13 +79,14 @@ static void lapb_t2timer_expiry(struct timer_list *t) spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t2timer)) /* A new timer has been set up */ goto out; - if (lapb->t2timer_stop) /* The timer has been stopped */ + if (!lapb->t2timer_running) /* The timer has been stopped */ goto out; if (lapb->condition & LAPB_ACK_PENDING_CONDITION) { lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; lapb_timeout_response(lapb); } + lapb->t2timer_running = false; out: spin_unlock_bh(&lapb->lock); @@ -98,7 +99,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t1timer)) /* A new timer has been set up */ goto out; - if (lapb->t1timer_stop) /* The timer has been stopped */ + if (!lapb->t1timer_running) /* The timer has been stopped */ goto out; switch (lapb->state) { @@ -127,6 +128,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev); + lapb->t1timer_running = false; goto out; } else { lapb->n2count++; @@ -151,6 +153,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb->state = LAPB_STATE_0; lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev); + lapb->t1timer_running = false; goto out; } else { lapb->n2count++; @@ -169,6 +172,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb_stop_t2timer(lapb); lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev); + lapb->t1timer_running = false; goto out; } else { lapb->n2count++; @@ -186,6 +190,7 @@ static void lapb_t1timer_expiry(struct timer_list *t) lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S4 -> S0\n", lapb->dev); + 
lapb->t1timer_running = false; goto out; } else { lapb->n2count++; -- cgit v1.2.3 From c0e715bbd50e57319f76d0b757dc282893f2d476 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 23 Mar 2021 01:51:42 +0200 Subject: net: bridge: add helper for retrieving the current bridge port STP state It may happen that we have the following topology with DSA or any other switchdev driver with LAG offload: ip link add br0 type bridge stp_state 1 ip link add bond0 type bond ip link set bond0 master br0 ip link set swp0 master bond0 ip link set swp1 master bond0 STP decides that it should put bond0 into the BLOCKING state, and that's that. The ports that are actively listening for the switchdev port attributes emitted for the bond0 bridge port (because they are offloading it) and have the honor of seeing that switchdev port attribute can react to it, so we can program swp0 and swp1 into the BLOCKING state. But if then we do: ip link set swp2 master bond0 then as far as the bridge is concerned, nothing has changed: it still has one bridge port. But this new bridge port will not see any STP state change notification and will remain FORWARDING, which is how the standalone code leaves it in. We need a function in the bridge driver which retrieves the current STP state, such that drivers can synchronize to it when they may have missed switchdev events. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 6 ++++++ net/bridge/br_stp.c | 14 ++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index b979005ea39c..920d3a02cc68 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -136,6 +136,7 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev, __u16 vid); void br_fdb_clear_offload(const struct net_device *dev, u16 vid); bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag); +u8 br_port_get_stp_state(const struct net_device *dev); #else static inline struct net_device * br_fdb_find_port(const struct net_device *br_dev, @@ -154,6 +155,11 @@ br_port_flag_is_set(const struct net_device *dev, unsigned long flag) { return false; } + +static inline u8 br_port_get_stp_state(const struct net_device *dev) +{ + return BR_STATE_DISABLED; +} #endif #endif diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 21c6781906aa..86b5e05d3f21 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -64,6 +64,20 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) } } +u8 br_port_get_stp_state(const struct net_device *dev) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + p = br_port_get_rtnl(dev); + if (!p) + return BR_STATE_DISABLED; + + return p->state; +} +EXPORT_SYMBOL_GPL(br_port_get_stp_state); + /* called under bridge lock */ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) { -- cgit v1.2.3 From f1d42ea10056b9050d1c5b8e19995f66c30aeded Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 23 Mar 2021 01:51:43 +0200 Subject: net: bridge: add helper to retrieve the current ageing time The SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME attribute is only emitted from: sysfs/ioctl/netlink -> br_set_ageing_time -> __set_ageing_time therefore not at bridge port creation time, so: (a) switchdev drivers have to hardcode the initial value for the address ageing 
time, because they didn't get any notification (b) that hardcoded value can be out of sync, if the user changes the ageing time before enslaving the port to the bridge We need a helper in the bridge, such that switchdev drivers can query the current value of the bridge ageing time when they start offloading it. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 6 ++++++ net/bridge/br_stp.c | 13 +++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 920d3a02cc68..ebd16495459c 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -137,6 +137,7 @@ struct net_device *br_fdb_find_port(const struct net_device *br_dev, void br_fdb_clear_offload(const struct net_device *dev, u16 vid); bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag); u8 br_port_get_stp_state(const struct net_device *dev); +clock_t br_get_ageing_time(struct net_device *br_dev); #else static inline struct net_device * br_fdb_find_port(const struct net_device *br_dev, @@ -160,6 +161,11 @@ static inline u8 br_port_get_stp_state(const struct net_device *dev) { return BR_STATE_DISABLED; } + +static inline clock_t br_get_ageing_time(struct net_device *br_dev) +{ + return 0; +} #endif #endif diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 86b5e05d3f21..3dafb6143cff 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -639,6 +639,19 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) return 0; } +clock_t br_get_ageing_time(struct net_device *br_dev) +{ + struct net_bridge *br; + + if (!netif_is_bridge_master(br_dev)) + return 0; + + br = netdev_priv(br_dev); + + return jiffies_to_clock_t(br->ageing_time); +} +EXPORT_SYMBOL_GPL(br_get_ageing_time); + /* called under bridge lock */ void 
__br_set_topology_change(struct net_bridge *br, unsigned char val) { -- cgit v1.2.3 From 4f2673b3a2b6246729a1ff13b8945a040839dbd3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 23 Mar 2021 01:51:44 +0200 Subject: net: bridge: add helper to replay port and host-joined mdb entries I have a system with DSA ports, and udhcpcd is configured to bring interfaces up as soon as they are created. I create a bridge as follows: ip link add br0 type bridge As soon as I create the bridge and udhcpcd brings it up, I also have avahi which automatically starts sending IPv6 packets to advertise some local services, and because of that, the br0 bridge joins the following IPv6 groups due to the code path detailed below: 33:33:ff:6d:c1:9c vid 0 33:33:00:00:00:6a vid 0 33:33:00:00:00:fb vid 0 br_dev_xmit -> br_multicast_rcv -> br_ip6_multicast_add_group -> __br_multicast_add_group -> br_multicast_host_join -> br_mdb_notify This is all fine, but inside br_mdb_notify we have br_mdb_switchdev_host hooked up, and switchdev will attempt to offload the host joined groups to an empty list of ports. Of course nobody offloads them. Then when we add a port to br0: ip link set swp0 master br0 the bridge doesn't replay the host-joined MDB entries from br_add_if, and eventually the host joined addresses expire, and a switchdev notification for deleting it is emitted, but surprise, the original addition was already completely missed. The strategy to address this problem is to replay the MDB entries (both the port ones and the host joined ones) when the new port joins the bridge, similar to what vxlan_fdb_replay does (in that case, its FDB can be populated and only then attached to a bridge that you offload). However there are 2 possibilities: the addresses can be 'pushed' by the bridge into the port, or the port can 'pull' them from the bridge. 
Considering that in the general case, the new port can be really late to the party, and there may have been many other switchdev ports that already received the initial notification, we would like to avoid delivering duplicate events to them, since they might misbehave. And currently, the bridge calls the entire switchdev notifier chain, whereas for replaying it should just call the notifier block of the new guy. But the bridge doesn't know what is the new guy's notifier block, it just knows where the switchdev notifier chain is. So for simplification, we make this a driver-initiated pull for now, and the notifier block is passed as an argument. To emulate the calling context for mdb objects (deferred and put on the blocking notifier chain), we must iterate under RCU protection through the bridge's mdb entries, queue them, and only call them once we're out of the RCU read-side critical section. There was some opportunity for reuse between br_mdb_switchdev_host_port, br_mdb_notify and the newly added br_mdb_queue_one in how the switchdev mdb object is created, so a helper was created. Suggested-by: Ido Schimmel Signed-off-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 9 +++ include/net/switchdev.h | 1 + net/bridge/br_mdb.c | 148 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 141 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index ebd16495459c..f6472969bb44 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -69,6 +69,8 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto); bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto); bool br_multicast_enabled(const struct net_device *dev); bool br_multicast_router(const struct net_device *dev); +int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, + struct notifier_block *nb, struct netlink_ext_ack *extack); #else static inline int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) @@ -93,6 +95,13 @@ static inline bool br_multicast_router(const struct net_device *dev) { return false; } +static inline int br_mdb_replay(struct net_device *br_dev, + struct net_device *dev, + struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index b7fc7d0f54e2..8c3218177136 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -68,6 +68,7 @@ enum switchdev_obj_id { }; struct switchdev_obj { + struct list_head list; struct net_device *orig_dev; enum switchdev_obj_id id; u32 flags; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 8846c5bcd075..95fa4af0e8dd 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -506,6 +506,134 @@ err: kfree(priv); } +static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, + const struct net_bridge_mdb_entry *mp) +{ + if (mp->addr.proto == htons(ETH_P_IP)) + ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr); +#if 
IS_ENABLED(CONFIG_IPV6) + else if (mp->addr.proto == htons(ETH_P_IPV6)) + ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr); +#endif + else + ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr); + + mdb->vid = mp->addr.vid; +} + +static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, + struct switchdev_obj_port_mdb *mdb, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_port_obj_info obj_info = { + .info = { + .dev = dev, + .extack = extack, + }, + .obj = &mdb->obj, + }; + int err; + + err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info); + return notifier_to_errno(err); +} + +static int br_mdb_queue_one(struct list_head *mdb_list, + enum switchdev_obj_id id, + const struct net_bridge_mdb_entry *mp, + struct net_device *orig_dev) +{ + struct switchdev_obj_port_mdb *mdb; + + mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC); + if (!mdb) + return -ENOMEM; + + mdb->obj.id = id; + mdb->obj.orig_dev = orig_dev; + br_switchdev_mdb_populate(mdb, mp); + list_add_tail(&mdb->obj.list, mdb_list); + + return 0; +} + +int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, + struct notifier_block *nb, struct netlink_ext_ack *extack) +{ + struct net_bridge_mdb_entry *mp; + struct switchdev_obj *obj, *tmp; + struct net_bridge *br; + LIST_HEAD(mdb_list); + int err = 0; + + ASSERT_RTNL(); + + if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) + return -EINVAL; + + br = netdev_priv(br_dev); + + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + return 0; + + /* We cannot walk over br->mdb_list protected just by the rtnl_mutex, + * because the write-side protection is br->multicast_lock. But we + * need to emulate the [ blocking ] calling context of a regular + * switchdev event, so since both br->multicast_lock and RCU read side + * critical sections are atomic, we have no choice but to pick the RCU + * read side lock, queue up all our events, leave the critical section + * and notify switchdev from blocking context. 
+ */ + rcu_read_lock(); + + hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + + if (mp->host_joined) { + err = br_mdb_queue_one(&mdb_list, + SWITCHDEV_OBJ_ID_HOST_MDB, + mp, br_dev); + if (err) { + rcu_read_unlock(); + goto out_free_mdb; + } + } + + for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; + pp = &p->next) { + if (p->key.port->dev != dev) + continue; + + err = br_mdb_queue_one(&mdb_list, + SWITCHDEV_OBJ_ID_PORT_MDB, + mp, dev); + if (err) { + rcu_read_unlock(); + goto out_free_mdb; + } + } + } + + rcu_read_unlock(); + + list_for_each_entry(obj, &mdb_list, list) { + err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), + extack); + if (err) + goto out_free_mdb; + } + +out_free_mdb: + list_for_each_entry_safe(obj, tmp, &mdb_list, list) { + list_del(&obj->list); + kfree(SWITCHDEV_OBJ_PORT_MDB(obj)); + } + + return err; +} +EXPORT_SYMBOL_GPL(br_mdb_replay); + static void br_mdb_switchdev_host_port(struct net_device *dev, struct net_device *lower_dev, struct net_bridge_mdb_entry *mp, @@ -515,18 +643,12 @@ static void br_mdb_switchdev_host_port(struct net_device *dev, .obj = { .id = SWITCHDEV_OBJ_ID_HOST_MDB, .flags = SWITCHDEV_F_DEFER, + .orig_dev = dev, }, - .vid = mp->addr.vid, }; - if (mp->addr.proto == htons(ETH_P_IP)) - ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr); -#if IS_ENABLED(CONFIG_IPV6) - else - ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr); -#endif + br_switchdev_mdb_populate(&mdb, mp); - mdb.obj.orig_dev = dev; switch (type) { case RTM_NEWMDB: switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); @@ -558,21 +680,13 @@ void br_mdb_notify(struct net_device *dev, .id = SWITCHDEV_OBJ_ID_PORT_MDB, .flags = SWITCHDEV_F_DEFER, }, - .vid = mp->addr.vid, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; if (pg) { - if (mp->addr.proto == htons(ETH_P_IP)) - ip_eth_mc_map(mp->addr.dst.ip4, mdb.addr); -#if IS_ENABLED(CONFIG_IPV6) - else if 
(mp->addr.proto == htons(ETH_P_IPV6)) - ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb.addr); -#endif - else - ether_addr_copy(mdb.addr, mp->addr.dst.mac_addr); + br_switchdev_mdb_populate(&mdb, mp); mdb.obj.orig_dev = pg->key.port->dev; switch (type) { -- cgit v1.2.3 From 04846f903b53b32d29453e865646309db29f255a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 23 Mar 2021 01:51:45 +0200 Subject: net: bridge: add helper to replay port and local fdb entries When a switchdev port starts offloading a LAG that is already in a bridge and has an FDB entry pointing to it: ip link set bond0 master br0 bridge fdb add dev bond0 00:01:02:03:04:05 master static ip link set swp0 master bond0 the switchdev driver will have no idea that this FDB entry is there, because it missed the switchdev event emitted at its creation. Ido Schimmel pointed this out during a discussion about challenges with switchdev offloading of stacked interfaces between the physical port and the bridge, and recommended to just catch that condition and deny the CHANGEUPPER event: https://lore.kernel.org/netdev/20210210105949.GB287766@shredder.lan/ But in fact, we might need to deal with the hard thing anyway, which is to replay all FDB addresses relevant to this port, because it isn't just static FDB entries, but also local addresses (ones that are not forwarded but terminated by the bridge). There, we can't just say 'oh yeah, there was an upper already so I'm not joining that'. So, similar to the logic for replaying MDB entries, add a function that must be called by individual switchdev drivers and replays local FDB entries as well as ones pointing towards a bridge port. This time, we use the atomic switchdev notifier block, since that's what FDB entries expect for some reason. Reported-by: Ido Schimmel Signed-off-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 9 +++++++++ net/bridge/br_fdb.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index f6472969bb44..b564c4486a45 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -147,6 +147,8 @@ void br_fdb_clear_offload(const struct net_device *dev, u16 vid); bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag); u8 br_port_get_stp_state(const struct net_device *dev); clock_t br_get_ageing_time(struct net_device *br_dev); +int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, + struct notifier_block *nb); #else static inline struct net_device * br_fdb_find_port(const struct net_device *br_dev, @@ -175,6 +177,13 @@ static inline clock_t br_get_ageing_time(struct net_device *br_dev) { return 0; } + +static inline int br_fdb_replay(struct net_device *br_dev, + struct net_device *dev, + struct notifier_block *nb) +{ + return -EOPNOTSUPP; +} #endif #endif diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index b7490237f3fc..698b79747d32 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -726,6 +726,56 @@ static inline size_t fdb_nlmsg_size(void) + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ } +static int br_fdb_replay_one(struct notifier_block *nb, + struct net_bridge_fdb_entry *fdb, + struct net_device *dev) +{ + struct switchdev_notifier_fdb_info item; + int err; + + item.addr = fdb->key.addr.addr; + item.vid = fdb->key.vlan_id; + item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); + item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); + item.info.dev = dev; + + err = nb->notifier_call(nb, SWITCHDEV_FDB_ADD_TO_DEVICE, &item); + return notifier_to_errno(err); +} + +int br_fdb_replay(struct net_device *br_dev, struct net_device *dev, + struct notifier_block *nb) +{ + struct net_bridge_fdb_entry *fdb; + struct 
net_bridge *br; + int err = 0; + + if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) + return -EINVAL; + + br = netdev_priv(br_dev); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { + struct net_bridge_port *dst = READ_ONCE(fdb->dst); + struct net_device *dst_dev; + + dst_dev = dst ? dst->dev : br->dev; + if (dst_dev != br_dev && dst_dev != dev) + continue; + + err = br_fdb_replay_one(nb, fdb, dst_dev); + if (err) + break; + } + + rcu_read_unlock(); + + return err; +} +EXPORT_SYMBOL_GPL(br_fdb_replay); + static void fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type, bool swdev_notify) -- cgit v1.2.3 From 22f67cdfae6aaa7e841ced17207391fb368c8e9e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 23 Mar 2021 01:51:46 +0200 Subject: net: bridge: add helper to replay VLANs installed on port Currently this simple setup with DSA: ip link add br0 type bridge vlan_filtering 1 ip link add bond0 type bond ip link set bond0 master br0 ip link set swp0 master bond0 will not work because the bridge has created the PVID in br_add_if -> nbp_vlan_init, and it has notified switchdev of the existence of VLAN 1, but that was too early, since swp0 was not yet a lower of bond0, so it had no reason to act upon that notification. We need a helper in the bridge to replay the switchdev VLAN objects that were notified since the bridge port creation, because some of them may have been missed. As opposed to the br_mdb_replay function, the vg->vlan_list write side protection is offered by the rtnl_mutex which is sleepable, so we don't need to queue up the objects in atomic context, we can replay them right away. Signed-off-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 10 +++++++ net/bridge/br_vlan.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index b564c4486a45..2cc35038a8ca 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -111,6 +111,8 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid); int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto); int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); +int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, + struct notifier_block *nb, struct netlink_ext_ack *extack); #else static inline bool br_vlan_enabled(const struct net_device *dev) { @@ -137,6 +139,14 @@ static inline int br_vlan_get_info(const struct net_device *dev, u16 vid, { return -EINVAL; } + +static inline int br_vlan_replay(struct net_device *br_dev, + struct net_device *dev, + struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 8829f621b8ec..ca8daccff217 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1751,6 +1751,79 @@ out_kfree: kfree_skb(skb); } +static int br_vlan_replay_one(struct notifier_block *nb, + struct net_device *dev, + struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_port_obj_info obj_info = { + .info = { + .dev = dev, + .extack = extack, + }, + .obj = &vlan->obj, + }; + int err; + + err = nb->notifier_call(nb, SWITCHDEV_PORT_OBJ_ADD, &obj_info); + return notifier_to_errno(err); +} + +int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, + struct notifier_block *nb, struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_bridge_port *p; + struct 
net_bridge *br; + int err = 0; + u16 pvid; + + ASSERT_RTNL(); + + if (!netif_is_bridge_master(br_dev)) + return -EINVAL; + + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) + return -EINVAL; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + p = NULL; + } else { + p = br_port_get_rtnl(dev); + if (WARN_ON(!p)) + return -EINVAL; + vg = nbp_vlan_group(p); + br = p->br; + } + + if (!vg) + return 0; + + pvid = br_get_pvid(vg); + + list_for_each_entry(v, &vg->vlan_list, vlist) { + struct switchdev_obj_port_vlan vlan = { + .obj.orig_dev = dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = br_vlan_flags(v, pvid), + .vid = v->vid, + }; + + if (!br_vlan_should_use(v)) + continue; + + err = br_vlan_replay_one(nb, dev, &vlan, extack); + if (err) + return err; + } + + return err; +} +EXPORT_SYMBOL_GPL(br_vlan_replay); + /* check if v_curr can enter a range ending in range_end */ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) -- cgit v1.2.3 From e4bd44e89dcf37345e4851c5e775cb5abf38ab62 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 23 Mar 2021 01:51:52 +0200 Subject: net: ocelot: replay switchdev events when joining bridge The premise of this change is that the switchdev port attributes and objects offloaded by ocelot might have been missed when we are joining an already existing bridge port, such as a bonding interface. The patch pulls these switchdev attributes and objects from the bridge, on behalf of the 'bridge port' net device which might be either the ocelot switch interface, or the bonding upper interface. The ocelot_net.c belongs strictly to the switchdev ocelot driver, while ocelot.c is part of a library shared with the DSA felix driver. The ocelot_port_bridge_leave function (part of the common library) used to call ocelot_port_vlan_filtering(false), something which is not necessary for DSA, since the framework deals with that already there.
So we move this function to ocelot_switchdev_unsync, which is specific to the switchdev driver. The code movement described above makes ocelot_port_bridge_leave no longer return an error code, so we change its type from int to void. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 4 +- drivers/net/ethernet/mscc/Kconfig | 3 +- drivers/net/ethernet/mscc/ocelot.c | 18 ++--- drivers/net/ethernet/mscc/ocelot_net.c | 117 ++++++++++++++++++++++++++++----- include/soc/mscc/ocelot.h | 6 +- 5 files changed, 113 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 628afb47b579..6b5442be0230 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -719,7 +719,9 @@ static int felix_bridge_join(struct dsa_switch *ds, int port, { struct ocelot *ocelot = ds->priv; - return ocelot_port_bridge_join(ocelot, port, br); + ocelot_port_bridge_join(ocelot, port, br); + + return 0; } static void felix_bridge_leave(struct dsa_switch *ds, int port, diff --git a/drivers/net/ethernet/mscc/Kconfig b/drivers/net/ethernet/mscc/Kconfig index 05cb040c2677..2d3157e4d081 100644 --- a/drivers/net/ethernet/mscc/Kconfig +++ b/drivers/net/ethernet/mscc/Kconfig @@ -11,7 +11,7 @@ config NET_VENDOR_MICROSEMI if NET_VENDOR_MICROSEMI -# Users should depend on NET_SWITCHDEV, HAS_IOMEM +# Users should depend on NET_SWITCHDEV, HAS_IOMEM, BRIDGE config MSCC_OCELOT_SWITCH_LIB select NET_DEVLINK select REGMAP_MMIO @@ -24,6 +24,7 @@ config MSCC_OCELOT_SWITCH_LIB config MSCC_OCELOT_SWITCH tristate "Ocelot switch driver" + depends on BRIDGE || BRIDGE=n depends on NET_SWITCHDEV depends on HAS_IOMEM depends on OF_NET diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index ce57929ba3d1..1a36b416fd9b 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1514,34 +1514,28 @@ int 
ocelot_port_mdb_del(struct ocelot *ocelot, int port, } EXPORT_SYMBOL(ocelot_port_mdb_del); -int ocelot_port_bridge_join(struct ocelot *ocelot, int port, - struct net_device *bridge) +void ocelot_port_bridge_join(struct ocelot *ocelot, int port, + struct net_device *bridge) { struct ocelot_port *ocelot_port = ocelot->ports[port]; ocelot_port->bridge = bridge; - return 0; + ocelot_apply_bridge_fwd_mask(ocelot); } EXPORT_SYMBOL(ocelot_port_bridge_join); -int ocelot_port_bridge_leave(struct ocelot *ocelot, int port, - struct net_device *bridge) +void ocelot_port_bridge_leave(struct ocelot *ocelot, int port, + struct net_device *bridge) { struct ocelot_port *ocelot_port = ocelot->ports[port]; struct ocelot_vlan pvid = {0}, native_vlan = {0}; - int ret; ocelot_port->bridge = NULL; - ret = ocelot_port_vlan_filtering(ocelot, port, false); - if (ret) - return ret; - ocelot_port_set_pvid(ocelot, port, pvid); ocelot_port_set_native_vlan(ocelot, port, native_vlan); - - return 0; + ocelot_apply_bridge_fwd_mask(ocelot); } EXPORT_SYMBOL(ocelot_port_bridge_leave); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index d1376f7b34fd..36f32a4d9b0f 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1117,47 +1117,126 @@ static int ocelot_port_obj_del(struct net_device *dev, return ret; } +static void ocelot_inherit_brport_flags(struct ocelot *ocelot, int port, + struct net_device *brport_dev) +{ + struct switchdev_brport_flags flags = {0}; + int flag; + + flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; + + for_each_set_bit(flag, &flags.mask, 32) + if (br_port_flag_is_set(brport_dev, BIT(flag))) + flags.val |= BIT(flag); + + ocelot_port_bridge_flags(ocelot, port, flags); +} + +static void ocelot_clear_brport_flags(struct ocelot *ocelot, int port) +{ + struct switchdev_brport_flags flags; + + flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; + 
flags.val = flags.mask & ~BR_LEARNING; + + ocelot_port_bridge_flags(ocelot, port, flags); +} + +static int ocelot_switchdev_sync(struct ocelot *ocelot, int port, + struct net_device *brport_dev, + struct net_device *bridge_dev, + struct netlink_ext_ack *extack) +{ + clock_t ageing_time; + u8 stp_state; + int err; + + ocelot_inherit_brport_flags(ocelot, port, brport_dev); + + stp_state = br_port_get_stp_state(brport_dev); + ocelot_bridge_stp_state_set(ocelot, port, stp_state); + + err = ocelot_port_vlan_filtering(ocelot, port, + br_vlan_enabled(bridge_dev)); + if (err) + return err; + + ageing_time = br_get_ageing_time(bridge_dev); + ocelot_port_attr_ageing_set(ocelot, port, ageing_time); + + err = br_mdb_replay(bridge_dev, brport_dev, + &ocelot_switchdev_blocking_nb, extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_fdb_replay(bridge_dev, brport_dev, &ocelot_switchdev_nb); + if (err) + return err; + + err = br_vlan_replay(bridge_dev, brport_dev, + &ocelot_switchdev_blocking_nb, extack); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +static int ocelot_switchdev_unsync(struct ocelot *ocelot, int port) +{ + int err; + + err = ocelot_port_vlan_filtering(ocelot, port, false); + if (err) + return err; + + ocelot_clear_brport_flags(ocelot, port); + + ocelot_bridge_stp_state_set(ocelot, port, BR_STATE_FORWARDING); + + return 0; +} + static int ocelot_netdevice_bridge_join(struct net_device *dev, + struct net_device *brport_dev, struct net_device *bridge, struct netlink_ext_ack *extack) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot_port *ocelot_port = &priv->port; struct ocelot *ocelot = ocelot_port->ocelot; - struct switchdev_brport_flags flags; int port = priv->chip_port; int err; - flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; - flags.val = flags.mask; + ocelot_port_bridge_join(ocelot, port, bridge); - err = ocelot_port_bridge_join(ocelot, port, bridge); + err = 
ocelot_switchdev_sync(ocelot, port, brport_dev, bridge, extack); if (err) - return err; - - ocelot_port_bridge_flags(ocelot, port, flags); + goto err_switchdev_sync; return 0; + +err_switchdev_sync: + ocelot_port_bridge_leave(ocelot, port, bridge); + return err; } static int ocelot_netdevice_bridge_leave(struct net_device *dev, + struct net_device *brport_dev, struct net_device *bridge) { struct ocelot_port_private *priv = netdev_priv(dev); struct ocelot_port *ocelot_port = &priv->port; struct ocelot *ocelot = ocelot_port->ocelot; - struct switchdev_brport_flags flags; int port = priv->chip_port; int err; - flags.mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; - flags.val = flags.mask & ~BR_LEARNING; + err = ocelot_switchdev_unsync(ocelot, port); + if (err) + return err; - err = ocelot_port_bridge_leave(ocelot, port, bridge); + ocelot_port_bridge_leave(ocelot, port, bridge); - ocelot_port_bridge_flags(ocelot, port, flags); - - return err; + return 0; } static int ocelot_netdevice_lag_join(struct net_device *dev, @@ -1182,7 +1261,7 @@ static int ocelot_netdevice_lag_join(struct net_device *dev, if (!bridge_dev || !netif_is_bridge_master(bridge_dev)) return 0; - err = ocelot_netdevice_bridge_join(dev, bridge_dev, extack); + err = ocelot_netdevice_bridge_join(dev, bond, bridge_dev, extack); if (err) goto err_bridge_join; @@ -1208,7 +1287,7 @@ static int ocelot_netdevice_lag_leave(struct net_device *dev, if (!bridge_dev || !netif_is_bridge_master(bridge_dev)) return 0; - return ocelot_netdevice_bridge_leave(dev, bridge_dev); + return ocelot_netdevice_bridge_leave(dev, bond, bridge_dev); } static int ocelot_netdevice_changeupper(struct net_device *dev, @@ -1221,10 +1300,12 @@ static int ocelot_netdevice_changeupper(struct net_device *dev, if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) - err = ocelot_netdevice_bridge_join(dev, info->upper_dev, + err = ocelot_netdevice_bridge_join(dev, dev, + info->upper_dev, extack); else - err = 
ocelot_netdevice_bridge_leave(dev, info->upper_dev); + err = ocelot_netdevice_bridge_leave(dev, dev, + info->upper_dev); } if (netif_is_lag_master(info->upper_dev)) { if (info->linking) diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index ce7e5c1bd90d..68cdc7ceaf4d 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -803,10 +803,10 @@ int ocelot_port_pre_bridge_flags(struct ocelot *ocelot, int port, struct switchdev_brport_flags val); void ocelot_port_bridge_flags(struct ocelot *ocelot, int port, struct switchdev_brport_flags val); -int ocelot_port_bridge_join(struct ocelot *ocelot, int port, - struct net_device *bridge); -int ocelot_port_bridge_leave(struct ocelot *ocelot, int port, +void ocelot_port_bridge_join(struct ocelot *ocelot, int port, struct net_device *bridge); +void ocelot_port_bridge_leave(struct ocelot *ocelot, int port, + struct net_device *bridge); int ocelot_fdb_dump(struct ocelot *ocelot, int port, dsa_fdb_dump_cb_t *cb, void *data); int ocelot_fdb_add(struct ocelot *ocelot, int port, -- cgit v1.2.3 From 5aa3afe107d9099fc0dea2acf82c3e3c8f0f20e2 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 23 Mar 2021 07:49:23 +0100 Subject: net: make unregister netdev warning timeout configurable netdev_wait_allrefs() issues a warning if refcount does not drop to 0 after 10 seconds. While 10 second wait generally should not happen under normal workload in normal environment, it seems to fire falsely very often during fuzzing and/or in qemu emulation (~10x slower). At least it's not possible to understand if it's really a false positive or not. Automated testing generally bumps all timeouts to very high values to avoid flake failures. Add net.core.netdev_unregister_timeout_secs sysctl to make the timeout configurable for automated testing systems. Lowering the timeout may also be useful for e.g. manual bisection. The default value matches the current behavior. 
Signed-off-by: Dmitry Vyukov Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=211877 Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- Documentation/admin-guide/sysctl/net.rst | 11 +++++++++++ include/linux/netdevice.h | 1 + net/core/dev.c | 6 +++++- net/core/sysctl_net_core.c | 10 ++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index f2ab8a5b6a4b..2090bfc69aa5 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -311,6 +311,17 @@ permit to distribute the load on several cpus. If set to 1 (default), timestamps are sampled as soon as possible, before queueing. +netdev_unregister_timeout_secs +------------------------------ + +Unregister network device timeout in seconds. +This option controls the timeout (in seconds) used to issue a warning while +waiting for a network device refcount to drop to 0 during device +unregistration. A lower value may be useful during bisection to detect +a leaked reference faster. A larger value may be useful to prevent false +warnings on slow/loaded systems. +Default value is 10, minimum 0, maximum 3600. 
+ optmem_max ---------- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7005ad80e8d1..5fa66db0cb5d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4661,6 +4661,7 @@ void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s); extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; +extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; diff --git a/net/core/dev.c b/net/core/dev.c index c9a496f5e687..515309573cb8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10322,6 +10322,8 @@ int netdev_refcnt_read(const struct net_device *dev) } EXPORT_SYMBOL(netdev_refcnt_read); +int netdev_unregister_timeout_secs __read_mostly = 10; + #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** @@ -10383,7 +10385,9 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt != 1 && time_after(jiffies, warning_time + 10 * HZ)) { + if (refcnt && + time_after(jiffies, warning_time + + netdev_unregister_timeout_secs * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. 
Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 4567de519603..d84c8a1b280e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ static int two = 2; static int three = 3; +static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -570,6 +571,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "netdev_unregister_timeout_secs", + .data = &netdev_unregister_timeout_secs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &int_3600, + }, { } }; -- cgit v1.2.3 From ddb94eafab8b597b05904c8277194ea2d6357fa9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:32 +0100 Subject: net: resolve forwarding path from virtual netdevice and HW destination address This patch adds dev_fill_forward_path() which resolves the path to reach the real netdevice from the IP forwarding side. This function takes as input the netdevice and the destination hardware address and it walks down the devices calling .ndo_fill_forward_path() for each device until the real device is found. For instance, assuming the following topology: IP forwarding / \ br0 eth0 / \ eth1 eth2 . . . ethX ab:cd:ef:ab:cd:ef where eth1 and eth2 are bridge ports and eth0 provides WAN connectivity. ethX is the interface in another box which is connected to the eth1 bridge port. For packets going through IP forwarding to br0 whose destination MAC address is ab:cd:ef:ab:cd:ef, dev_fill_forward_path() provides the following path: br0 -> eth1 .ndo_fill_forward_path for br0 looks up at the FDB for the bridge port from the destination MAC address to get the bridge port eth1. 
This information makes it possible to create a fast path that bypasses the classic bridge and IP forwarding paths, so packets go directly from the bridge port eth1 to eth0 (the WAN interface) and vice versa. fast path .------------------------. / \ | IP forwarding | | / \ \/ | br0 eth0 . / \ -> eth1 eth2 . . . ethX ab:cd:ef:ab:cd:ef Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netdevice.h | 27 +++++++++++++++++++++++++++ net/core/dev.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5fa66db0cb5d..03cff88c7292 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -848,6 +848,27 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); +enum net_device_path_type { + DEV_PATH_ETHERNET = 0, +}; + +struct net_device_path { + enum net_device_path_type type; + const struct net_device *dev; +}; + +#define NET_DEVICE_PATH_STACK_MAX 5 + +struct net_device_path_stack { + int num_paths; + struct net_device_path path[NET_DEVICE_PATH_STACK_MAX]; +}; + +struct net_device_path_ctx { + const struct net_device *dev; + const u8 *daddr; +}; + enum tc_setup_type { TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, @@ -1282,6 +1303,8 @@ struct netdev_net_notifier { * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); * If a device is paired with a peer device, return the peer instance. * The caller must be under RCU read context.
+ * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); + * Get the forwarding path to reach the real device from the HW destination address */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1488,6 +1511,8 @@ struct net_device_ops { int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); + int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, + struct net_device_path *path); }; /** @@ -2870,6 +2895,8 @@ void dev_remove_offload(struct packet_offload *po); int dev_get_iflink(const struct net_device *dev); int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack); struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags, unsigned short mask); struct net_device *dev_get_by_name(struct net *net, const char *name); diff --git a/net/core/dev.c b/net/core/dev.c index 515309573cb8..4bb6dcdbed8b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -848,6 +848,52 @@ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) +{ + int k = stack->num_paths++; + + if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + return NULL; + + return &stack->path[k]; +} + +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack) +{ + const struct net_device *last_dev; + struct net_device_path_ctx ctx = { + .dev = dev, + .daddr = daddr, + }; + struct net_device_path *path; + int ret = 0; + + stack->num_paths = 0; + while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { + last_dev = ctx.dev; + path = dev_fwd_path(stack); + if (!path) + return -1; + + memset(path, 0, sizeof(struct 
net_device_path)); + ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); + if (ret < 0) + return -1; + + if (WARN_ON_ONCE(last_dev == ctx.dev)) + return -1; + } + path = dev_fwd_path(stack); + if (!path) + return -1; + path->type = DEV_PATH_ETHERNET; + path->dev = ctx.dev; + + return ret; +} +EXPORT_SYMBOL_GPL(dev_fill_forward_path); + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace -- cgit v1.2.3 From e4417d6950b06fe6c520e937b337daff093220ff Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:33 +0100 Subject: net: 8021q: resolve forwarding path for vlan devices Add .ndo_fill_forward_path for vlan devices. For instance, assuming the following topology: IP forwarding / \ eth0.100 eth0 | eth0 . . . ethX ab:cd:ef:ab:cd:ef For packets going through IP forwarding to eth0.100 whose destination MAC address is ab:cd:ef:ab:cd:ef, dev_fill_forward_path() provides the following path: eth0.100 -> eth0 Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 7 +++++++ net/8021q/vlan_dev.c | 15 +++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03cff88c7292..8823a56744f1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -850,11 +850,18 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, enum net_device_path_type { DEV_PATH_ETHERNET = 0, + DEV_PATH_VLAN, }; struct net_device_path { enum net_device_path_type type; const struct net_device *dev; + union { + struct { + u16 id; + __be16 proto; + } encap; + }; }; #define NET_DEVICE_PATH_STACK_MAX 5 diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index dc1a197792e6..1b1955a63f7f 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -776,6 +776,20 @@ static int vlan_dev_get_iflink(const struct net_device *dev) return real_dev->ifindex; } +static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev); + + path->type = DEV_PATH_VLAN; + path->encap.id = vlan->vlan_id; + path->encap.proto = vlan->vlan_proto; + path->dev = ctx->dev; + ctx->dev = vlan->real_dev; + + return 0; +} + static const struct ethtool_ops vlan_ethtool_ops = { .get_link_ksettings = vlan_ethtool_get_link_ksettings, .get_drvinfo = vlan_ethtool_get_drvinfo, @@ -814,6 +828,7 @@ static const struct net_device_ops vlan_netdev_ops = { #endif .ndo_fix_features = vlan_dev_fix_features, .ndo_get_iflink = vlan_dev_get_iflink, + .ndo_fill_forward_path = vlan_dev_fill_forward_path, }; static void vlan_dev_free(struct net_device *dev) -- cgit v1.2.3 From ec9d16bab615ceda8ac22a7b4d2c7601bbe172cb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:34 +0100 Subject: net: bridge: resolve forwarding path for bridge devices Add .ndo_fill_forward_path for bridge devices. 
Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/bridge/br_device.c | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8823a56744f1..a24270b0d200 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -851,6 +851,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, enum net_device_path_type { DEV_PATH_ETHERNET = 0, DEV_PATH_VLAN, + DEV_PATH_BRIDGE, }; struct net_device_path { diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 3f2f06b4dd27..c241719013f4 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -385,6 +385,32 @@ static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) return br_del_if(br, slave_dev); } +static int br_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct net_bridge_fdb_entry *f; + struct net_bridge_port *dst; + struct net_bridge *br; + + if (netif_is_bridge_port(ctx->dev)) + return -1; + + br = netdev_priv(ctx->dev); + f = br_fdb_find_rcu(br, ctx->daddr, 0); + if (!f || !f->dst) + return -1; + + dst = READ_ONCE(f->dst); + if (!dst) + return -1; + + path->type = DEV_PATH_BRIDGE; + path->dev = dst->br->dev; + ctx->dev = dst->dev; + + return 0; +} + static const struct ethtool_ops br_ethtool_ops = { .get_drvinfo = br_getinfo, .get_link = ethtool_op_get_link, @@ -419,6 +445,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_bridge_setlink = br_setlink, .ndo_bridge_dellink = br_dellink, .ndo_features_check = passthru_features_check, + .ndo_fill_forward_path = br_fill_forward_path, }; static struct device_type br_type = { -- cgit v1.2.3 From bcf2766b1377421b7c9259865b25c1b62a7fa686 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 24 Mar 2021 02:30:35 +0100 Subject: net: bridge: resolve forwarding path for VLAN tag actions in bridge 
devices Depending on the VLAN settings of the bridge and the port, the bridge can either add or remove a tag. When vlan filtering is enabled, the fdb lookup also needs to know the VLAN tag/proto for the destination address. To provide this, keep track of the stack of VLAN tags for the path in the lookup context. Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netdevice.h | 16 ++++++++++++++ net/8021q/vlan_dev.c | 6 ++++++ net/bridge/br_device.c | 23 +++++++++++++++++++- net/bridge/br_private.h | 20 ++++++++++++++++++ net/bridge/br_vlan.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 117 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a24270b0d200..344d9c0c9b22 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -862,10 +862,20 @@ struct net_device_path { u16 id; __be16 proto; } encap; + struct { + enum { + DEV_PATH_BR_VLAN_KEEP, + DEV_PATH_BR_VLAN_TAG, + DEV_PATH_BR_VLAN_UNTAG, + } vlan_mode; + u16 vlan_id; + __be16 vlan_proto; + } bridge; }; }; #define NET_DEVICE_PATH_STACK_MAX 5 +#define NET_DEVICE_PATH_VLAN_MAX 2 struct net_device_path_stack { int num_paths; @@ -875,6 +885,12 @@ struct net_device_path_stack { struct net_device_path_ctx { const struct net_device *dev; const u8 *daddr; + + int num_vlans; + struct { + u16 id; + __be16 proto; + } vlan[NET_DEVICE_PATH_VLAN_MAX]; }; enum tc_setup_type { diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 1b1955a63f7f..4db3f0621959 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -786,6 +786,12 @@ static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx, path->encap.proto = vlan->vlan_proto; path->dev = ctx->dev; ctx->dev = vlan->real_dev; + if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan)) + return -ENOSPC; + + ctx->vlan[ctx->num_vlans].id = vlan->vlan_id; + ctx->vlan[ctx->num_vlans].proto =
vlan->vlan_proto; + ctx->num_vlans++; return 0; } diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index c241719013f4..0c72503e0d39 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -396,7 +396,10 @@ static int br_fill_forward_path(struct net_device_path_ctx *ctx, return -1; br = netdev_priv(ctx->dev); - f = br_fdb_find_rcu(br, ctx->daddr, 0); + + br_vlan_fill_forward_path_pvid(br, ctx, path); + + f = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id); if (!f || !f->dst) return -1; @@ -404,10 +407,28 @@ static int br_fill_forward_path(struct net_device_path_ctx *ctx, if (!dst) return -1; + if (br_vlan_fill_forward_path_mode(br, dst, path)) + return -1; + path->type = DEV_PATH_BRIDGE; path->dev = dst->br->dev; ctx->dev = dst->dev; + switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_TAG: + if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan)) + return -ENOSPC; + ctx->vlan[ctx->num_vlans].id = path->bridge.vlan_id; + ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto; + ctx->num_vlans++; + break; + case DEV_PATH_BR_VLAN_UNTAG: + ctx->num_vlans--; + break; + case DEV_PATH_BR_VLAN_KEEP: + break; + } + return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d7d167e10b70..50747990188e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1118,6 +1118,13 @@ void br_vlan_notify(const struct net_bridge *br, bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end); +void br_vlan_fill_forward_path_pvid(struct net_bridge *br, + struct net_device_path_ctx *ctx, + struct net_device_path *path); +int br_vlan_fill_forward_path_mode(struct net_bridge *br, + struct net_bridge_port *dst, + struct net_device_path *path); + static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) { @@ -1277,6 +1284,19 @@ static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p, return 0; } +static inline void 
br_vlan_fill_forward_path_pvid(struct net_bridge *br, + struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ +} + +static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br, + struct net_bridge_port *dst, + struct net_device_path *path) +{ + return 0; +} + static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 6f961cb5f771..c92240b21c4a 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1339,6 +1339,59 @@ int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) } EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); +void br_vlan_fill_forward_path_pvid(struct net_bridge *br, + struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct net_bridge_vlan_group *vg; + int idx = ctx->num_vlans - 1; + u16 vid; + + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; + + if (!br_opt_get(br, BROPT_VLAN_ENABLED)) + return; + + vg = br_vlan_group(br); + + if (idx >= 0 && + ctx->vlan[idx].proto == br->vlan_proto) { + vid = ctx->vlan[idx].id; + } else { + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG; + vid = br_get_pvid(vg); + } + + path->bridge.vlan_id = vid; + path->bridge.vlan_proto = br->vlan_proto; +} + +int br_vlan_fill_forward_path_mode(struct net_bridge *br, + struct net_bridge_port *dst, + struct net_device_path *path) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + + if (!br_opt_get(br, BROPT_VLAN_ENABLED)) + return 0; + + vg = nbp_vlan_group_rcu(dst); + v = br_vlan_find(vg, path->bridge.vlan_id); + if (!v || !br_vlan_should_use(v)) + return -EINVAL; + + if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) + return 0; + + if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; + else + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; + + return 0; +} + int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { -- cgit 
v1.2.3 From f6efc675c9dd8d93f826b79ae7e33e03301db609 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 24 Mar 2021 02:30:36 +0100 Subject: net: ppp: resolve forwarding path for bridge pppoe devices Pass on the PPPoE session ID, destination hardware address and the real device. Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_generic.c | 22 ++++++++++++++++++++++ drivers/net/ppp/pppoe.c | 23 +++++++++++++++++++++++ include/linux/netdevice.h | 2 ++ include/linux/ppp_channel.h | 3 +++ 4 files changed, 50 insertions(+) (limited to 'include') diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index d445ecb1d0c7..930e49ef15f6 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1560,12 +1560,34 @@ static void ppp_dev_priv_destructor(struct net_device *dev) ppp_destroy_interface(ppp); } +static int ppp_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct ppp *ppp = netdev_priv(ctx->dev); + struct ppp_channel *chan; + struct channel *pch; + + if (ppp->flags & SC_MULTILINK) + return -EOPNOTSUPP; + + if (list_empty(&ppp->channels)) + return -ENODEV; + + pch = list_first_entry(&ppp->channels, struct channel, clist); + chan = pch->chan; + if (!chan->ops->fill_forward_path) + return -EOPNOTSUPP; + + return chan->ops->fill_forward_path(ctx, path, chan); +} + static const struct net_device_ops ppp_netdev_ops = { .ndo_init = ppp_dev_init, .ndo_uninit = ppp_dev_uninit, .ndo_start_xmit = ppp_start_xmit, .ndo_do_ioctl = ppp_net_ioctl, .ndo_get_stats64 = ppp_get_stats64, + .ndo_fill_forward_path = ppp_fill_forward_path, }; static struct device_type ppp_type = { diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 9dc7f4b93d51..3619520340b7 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -972,8 +972,31 @@ static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb) return 
__pppoe_xmit(sk, skb); } +static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path, + const struct ppp_channel *chan) +{ + struct sock *sk = (struct sock *)chan->private; + struct pppox_sock *po = pppox_sk(sk); + struct net_device *dev = po->pppoe_dev; + + if (sock_flag(sk, SOCK_DEAD) || + !(sk->sk_state & PPPOX_CONNECTED) || !dev) + return -1; + + path->type = DEV_PATH_PPPOE; + path->encap.proto = htons(ETH_P_PPP_SES); + path->encap.id = be16_to_cpu(po->num); + memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN); + path->dev = ctx->dev; + ctx->dev = dev; + + return 0; +} + static const struct ppp_channel_ops pppoe_chan_ops = { .start_xmit = pppoe_xmit, + .fill_forward_path = pppoe_fill_forward_path, }; static int pppoe_recvmsg(struct socket *sock, struct msghdr *m, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 344d9c0c9b22..dd54f7cc3f12 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -852,6 +852,7 @@ enum net_device_path_type { DEV_PATH_ETHERNET = 0, DEV_PATH_VLAN, DEV_PATH_BRIDGE, + DEV_PATH_PPPOE, }; struct net_device_path { @@ -861,6 +862,7 @@ struct net_device_path { struct { u16 id; __be16 proto; + u8 h_dest[ETH_ALEN]; } encap; struct { enum { diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index 98966064ee68..91f9a928344e 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -28,6 +28,9 @@ struct ppp_channel_ops { int (*start_xmit)(struct ppp_channel *, struct sk_buff *); /* Handle an ioctl call that has come in via /dev/ppp. 
*/ int (*ioctl)(struct ppp_channel *, unsigned int, unsigned long); + int (*fill_forward_path)(struct net_device_path_ctx *, + struct net_device_path *, + const struct ppp_channel *); }; struct ppp_channel { -- cgit v1.2.3 From 0994d492a1b78dff96671ccf6ad8294cc2bd909e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 24 Mar 2021 02:30:37 +0100 Subject: net: dsa: resolve forwarding path for dsa slave ports Add .ndo_fill_forward_path for dsa slave port devices Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ net/dsa/slave.c | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd54f7cc3f12..90db74132090 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -853,6 +853,7 @@ enum net_device_path_type { DEV_PATH_VLAN, DEV_PATH_BRIDGE, DEV_PATH_PPPOE, + DEV_PATH_DSA, }; struct net_device_path { @@ -873,6 +874,10 @@ struct net_device_path { u16 vlan_id; __be16 vlan_proto; } bridge; + struct { + int port; + u16 proto; + } dsa; }; }; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index c51e52418a62..7453ceca2c7e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1654,6 +1654,21 @@ static void dsa_slave_get_stats64(struct net_device *dev, dev_get_tstats64(dev, s); } +static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct dsa_port *dp = dsa_slave_to_port(ctx->dev); + struct dsa_port *cpu_dp = dp->cpu_dp; + + path->dev = ctx->dev; + path->type = DEV_PATH_DSA; + path->dsa.proto = cpu_dp->tag_ops->proto; + path->dsa.port = dp->index; + ctx->dev = cpu_dp->master; + + return 0; +} + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -1679,6 +1694,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_vlan_rx_kill_vid = 
dsa_slave_vlan_rx_kill_vid, .ndo_get_devlink_port = dsa_slave_get_devlink_port, .ndo_change_mtu = dsa_slave_change_mtu, + .ndo_fill_forward_path = dsa_slave_fill_forward_path, }; static struct device_type dsa_type = { -- cgit v1.2.3 From 5139c0c007250c01c61337d584db4072c4786bf6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:38 +0100 Subject: netfilter: flowtable: add xmit path types Add the xmit_type field that defines the two supported xmit paths in the flowtable data plane, which are the neighbour and the xfrm xmit paths. This patch prepares for new flowtable xmit path types to come. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 11 +++++++++-- net/netfilter/nf_flow_table_core.c | 1 + net/netfilter/nf_flow_table_ip.c | 14 ++++++++------ net/netfilter/nft_flow_offload.c | 20 ++++++++++++++++++-- 4 files changed, 36 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index fb165697c8a1..828fcfbd5e6f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -89,6 +89,11 @@ enum flow_offload_tuple_dir { }; #define FLOW_OFFLOAD_DIR_MAX IP_CT_DIR_MAX +enum flow_offload_xmit_type { + FLOW_OFFLOAD_XMIT_NEIGH = 0, + FLOW_OFFLOAD_XMIT_XFRM, +}; + struct flow_offload_tuple { union { struct in_addr src_v4; @@ -111,7 +116,8 @@ struct flow_offload_tuple { /* All members above are keys for lookups, see flow_offload_hash(). 
*/ struct { } __hash; - u8 dir; + u8 dir:6, + xmit_type:2; u16 mtu; @@ -158,7 +164,8 @@ static inline __s32 nf_flow_timeout_delta(unsigned int timeout) struct nf_flow_route { struct { - struct dst_entry *dst; + struct dst_entry *dst; + enum flow_offload_xmit_type xmit_type; } tuple[FLOW_OFFLOAD_DIR_MAX]; }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 8ffd3f3c288c..573be4d1efb5 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -95,6 +95,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, } flow_tuple->iifidx = other_dst->dev->ifindex; + flow_tuple->xmit_type = route->tuple[dir].xmit_type; flow_tuple->dst_cache = dst; return 0; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 3be58b6d60af..e9bef38a356b 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -235,8 +235,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; - outdev = rt->dst.dev; if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; @@ -265,13 +263,16 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); - if (unlikely(dst_xfrm(&rt->dst))) { + rt = (struct rtable *)tuplehash->tuple.dst_cache; + + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } + outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); skb_dst_set_noref(skb, &rt->dst); @@ -456,8 +457,6 @@ 
nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache; - outdev = rt->dst.dev; if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) return NF_ACCEPT; @@ -485,13 +484,16 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); - if (unlikely(dst_xfrm(&rt->dst))) { + rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + + if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; IP6CB(skb)->flags = IP6SKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } + outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); skb_dst_set_noref(skb, &rt->dst); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 3a6c84fb2c90..1da2bb24f6c0 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -19,6 +19,22 @@ struct nft_flow_offload { struct nft_flowtable *flowtable; }; +static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) +{ + if (dst_xfrm(dst)) + return FLOW_OFFLOAD_XMIT_XFRM; + + return FLOW_OFFLOAD_XMIT_NEIGH; +} + +static void nft_default_forward_path(struct nf_flow_route *route, + struct dst_entry *dst_cache, + enum ip_conntrack_dir dir) +{ + route->tuple[dir].dst = dst_cache; + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); +} + static int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, struct nf_flow_route *route, @@ -44,8 +60,8 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, if (!other_dst) return -ENOENT; - route->tuple[dir].dst = this_dst; - route->tuple[!dir].dst = other_dst; + 
nft_default_forward_path(route, this_dst, dir); + nft_default_forward_path(route, other_dst, !dir); return 0; } -- cgit v1.2.3 From c63a7cc4d795c004b70cb935e8ba77d9e764f0ba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:39 +0100 Subject: netfilter: flowtable: use dev_fill_forward_path() to obtain ingress device Obtain the ingress device in the tuple from the route in the reply direction. Use dev_fill_forward_path() instead to get the real ingress device for this flow. Fall back to use the ingress device that the IP forwarding route provides if: - dev_fill_forward_path() finds no real ingress device. - the ingress device that is obtained is not part of the flowtable devices. - this route has a xfrm policy. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 3 + net/netfilter/nf_flow_table_core.c | 3 +- net/netfilter/nft_flow_offload.c | 102 +++++++++++++++++++++++++++++++++- 3 files changed, 103 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 828fcfbd5e6f..dca9fc66405f 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -165,6 +165,9 @@ static inline __s32 nf_flow_timeout_delta(unsigned int timeout) struct nf_flow_route { struct { struct dst_entry *dst; + struct { + u32 ifindex; + } in; enum flow_offload_xmit_type xmit_type; } tuple[FLOW_OFFLOAD_DIR_MAX]; }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 573be4d1efb5..51e3e1b08e1c 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -79,7 +79,6 @@ static int flow_offload_fill_route(struct flow_offload *flow, enum flow_offload_tuple_dir dir) { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; - struct dst_entry *other_dst = route->tuple[!dir].dst; struct dst_entry *dst = 
route->tuple[dir].dst; if (!dst_hold_safe(route->tuple[dir].dst)) @@ -94,7 +93,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, break; } - flow_tuple->iifidx = other_dst->dev->ifindex; + flow_tuple->iifidx = route->tuple[dir].in.ifindex; flow_tuple->xmit_type = route->tuple[dir].xmit_type; flow_tuple->dst_cache = dst; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 1da2bb24f6c0..15f90c31feb0 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -31,14 +31,104 @@ static void nft_default_forward_path(struct nf_flow_route *route, struct dst_entry *dst_cache, enum ip_conntrack_dir dir) { + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; route->tuple[dir].dst = dst_cache; route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); } +static int nft_dev_fill_forward_path(const struct nf_flow_route *route, + const struct dst_entry *dst_cache, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct net_device_path_stack *stack) +{ + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; + struct net_device *dev = dst_cache->dev; + unsigned char ha[ETH_ALEN]; + struct neighbour *n; + u8 nud_state; + + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -1; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + neigh_release(n); + + if (!(nud_state & NUD_VALID)) + return -1; + + return dev_fill_forward_path(dev, ha, stack); +} + +struct nft_forward_info { + const struct net_device *indev; +}; + +static void nft_dev_path_info(const struct net_device_path_stack *stack, + struct nft_forward_info *info) +{ + const struct net_device_path *path; + int i; + + for (i = 0; i < stack->num_paths; i++) { + path = &stack->path[i]; + switch (path->type) { + case DEV_PATH_ETHERNET: + info->indev = path->dev; + break; + case DEV_PATH_VLAN: + case DEV_PATH_BRIDGE: + default: + info->indev = NULL; + break; + } + } +} + 
+static bool nft_flowtable_find_dev(const struct net_device *dev, + struct nft_flowtable *ft) +{ + struct nft_hook *hook; + bool found = false; + + list_for_each_entry_rcu(hook, &ft->hook_list, list) { + if (hook->ops.dev != dev) + continue; + + found = true; + break; + } + + return found; +} + +static void nft_dev_forward_path(struct nf_flow_route *route, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + const struct dst_entry *dst = route->tuple[dir].dst; + struct net_device_path_stack stack; + struct nft_forward_info info = {}; + + if (nft_dev_fill_forward_path(route, dst, ct, dir, &stack) >= 0) + nft_dev_path_info(&stack, &info); + + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) + return; + + route->tuple[!dir].in.ifindex = info.indev->ifindex; +} + static int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, struct nf_flow_route *route, - enum ip_conntrack_dir dir) + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) { struct dst_entry *this_dst = skb_dst(pkt->skb); struct dst_entry *other_dst = NULL; @@ -63,6 +153,12 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, nft_default_forward_path(route, this_dst, dir); nft_default_forward_path(route, other_dst, !dir); + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && + route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { + nft_dev_forward_path(route, ct, dir, ft); + nft_dev_forward_path(route, ct, !dir, ft); + } + return 0; } @@ -90,8 +186,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; + struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; - struct nf_flow_route route; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; @@ -128,7 +224,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto out; 
dir = CTINFO2DIR(ctinfo); - if (nft_flow_route(pkt, ct, &route, dir) < 0) + if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); -- cgit v1.2.3 From 7a27f6ab41356ecba47ec2bec6d635704c169779 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:40 +0100 Subject: netfilter: flowtable: use dev_fill_forward_path() to obtain egress device The egress device in the tuple is obtained from the route. Use dev_fill_forward_path() instead to provide the real egress device for this flow whenever this is available. The new FLOW_OFFLOAD_XMIT_DIRECT type uses dev_queue_xmit() to transmit ethernet frames. Cache the source and destination hardware address to use dev_queue_xmit() to transfer packets. The FLOW_OFFLOAD_XMIT_DIRECT replaces FLOW_OFFLOAD_XMIT_NEIGH if dev_fill_forward_path() finds a direct transmit path. In case of topology updates, if the peer is moved to a different bridge port, the connection will time out; a reconnect will then result in a new entry with the correct path. Snooping fdb updates would allow for cleaning up stale flowtable entries. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S.
Miller --- include/net/netfilter/nf_flow_table.h | 16 ++++++- net/netfilter/nf_flow_table_core.c | 35 +++++++++++--- net/netfilter/nf_flow_table_ip.c | 88 ++++++++++++++++++++++++++--------- net/netfilter/nft_flow_offload.c | 35 +++++++++++--- 4 files changed, 137 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index dca9fc66405f..41c8436bc77e 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -92,6 +92,7 @@ enum flow_offload_tuple_dir { enum flow_offload_xmit_type { FLOW_OFFLOAD_XMIT_NEIGH = 0, FLOW_OFFLOAD_XMIT_XFRM, + FLOW_OFFLOAD_XMIT_DIRECT, }; struct flow_offload_tuple { @@ -120,8 +121,14 @@ struct flow_offload_tuple { xmit_type:2; u16 mtu; - - struct dst_entry *dst_cache; + union { + struct dst_entry *dst_cache; + struct { + u32 ifidx; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + } out; + }; }; struct flow_offload_tuple_rhash { @@ -168,6 +175,11 @@ struct nf_flow_route { struct { u32 ifindex; } in; + struct { + u32 ifindex; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + } out; enum flow_offload_xmit_type xmit_type; } tuple[FLOW_OFFLOAD_DIR_MAX]; }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 51e3e1b08e1c..a92acb3ed019 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -81,9 +81,6 @@ static int flow_offload_fill_route(struct flow_offload *flow, struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; struct dst_entry *dst = route->tuple[dir].dst; - if (!dst_hold_safe(route->tuple[dir].dst)) - return -1; - switch (flow_tuple->l3proto) { case NFPROTO_IPV4: flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); @@ -94,12 +91,36 @@ static int flow_offload_fill_route(struct flow_offload *flow, } flow_tuple->iifidx = route->tuple[dir].in.ifindex; + + switch (route->tuple[dir].xmit_type) { + case 
FLOW_OFFLOAD_XMIT_DIRECT: + memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest, + ETH_ALEN); + memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, + ETH_ALEN); + flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; + break; + case FLOW_OFFLOAD_XMIT_XFRM: + case FLOW_OFFLOAD_XMIT_NEIGH: + if (!dst_hold_safe(route->tuple[dir].dst)) + return -1; + + flow_tuple->dst_cache = dst; + break; + } flow_tuple->xmit_type = route->tuple[dir].xmit_type; - flow_tuple->dst_cache = dst; return 0; } +static void nft_flow_dst_release(struct flow_offload *flow, + enum flow_offload_tuple_dir dir) +{ + if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || + flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) + dst_release(flow->tuplehash[dir].tuple.dst_cache); +} + int flow_offload_route_init(struct flow_offload *flow, const struct nf_flow_route *route) { @@ -118,7 +139,7 @@ int flow_offload_route_init(struct flow_offload *flow, return 0; err_route_reply: - dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); + nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); return err; } @@ -169,8 +190,8 @@ static void flow_offload_fixup_ct(struct nf_conn *ct) static void flow_offload_route_release(struct flow_offload *flow) { - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); - dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); + nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL); + nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY); } void flow_offload_free(struct flow_offload *flow) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index e9bef38a356b..9e84767a7e9b 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -207,6 +207,24 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } +static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, + const struct 
flow_offload_tuple_rhash *tuplehash, + unsigned short type) +{ + struct net_device *outdev; + + outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); + if (!outdev) + return NF_DROP; + + skb->dev = outdev; + dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, + tuplehash->tuple.out.h_source, skb->len); + dev_queue_xmit(skb); + + return NF_STOLEN; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -222,6 +240,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, struct iphdr *iph; __be32 nexthop; u32 hdrsize; + int ret; if (skb->protocol != htons(ETH_P_IP)) return NF_ACCEPT; @@ -244,9 +263,13 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) return NF_ACCEPT; - if (!dst_check(&rt->dst, 0)) { - flow_offload_teardown(flow); - return NF_ACCEPT; + if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || + tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) { + rt = (struct rtable *)tuplehash->tuple.dst_cache; + if (!dst_check(&rt->dst, 0)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } } if (skb_try_make_writable(skb, thoff + hdrsize)) @@ -263,8 +286,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); - rt = (struct rtable *)tuplehash->tuple.dst_cache; - if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; @@ -272,13 +293,23 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, return nf_flow_xmit_xfrm(skb, state, &rt->dst); } - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); - skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + switch (tuplehash->tuple.xmit_type) { + 
case FLOW_OFFLOAD_XMIT_NEIGH: + outdev = rt->dst.dev; + skb->dev = outdev; + nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + skb_dst_set_noref(skb, &rt->dst); + neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); + ret = NF_STOLEN; + break; + case FLOW_OFFLOAD_XMIT_DIRECT: + ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); + if (ret == NF_DROP) + flow_offload_teardown(flow); + break; + } - return NF_STOLEN; + return ret; } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); @@ -444,6 +475,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, struct ipv6hdr *ip6h; struct rt6_info *rt; u32 hdrsize; + int ret; if (skb->protocol != htons(ETH_P_IPV6)) return NF_ACCEPT; @@ -465,9 +497,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, sizeof(*ip6h))) return NF_ACCEPT; - if (!dst_check(&rt->dst, 0)) { - flow_offload_teardown(flow); - return NF_ACCEPT; + if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || + tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) { + rt = (struct rt6_info *)tuplehash->tuple.dst_cache; + if (!dst_check(&rt->dst, 0)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } } if (skb_try_make_writable(skb, sizeof(*ip6h) + hdrsize)) @@ -484,8 +520,6 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; - if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; @@ -493,12 +527,22 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, return nf_flow_xmit_xfrm(skb, state, &rt->dst); } - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); - skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + switch 
(tuplehash->tuple.xmit_type) { + case FLOW_OFFLOAD_XMIT_NEIGH: + outdev = rt->dst.dev; + skb->dev = outdev; + nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + skb_dst_set_noref(skb, &rt->dst); + neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); + ret = NF_STOLEN; + break; + case FLOW_OFFLOAD_XMIT_DIRECT: + ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); + if (ret == NF_DROP) + flow_offload_teardown(flow); + break; + } - return NF_STOLEN; + return ret; } EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 15f90c31feb0..a6595dca1b1f 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -39,12 +39,11 @@ static void nft_default_forward_path(struct nf_flow_route *route, static int nft_dev_fill_forward_path(const struct nf_flow_route *route, const struct dst_entry *dst_cache, const struct nf_conn *ct, - enum ip_conntrack_dir dir, + enum ip_conntrack_dir dir, u8 *ha, struct net_device_path_stack *stack) { const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; struct net_device *dev = dst_cache->dev; - unsigned char ha[ETH_ALEN]; struct neighbour *n; u8 nud_state; @@ -66,27 +65,43 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route, struct nft_forward_info { const struct net_device *indev; + const struct net_device *outdev; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + enum flow_offload_xmit_type xmit_type; }; static void nft_dev_path_info(const struct net_device_path_stack *stack, - struct nft_forward_info *info) + struct nft_forward_info *info, + unsigned char *ha) { const struct net_device_path *path; int i; + memcpy(info->h_dest, ha, ETH_ALEN); + for (i = 0; i < stack->num_paths; i++) { path = &stack->path[i]; switch (path->type) { case DEV_PATH_ETHERNET: info->indev = path->dev; + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); break; - case 
DEV_PATH_VLAN: case DEV_PATH_BRIDGE: + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + break; + case DEV_PATH_VLAN: default: info->indev = NULL; break; } } + if (!info->outdev) + info->outdev = info->indev; } static bool nft_flowtable_find_dev(const struct net_device *dev, @@ -114,14 +129,22 @@ static void nft_dev_forward_path(struct nf_flow_route *route, const struct dst_entry *dst = route->tuple[dir].dst; struct net_device_path_stack stack; struct nft_forward_info info = {}; + unsigned char ha[ETH_ALEN]; - if (nft_dev_fill_forward_path(route, dst, ct, dir, &stack) >= 0) - nft_dev_path_info(&stack, &info); + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) + nft_dev_path_info(&stack, &info, ha); if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) return; route->tuple[!dir].in.ifindex = info.indev->ifindex; + + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); + route->tuple[dir].out.ifindex = info.outdev->ifindex; + route->tuple[dir].xmit_type = info.xmit_type; + } } static int nft_flow_route(const struct nft_pktinfo *pkt, -- cgit v1.2.3 From 4cd91f7c290f64fe430867ddbae10bff34657b6a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:41 +0100 Subject: netfilter: flowtable: add vlan support Add the vlan id and protocol to the flow tuple to uniquely identify flows from the receive path. For the transmit path, dev_hard_header() on the vlan device push the headers. This patch includes support for two vlan headers (QinQ) from the ingress path. Add a generic encap field to the flowtable entry which stores the protocol and the tag id. This allows to reuse these fields in the PPPoE support coming in a later patch. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/net/netfilter/nf_flow_table.h | 17 ++++- net/netfilter/nf_flow_table_core.c | 7 ++ net/netfilter/nf_flow_table_ip.c | 121 +++++++++++++++++++++++++++------- net/netfilter/nft_flow_offload.c | 26 +++++++- 4 files changed, 142 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 41c8436bc77e..e34fd3eb4bb5 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -95,6 +95,8 @@ enum flow_offload_xmit_type { FLOW_OFFLOAD_XMIT_DIRECT, }; +#define NF_FLOW_TABLE_ENCAP_MAX 2 + struct flow_offload_tuple { union { struct in_addr src_v4; @@ -113,13 +115,17 @@ struct flow_offload_tuple { u8 l3proto; u8 l4proto; + struct { + u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; /* All members above are keys for lookups, see flow_offload_hash(). */ struct { } __hash; - u8 dir:6, - xmit_type:2; - + u8 dir:4, + xmit_type:2, + encap_num:2; u16 mtu; union { struct dst_entry *dst_cache; @@ -174,6 +180,11 @@ struct nf_flow_route { struct dst_entry *dst; struct { u32 ifindex; + struct { + u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; + u8 num_encaps; } in; struct { u32 ifindex; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index a92acb3ed019..595f4434b84d 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -80,6 +80,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, { struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; struct dst_entry *dst = route->tuple[dir].dst; + int i, j = 0; switch (flow_tuple->l3proto) { case NFPROTO_IPV4: @@ -91,6 +92,12 @@ static int flow_offload_fill_route(struct flow_offload *flow, } flow_tuple->iifidx = route->tuple[dir].in.ifindex; + for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) { + flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id; + 
flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto; + j++; + } + flow_tuple->encap_num = route->tuple[dir].in.num_encaps; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 9e84767a7e9b..d90636295b0d 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -136,23 +136,44 @@ static bool ip_has_options(unsigned int thoff) return thoff != sizeof(struct iphdr); } +static void nf_flow_tuple_encap(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + int i = 0; + + if (skb_vlan_tag_present(skb)) { + tuple->encap[i].id = skb_vlan_tag_get(skb); + tuple->encap[i].proto = skb->vlan_proto; + i++; + } + if (skb->protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb_mac_header(skb); + + tuple->encap[i].id = ntohs(veth->h_vlan_TCI); + tuple->encap[i].proto = skb->protocol; + } +} + static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple, u32 *hdrsize) + struct flow_offload_tuple *tuple, u32 *hdrsize, + u32 offset) { struct flow_ports *ports; unsigned int thoff; struct iphdr *iph; - if (!pskb_may_pull(skb, sizeof(*iph))) + if (!pskb_may_pull(skb, sizeof(*iph) + offset)) return -1; - iph = ip_hdr(skb); - thoff = iph->ihl * 4; + iph = (struct iphdr *)(skb_network_header(skb) + offset); + thoff = (iph->ihl * 4); if (ip_is_fragment(iph) || unlikely(ip_has_options(thoff))) return -1; + thoff += offset; + switch (iph->protocol) { case IPPROTO_TCP: *hdrsize = sizeof(struct tcphdr); @@ -167,11 +188,10 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, if (iph->ttl <= 1) return -1; - thoff = iph->ihl * 4; if (!pskb_may_pull(skb, thoff + *hdrsize)) return -1; - iph = ip_hdr(skb); + iph = (struct iphdr *)(skb_network_header(skb) + offset); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); 
tuple->src_v4.s_addr = iph->saddr; @@ -181,6 +201,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev, tuple->l3proto = AF_INET; tuple->l4proto = iph->protocol; tuple->iifidx = dev->ifindex; + nf_flow_tuple_encap(skb, tuple); return 0; } @@ -207,6 +228,43 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } +static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, + u32 *offset) +{ + if (skb->protocol == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veth; + + veth = (struct vlan_ethhdr *)skb_mac_header(skb); + if (veth->h_vlan_encapsulated_proto == proto) { + *offset += VLAN_HLEN; + return true; + } + } + + return false; +} + +static void nf_flow_encap_pop(struct sk_buff *skb, + struct flow_offload_tuple_rhash *tuplehash) +{ + struct vlan_hdr *vlan_hdr; + int i; + + for (i = 0; i < tuplehash->tuple.encap_num; i++) { + if (skb_vlan_tag_present(skb)) { + __vlan_hwaccel_clear_tag(skb); + continue; + } + if (skb->protocol == htons(ETH_P_8021Q)) { + vlan_hdr = (struct vlan_hdr *)skb->data; + __skb_pull(skb, VLAN_HLEN); + vlan_set_encap_proto(skb, vlan_hdr); + skb_reset_network_header(skb); + break; + } + } +} + static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, const struct flow_offload_tuple_rhash *tuplehash, unsigned short type) @@ -235,17 +293,18 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, enum flow_offload_tuple_dir dir; struct flow_offload *flow; struct net_device *outdev; + u32 hdrsize, offset = 0; + unsigned int thoff, mtu; struct rtable *rt; - unsigned int thoff; struct iphdr *iph; __be32 nexthop; - u32 hdrsize; int ret; - if (skb->protocol != htons(ETH_P_IP)) + if (skb->protocol != htons(ETH_P_IP) && + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &offset)) return NF_ACCEPT; - if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize) < 0) + if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize, offset) < 0) return NF_ACCEPT; tuplehash = 
flow_offload_lookup(flow_table, &tuple); @@ -255,11 +314,12 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) + mtu = flow->tuplehash[dir].tuple.mtu + offset; + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return NF_ACCEPT; - iph = ip_hdr(skb); - thoff = iph->ihl * 4; + iph = (struct iphdr *)(skb_network_header(skb) + offset); + thoff = (iph->ihl * 4) + offset; if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) return NF_ACCEPT; @@ -277,6 +337,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, flow_offload_refresh(flow_table, flow); + nf_flow_encap_pop(skb, tuplehash); + thoff -= offset; + iph = ip_hdr(skb); nf_flow_nat_ip(flow, skb, thoff, dir, iph); @@ -418,16 +481,18 @@ static void nf_flow_nat_ipv6(const struct flow_offload *flow, } static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, - struct flow_offload_tuple *tuple, u32 *hdrsize) + struct flow_offload_tuple *tuple, u32 *hdrsize, + u32 offset) { struct flow_ports *ports; struct ipv6hdr *ip6h; unsigned int thoff; - if (!pskb_may_pull(skb, sizeof(*ip6h))) + thoff = sizeof(*ip6h) + offset; + if (!pskb_may_pull(skb, thoff)) return -1; - ip6h = ipv6_hdr(skb); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); switch (ip6h->nexthdr) { case IPPROTO_TCP: @@ -443,11 +508,10 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, if (ip6h->hop_limit <= 1) return -1; - thoff = sizeof(*ip6h); if (!pskb_may_pull(skb, thoff + *hdrsize)) return -1; - ip6h = ipv6_hdr(skb); + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_v6 = ip6h->saddr; @@ -457,6 +521,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev, tuple->l3proto = AF_INET6; 
tuple->l4proto = ip6h->nexthdr; tuple->iifidx = dev->ifindex; + nf_flow_tuple_encap(skb, tuple); return 0; } @@ -472,15 +537,17 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct in6_addr *nexthop; struct flow_offload *flow; struct net_device *outdev; + unsigned int thoff, mtu; + u32 hdrsize, offset = 0; struct ipv6hdr *ip6h; struct rt6_info *rt; - u32 hdrsize; int ret; - if (skb->protocol != htons(ETH_P_IPV6)) + if (skb->protocol != htons(ETH_P_IPV6) && + !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &offset)) return NF_ACCEPT; - if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize) < 0) + if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize, offset) < 0) return NF_ACCEPT; tuplehash = flow_offload_lookup(flow_table, &tuple); @@ -490,11 +557,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu))) + mtu = flow->tuplehash[dir].tuple.mtu + offset; + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return NF_ACCEPT; - if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb, - sizeof(*ip6h))) + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + thoff = sizeof(*ip6h) + offset; + if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) return NF_ACCEPT; if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || @@ -506,11 +575,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, } } - if (skb_try_make_writable(skb, sizeof(*ip6h) + hdrsize)) + if (skb_try_make_writable(skb, thoff + hdrsize)) return NF_DROP; flow_offload_refresh(flow_table, flow); + nf_flow_encap_pop(skb, tuplehash); + ip6h = ipv6_hdr(skb); nf_flow_nat_ipv6(flow, skb, dir, ip6h); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index a6595dca1b1f..8392b1a8108b 100644 --- a/net/netfilter/nft_flow_offload.c +++ 
b/net/netfilter/nft_flow_offload.c @@ -66,6 +66,11 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route, struct nft_forward_info { const struct net_device *indev; const struct net_device *outdev; + struct id { + __u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; + u8 num_encaps; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; enum flow_offload_xmit_type xmit_type; @@ -84,9 +89,23 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, path = &stack->path[i]; switch (path->type) { case DEV_PATH_ETHERNET: + case DEV_PATH_VLAN: info->indev = path->dev; if (is_zero_ether_addr(info->h_source)) memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + if (path->type == DEV_PATH_ETHERNET) + break; + + /* DEV_PATH_VLAN */ + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->outdev = path->dev; + info->encap[info->num_encaps].id = path->encap.id; + info->encap[info->num_encaps].proto = path->encap.proto; + info->num_encaps++; break; case DEV_PATH_BRIDGE: if (is_zero_ether_addr(info->h_source)) @@ -94,7 +113,6 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; break; - case DEV_PATH_VLAN: default: info->indev = NULL; break; @@ -130,6 +148,7 @@ static void nft_dev_forward_path(struct nf_flow_route *route, struct net_device_path_stack stack; struct nft_forward_info info = {}; unsigned char ha[ETH_ALEN]; + int i; if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) nft_dev_path_info(&stack, &info, ha); @@ -138,6 +157,11 @@ static void nft_dev_forward_path(struct nf_flow_route *route, return; route->tuple[!dir].in.ifindex = info.indev->ifindex; + for (i = 0; i < info.num_encaps; i++) { + route->tuple[!dir].in.encap[i].id = info.encap[i].id; + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; + } + route->tuple[!dir].in.num_encaps = info.num_encaps; if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) 
{ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); -- cgit v1.2.3 From 73f97025a972cd1506e8b1986264b2fb8833df7c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:47 +0100 Subject: netfilter: nft_flow_offload: use direct xmit if hardware offload is enabled If there is a forward path to reach an ethernet device and hardware offload is enabled, then use the direct xmit path. Moreover, store the real device in the direct xmit path info since software datapath uses dev_hard_header() to push the layer encapsulation headers while hardware offload refers to the real device. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_flow_table.h | 2 ++ net/netfilter/nf_flow_table_core.c | 1 + net/netfilter/nf_flow_table_offload.c | 2 +- net/netfilter/nft_flow_offload.c | 21 +++++++++++++++++++-- 4 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index e34fd3eb4bb5..52afcee6e999 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -131,6 +131,7 @@ struct flow_offload_tuple { struct dst_entry *dst_cache; struct { u32 ifidx; + u32 hw_ifidx; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; @@ -188,6 +189,7 @@ struct nf_flow_route { } in; struct { u32 ifindex; + u32 hw_ifindex; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 595f4434b84d..f728c955b1dc 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -106,6 +106,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; + flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; break; case FLOW_OFFLOAD_XMIT_XFRM: case 
FLOW_OFFLOAD_XMIT_NEIGH: diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 3fbed447132a..e0d079601fcb 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -508,7 +508,7 @@ static void flow_offload_redirect(struct net *net, switch (this_tuple->xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: this_tuple = &flow->tuplehash[dir].tuple; - ifindex = this_tuple->out.ifidx; + ifindex = this_tuple->out.hw_ifidx; break; case FLOW_OFFLOAD_XMIT_NEIGH: other_tuple = &flow->tuplehash[!dir].tuple; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 143d049fd7f1..d25b4b109e25 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -66,6 +66,7 @@ static int nft_dev_fill_forward_path(const struct nf_flow_route *route, struct nft_forward_info { const struct net_device *indev; const struct net_device *outdev; + const struct net_device *hw_outdev; struct id { __u16 id; __be16 proto; @@ -76,9 +77,18 @@ struct nft_forward_info { enum flow_offload_xmit_type xmit_type; }; +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + static void nft_dev_path_info(const struct net_device_path_stack *stack, struct nft_forward_info *info, - unsigned char *ha) + unsigned char *ha, struct nf_flowtable *flowtable) { const struct net_device_path *path; int i; @@ -140,6 +150,12 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, } if (!info->outdev) info->outdev = info->indev; + + info->hw_outdev = info->indev; + + if (nf_flowtable_hw_offload(flowtable) && + nft_is_valid_ether_device(info->indev)) + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; } static bool nft_flowtable_find_dev(const struct net_device *dev, @@ -171,7 +187,7 @@ static 
void nft_dev_forward_path(struct nf_flow_route *route, int i; if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) - nft_dev_path_info(&stack, &info, ha); + nft_dev_path_info(&stack, &info, ha, &ft->data); if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) return; @@ -187,6 +203,7 @@ static void nft_dev_forward_path(struct nf_flow_route *route, memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); route->tuple[dir].out.ifindex = info.outdev->ifindex; + route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; route->tuple[dir].xmit_type = info.xmit_type; } } -- cgit v1.2.3 From 26267bf9bb57d504c785d8659adc8e02b6629c95 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 24 Mar 2021 02:30:48 +0100 Subject: netfilter: flowtable: bridge vlan hardware offload and switchdev The switch might have already added the VLAN tag through PVID hardware offload. Keep this extra VLAN in the flowtable but skip it on egress. Signed-off-by: Felix Fietkau Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 + include/net/netfilter/nf_flow_table.h | 8 +++++--- net/bridge/br_device.c | 1 + net/bridge/br_vlan.c | 2 ++ net/netfilter/nf_flow_table_core.c | 2 ++ net/netfilter/nf_flow_table_offload.c | 6 +++++- net/netfilter/nft_flow_offload.c | 5 +++++ 7 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 90db74132090..02fa1da8cd22 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -870,6 +870,7 @@ struct net_device_path { DEV_PATH_BR_VLAN_KEEP, DEV_PATH_BR_VLAN_TAG, DEV_PATH_BR_VLAN_UNTAG, + DEV_PATH_BR_VLAN_UNTAG_HW, } vlan_mode; u16 vlan_id; __be16 vlan_proto; diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 52afcee6e999..4d991c1e93ef 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -123,9 +123,10 @@ struct flow_offload_tuple { /* All members above are keys for lookups, see flow_offload_hash(). 
*/ struct { } __hash; - u8 dir:4, + u8 dir:2, xmit_type:2, - encap_num:2; + encap_num:2, + in_vlan_ingress:2; u16 mtu; union { struct dst_entry *dst_cache; @@ -185,7 +186,8 @@ struct nf_flow_route { u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; - u8 num_encaps; + u8 num_encaps:2, + ingress_vlans:2; } in; struct { u32 ifindex; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0c72503e0d39..e8b626cc6bfd 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -422,6 +422,7 @@ static int br_fill_forward_path(struct net_device_path_ctx *ctx, ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto; ctx->num_vlans++; break; + case DEV_PATH_BR_VLAN_UNTAG_HW: case DEV_PATH_BR_VLAN_UNTAG: ctx->num_vlans--; break; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index c92240b21c4a..da3256a3eed0 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1386,6 +1386,8 @@ int br_vlan_fill_forward_path_mode(struct net_bridge *br, if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; + else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) + path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW; else path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index f728c955b1dc..8fa7bf9d5f3f 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -95,6 +95,8 @@ static int flow_offload_fill_route(struct flow_offload *flow, for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) { flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id; flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto; + if (route->tuple[dir].in.ingress_vlans & BIT(i)) + flow_tuple->in_vlan_ingress |= BIT(j); j++; } flow_tuple->encap_num = route->tuple[dir].in.num_encaps; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 
e0d079601fcb..9326ba74745e 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -594,8 +594,12 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, other_tuple = &flow->tuplehash[!dir].tuple; for (i = 0; i < other_tuple->encap_num; i++) { - struct flow_action_entry *entry = flow_action_entry_next(flow_rule); + struct flow_action_entry *entry; + if (other_tuple->in_vlan_ingress & BIT(i)) + continue; + + entry = flow_action_entry_next(flow_rule); entry->id = FLOW_ACTION_VLAN_PUSH; entry->vlan.vid = other_tuple->encap[i].id; entry->vlan.proto = other_tuple->encap[i].proto; diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index d25b4b109e25..4843dd2b410c 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -72,6 +72,7 @@ struct nft_forward_info { __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; u8 num_encaps; + u8 ingress_vlans; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; enum flow_offload_xmit_type xmit_type; @@ -130,6 +131,9 @@ static void nft_dev_path_info(const struct net_device_path_stack *stack, memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_UNTAG_HW: + info->ingress_vlans |= BIT(info->num_encaps - 1); + break; case DEV_PATH_BR_VLAN_TAG: info->encap[info->num_encaps].id = path->bridge.vlan_id; info->encap[info->num_encaps].proto = path->bridge.vlan_proto; @@ -198,6 +202,7 @@ static void nft_dev_forward_path(struct nf_flow_route *route, route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; } route->tuple[!dir].in.num_encaps = info.num_encaps; + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); -- cgit v1.2.3 From 563ae557dd4eebb11472a1c264d40bfc08470395 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:49 +0100 
Subject: net: flow_offload: add FLOW_ACTION_PPPOE_PUSH Add an action to represent the PPPoE hardware offload support that includes the session ID. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_offload.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index fde025c57b4f..dc5c1e69cd9f 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -147,6 +147,7 @@ enum flow_action_id { FLOW_ACTION_MPLS_POP, FLOW_ACTION_MPLS_MANGLE, FLOW_ACTION_GATE, + FLOW_ACTION_PPPOE_PUSH, NUM_FLOW_ACTIONS, }; @@ -274,6 +275,9 @@ struct flow_action_entry { u32 num_entries; struct action_gate_entry *entries; } gate; + struct { /* FLOW_ACTION_PPPOE_PUSH */ + u16 sid; + } pppoe; }; struct flow_action_cookie *cookie; /* user defined action cookie */ }; -- cgit v1.2.3 From 341f67e424e572bfc034daa534c6fa667533e6a4 Mon Sep 17 00:00:00 2001 From: Tan Tee Min Date: Tue, 23 Mar 2021 19:07:34 +0800 Subject: net: stmmac: Add hardware supported cross-timestamp Cross timestamping is supported on Integrated Ethernet Controller in Intel SoC such as EHL and TGL with Always Running Timer. The hardware cross-timestamp result is made available to applications through the PTP_SYS_OFFSET_PRECISE ioctl which calls stmmac_getcrosststamp(). Device time is stored in the MAC Auxiliary register. The 64-bit System time (ART timestamp) is stored in registers that are only addressable by using MDIO space. Signed-off-by: Tan Tee Min Co-developed-by: Wong Vee Khee Signed-off-by: Wong Vee Khee Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 2 + drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 108 +++++++++++++++++++++ drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 8 ++ drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c | 2 + drivers/net/ethernet/stmicro/stmmac/hwif.h | 3 + .../net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 11 +++ drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 32 ++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h | 23 +++++ include/linux/stmmac.h | 4 + 9 files changed, 193 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 1c0c60bdf854..95469059dca1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -388,6 +388,8 @@ struct dma_features { unsigned int estsel; unsigned int fpesel; unsigned int tbssel; + /* Numbers of Auxiliary Snapshot Inputs */ + unsigned int aux_snapshot_n; }; /* RX Buffer size must be multiple of 4/8/16 bytes */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 763b549e3c2d..992294d25706 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -8,6 +8,7 @@ #include "dwmac-intel.h" #include "dwmac4.h" #include "stmmac.h" +#include "stmmac_ptp.h" #define INTEL_MGBE_ADHOC_ADDR 0x15 #define INTEL_MGBE_XPCS_ADDR 0x16 @@ -240,6 +241,108 @@ static void intel_mgbe_ptp_clk_freq_config(void *npriv) writel(gpio_value, priv->ioaddr + GMAC_GPIO_STATUS); } +static void get_arttime(struct mii_bus *mii, int intel_adhoc_addr, + u64 *art_time) +{ + u64 ns; + + ns = mdiobus_read(mii, intel_adhoc_addr, PMC_ART_VALUE3); + ns <<= GMAC4_ART_TIME_SHIFT; + ns |= mdiobus_read(mii, intel_adhoc_addr, PMC_ART_VALUE2); + ns <<= GMAC4_ART_TIME_SHIFT; + ns |= mdiobus_read(mii, intel_adhoc_addr, PMC_ART_VALUE1); + ns <<= GMAC4_ART_TIME_SHIFT; + 
ns |= mdiobus_read(mii, intel_adhoc_addr, PMC_ART_VALUE0); + + *art_time = ns; +} + +static int intel_crosststamp(ktime_t *device, + struct system_counterval_t *system, + void *ctx) +{ + struct intel_priv_data *intel_priv; + + struct stmmac_priv *priv = (struct stmmac_priv *)ctx; + void __iomem *ptpaddr = priv->ptpaddr; + void __iomem *ioaddr = priv->hw->pcsr; + unsigned long flags; + u64 art_time = 0; + u64 ptp_time = 0; + u32 num_snapshot; + u32 gpio_value; + u32 acr_value; + int ret; + u32 v; + int i; + + if (!boot_cpu_has(X86_FEATURE_ART)) + return -EOPNOTSUPP; + + intel_priv = priv->plat->bsp_priv; + + /* Enable Internal snapshot trigger */ + acr_value = readl(ptpaddr + PTP_ACR); + acr_value &= ~PTP_ACR_MASK; + switch (priv->plat->int_snapshot_num) { + case AUX_SNAPSHOT0: + acr_value |= PTP_ACR_ATSEN0; + break; + case AUX_SNAPSHOT1: + acr_value |= PTP_ACR_ATSEN1; + break; + case AUX_SNAPSHOT2: + acr_value |= PTP_ACR_ATSEN2; + break; + case AUX_SNAPSHOT3: + acr_value |= PTP_ACR_ATSEN3; + break; + default: + return -EINVAL; + } + writel(acr_value, ptpaddr + PTP_ACR); + + /* Clear FIFO */ + acr_value = readl(ptpaddr + PTP_ACR); + acr_value |= PTP_ACR_ATSFC; + writel(acr_value, ptpaddr + PTP_ACR); + + /* Trigger Internal snapshot signal + * Create a rising edge by just toggle the GPO1 to low + * and back to high. 
+ */ + gpio_value = readl(ioaddr + GMAC_GPIO_STATUS); + gpio_value &= ~GMAC_GPO1; + writel(gpio_value, ioaddr + GMAC_GPIO_STATUS); + gpio_value |= GMAC_GPO1; + writel(gpio_value, ioaddr + GMAC_GPIO_STATUS); + + /* Poll for time sync operation done */ + ret = readl_poll_timeout(priv->ioaddr + GMAC_INT_STATUS, v, + (v & GMAC_INT_TSIE), 100, 10000); + + if (ret == -ETIMEDOUT) { + pr_err("%s: Wait for time sync operation timeout\n", __func__); + return ret; + } + + num_snapshot = (readl(ioaddr + GMAC_TIMESTAMP_STATUS) & + GMAC_TIMESTAMP_ATSNS_MASK) >> + GMAC_TIMESTAMP_ATSNS_SHIFT; + + /* Repeat until the timestamps are from the FIFO last segment */ + for (i = 0; i < num_snapshot; i++) { + spin_lock_irqsave(&priv->ptp_lock, flags); + stmmac_get_ptptime(priv, ptpaddr, &ptp_time); + *device = ns_to_ktime(ptp_time); + spin_unlock_irqrestore(&priv->ptp_lock, flags); + get_arttime(priv->mii, intel_priv->mdio_adhoc_addr, &art_time); + *system = convert_art_to_tsc(art_time); + } + + return 0; +} + static void common_default_data(struct plat_stmmacenet_data *plat) { plat->clk_csr = 2; /* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ @@ -384,6 +487,11 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->mdio_bus_data->phy_mask = 1 << INTEL_MGBE_ADHOC_ADDR; plat->mdio_bus_data->phy_mask |= 1 << INTEL_MGBE_XPCS_ADDR; + plat->int_snapshot_num = AUX_SNAPSHOT1; + + plat->has_crossts = true; + plat->crosststamp = intel_crosststamp; + return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index ef8502d2b6e6..462ca7ed095a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -50,6 +50,7 @@ #define GMAC_L4_ADDR(reg) (0x904 + (reg) * 0x30) #define GMAC_L3_ADDR0(reg) (0x910 + (reg) * 0x30) #define GMAC_L3_ADDR1(reg) (0x914 + (reg) * 0x30) +#define GMAC_TIMESTAMP_STATUS 0x00000b20 /* RX Queues Routing */ #define GMAC_RXQCTRL_AVCPQ_MASK GENMASK(2, 0) @@ -144,6 
+145,7 @@ #define GMAC_INT_PCS_PHYIS BIT(3) #define GMAC_INT_PMT_EN BIT(4) #define GMAC_INT_LPI_EN BIT(5) +#define GMAC_INT_TSIE BIT(12) #define GMAC_PCS_IRQ_DEFAULT (GMAC_INT_RGSMIIS | GMAC_INT_PCS_LINK | \ GMAC_INT_PCS_ANE) @@ -260,6 +262,7 @@ enum power_event { #define GMAC_HW_RXFIFOSIZE GENMASK(4, 0) /* MAC HW features2 bitmap */ +#define GMAC_HW_FEAT_AUXSNAPNUM GENMASK(30, 28) #define GMAC_HW_FEAT_PPSOUTNUM GENMASK(26, 24) #define GMAC_HW_FEAT_TXCHCNT GENMASK(21, 18) #define GMAC_HW_FEAT_RXCHCNT GENMASK(15, 12) @@ -305,6 +308,11 @@ enum power_event { #define GMAC_L4DP0_SHIFT 16 #define GMAC_L4SP0 GENMASK(15, 0) +/* MAC Timestamp Status */ +#define GMAC_TIMESTAMP_AUXTSTRIG BIT(2) +#define GMAC_TIMESTAMP_ATSNS_MASK GENMASK(29, 25) +#define GMAC_TIMESTAMP_ATSNS_SHIFT 25 + /* MTL registers */ #define MTL_OPERATION_MODE 0x00000c00 #define MTL_FRPE BIT(15) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index 8958778d16b7..8954b85eb850 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -412,6 +412,8 @@ static void dwmac4_get_hw_feature(void __iomem *ioaddr, /* IEEE 1588-2002 */ dma_cap->time_stamp = 0; + /* Number of Auxiliary Snapshot Inputs */ + dma_cap->aux_snapshot_n = (hw_cap & GMAC_HW_FEAT_AUXSNAPNUM) >> 28; /* MAC HW feature3 */ hw_cap = readl(ioaddr + GMAC_HW_FEATURE3); diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 692541c7b419..59bf7078a754 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -508,6 +508,7 @@ struct stmmac_hwtimestamp { int (*adjust_systime) (void __iomem *ioaddr, u32 sec, u32 nsec, int add_sub, int gmac4); void (*get_systime) (void __iomem *ioaddr, u64 *systime); + void (*get_ptptime)(void __iomem *ioaddr, u64 *ptp_time); }; #define stmmac_config_hw_tstamping(__priv, __args...) 
\ @@ -522,6 +523,8 @@ struct stmmac_hwtimestamp { stmmac_do_callback(__priv, ptp, adjust_systime, __args) #define stmmac_get_systime(__priv, __args...) \ stmmac_do_void_callback(__priv, ptp, get_systime, __args) +#define stmmac_get_ptptime(__priv, __args...) \ + stmmac_do_void_callback(__priv, ptp, get_ptptime, __args) /* Helpers to manage the descriptors for chain and ring modes */ struct stmmac_mode_ops { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index d291612eeafb..113c51bcc0b5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -153,6 +153,16 @@ static void get_systime(void __iomem *ioaddr, u64 *systime) *systime = ns; } +static void get_ptptime(void __iomem *ptpaddr, u64 *ptp_time) +{ + u64 ns; + + ns = readl(ptpaddr + PTP_ATNR); + ns += readl(ptpaddr + PTP_ATSR) * NSEC_PER_SEC; + + *ptp_time = ns; +} + const struct stmmac_hwtimestamp stmmac_ptp = { .config_hw_tstamping = config_hw_tstamping, .init_systime = init_systime, @@ -160,4 +170,5 @@ const struct stmmac_hwtimestamp stmmac_ptp = { .config_addend = config_addend, .adjust_systime = adjust_systime, .get_systime = get_systime, + .get_ptptime = get_ptptime, }; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index 8b10fd10446f..b164ae22e35f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -9,6 +9,7 @@ *******************************************************************************/ #include "stmmac.h" #include "stmmac_ptp.h" +#include "dwmac4.h" /** * stmmac_adjust_freq @@ -165,6 +166,36 @@ static int stmmac_enable(struct ptp_clock_info *ptp, return ret; } +/** + * stmmac_get_syncdevicetime + * @device: current device time + * @system: system counter value read synchronously with device time + * @ctx: context provided by 
timekeeping code + * Description: Read device and system clock simultaneously and return the + * corrected clock values in ns. + **/ +static int stmmac_get_syncdevicetime(ktime_t *device, + struct system_counterval_t *system, + void *ctx) +{ + struct stmmac_priv *priv = (struct stmmac_priv *)ctx; + + if (priv->plat->crosststamp) + return priv->plat->crosststamp(device, system, ctx); + else + return -EOPNOTSUPP; +} + +static int stmmac_getcrosststamp(struct ptp_clock_info *ptp, + struct system_device_crosststamp *xtstamp) +{ + struct stmmac_priv *priv = + container_of(ptp, struct stmmac_priv, ptp_clock_ops); + + return get_device_system_crosststamp(stmmac_get_syncdevicetime, + priv, NULL, xtstamp); +} + /* structure describing a PTP hardware clock */ static struct ptp_clock_info stmmac_ptp_clock_ops = { .owner = THIS_MODULE, @@ -180,6 +211,7 @@ static struct ptp_clock_info stmmac_ptp_clock_ops = { .gettime64 = stmmac_get_time, .settime64 = stmmac_set_time, .enable = stmmac_enable, + .getcrosststamp = stmmac_getcrosststamp, }; /** diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h index 7abb1d47e7da..f88727ce4d30 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h @@ -23,6 +23,9 @@ #define PTP_STSUR 0x10 /* System Time – Seconds Update Reg */ #define PTP_STNSUR 0x14 /* System Time – Nanoseconds Update Reg */ #define PTP_TAR 0x18 /* Timestamp Addend Reg */ +#define PTP_ACR 0x40 /* Auxiliary Control Reg */ +#define PTP_ATNR 0x48 /* Auxiliary Timestamp - Nanoseconds Reg */ +#define PTP_ATSR 0x4c /* Auxiliary Timestamp - Seconds Reg */ #define PTP_STNSUR_ADDSUB_SHIFT 31 #define PTP_DIGITAL_ROLLOVER_MODE 0x3B9ACA00 /* 10e9-1 ns */ @@ -64,4 +67,24 @@ #define PTP_SSIR_SSINC_MASK 0xff #define GMAC4_PTP_SSIR_SSINC_SHIFT 16 +/* Auxiliary Control defines */ +#define PTP_ACR_ATSFC BIT(0) /* Auxiliary Snapshot FIFO Clear */ +#define PTP_ACR_ATSEN0 BIT(4) 
/* Auxiliary Snapshot 0 Enable */ +#define PTP_ACR_ATSEN1 BIT(5) /* Auxiliary Snapshot 1 Enable */ +#define PTP_ACR_ATSEN2 BIT(6) /* Auxiliary Snapshot 2 Enable */ +#define PTP_ACR_ATSEN3 BIT(7) /* Auxiliary Snapshot 3 Enable */ +#define PTP_ACR_MASK GENMASK(7, 4) /* Aux Snapshot Mask */ +#define PMC_ART_VALUE0 0x01 /* PMC_ART[15:0] timer value */ +#define PMC_ART_VALUE1 0x02 /* PMC_ART[31:16] timer value */ +#define PMC_ART_VALUE2 0x03 /* PMC_ART[47:32] timer value */ +#define PMC_ART_VALUE3 0x04 /* PMC_ART[63:48] timer value */ +#define GMAC4_ART_TIME_SHIFT 16 /* ART TIME 16-bits shift */ + +enum aux_snapshot { + AUX_SNAPSHOT0 = 0x10, + AUX_SNAPSHOT1 = 0x20, + AUX_SNAPSHOT2 = 0x40, + AUX_SNAPSHOT3 = 0x80, +}; + #endif /* __STMMAC_PTP_H__ */ diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 10abc80b601e..5134e802f39a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -186,6 +186,8 @@ struct plat_stmmacenet_data { void (*exit)(struct platform_device *pdev, void *priv); struct mac_device_info *(*setup)(void *priv); int (*clks_config)(void *priv, bool enabled); + int (*crosststamp)(ktime_t *device, struct system_counterval_t *system, + void *ctx); void *bsp_priv; struct clk *stmmac_clk; struct clk *pclk; @@ -206,5 +208,7 @@ struct plat_stmmacenet_data { u8 vlan_fail_q; unsigned int eee_usecs_rate; struct pci_dev *pdev; + bool has_crossts; + int int_snapshot_num; }; #endif -- cgit v1.2.3 From 0ef25ed104ac17fa0586fbb076f24a5e8940b966 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Wed, 24 Mar 2021 00:46:40 +0800 Subject: net: phy: add genphy_c45_loopback Add generic code to enable C45 PHY loopback into the common phy-c45.c file. This will allow C45 PHY drivers to access this by setting .set_loopback. Suggested-by: Heiner Kallweit Signed-off-by: Wong Vee Khee Reviewed-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S.
Miller --- drivers/net/phy/phy-c45.c | 8 ++++++++ include/linux/phy.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 077f2929c45e..91e3acb9e397 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -560,6 +560,14 @@ int gen10g_config_aneg(struct phy_device *phydev) } EXPORT_SYMBOL_GPL(gen10g_config_aneg); +int genphy_c45_loopback(struct phy_device *phydev, bool enable) +{ + return phy_modify_mmd(phydev, MDIO_MMD_PCS, MDIO_CTRL1, + MDIO_PCS_CTRL1_LOOPBACK, + enable ? MDIO_PCS_CTRL1_LOOPBACK : 0); +} +EXPORT_SYMBOL_GPL(genphy_c45_loopback); + struct phy_driver genphy_c45_driver = { .phy_id = 0xffffffff, .phy_id_mask = 0xffffffff, diff --git a/include/linux/phy.h b/include/linux/phy.h index 1a12e4436b5b..8e2cf84b2318 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1532,6 +1532,7 @@ int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); +int genphy_c45_loopback(struct phy_device *phydev, bool enable); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; -- cgit v1.2.3 From 3c85a8b81cc89c712c4f44cb1a4d5547e472fb1f Mon Sep 17 00:00:00 2001 From: Cooper Lees Date: Tue, 23 Mar 2021 20:47:38 -0700 Subject: Add Open Routing Protocol ID to `rtnetlink.h` - The Open Routing (Open/R) network protocol netlink handler uses ID 99 - Will also add to `/etc/iproute2/rt_protos` once this is accepted - For more information: https://github.com/facebook/openr Signed-off-by: From: Cooper Lees Signed-off-by: David S. 
Miller --- include/uapi/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index d35953bc7d53..5888492a5257 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -290,6 +290,7 @@ enum { #define RTPROT_MROUTED 17 /* Multicast daemon */ #define RTPROT_KEEPALIVED 18 /* Keepalived daemon */ #define RTPROT_BABEL 42 /* Babel daemon */ +#define RTPROT_OPENR 99 /* Open Routing (Open/R) Routes */ #define RTPROT_BGP 186 /* BGP Routes */ #define RTPROT_ISIS 187 /* ISIS Routes */ #define RTPROT_OSPF 188 /* OSPF Routes */ -- cgit v1.2.3 From d1c5688087a0904606a77ae4803f13777f8dd7d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Mar 2021 15:01:38 -0700 Subject: tcp_metrics: tcpm_hash_bucket is strictly local After commit 098a697b497e ("tcp_metrics: Use a single hash table for all network namespaces."), tcpm_hash_bucket is local to net/ipv4/tcp_metrics.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 70a2a085dd1a..9e3cb2722b80 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -11,7 +11,6 @@ #include #include -struct tcpm_hash_bucket; struct ctl_table_header; struct ipv4_devconf; struct fib_rules_ops; -- cgit v1.2.3 From 5a5586112b929546e16029261a987c9197bfdfa2 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Wed, 24 Mar 2021 17:07:42 +0800 Subject: net: stmmac: support FPE link partner hand-shaking procedure In order to discover whether remote station supports frame preemption, local station sends verify mPacket and expects response mPacket in return from the remote station. So, we add the functions to send and handle event when verify mPacket and response mPacket are exchanged between the networked stations. 
The mechanism to handle the different FPE states of the local and remote station (link partner) is implemented using a workqueue, which starts a task each time there is some sign of a verify or response mPacket exchange, as checked in the FPE IRQ event. The task retries a couple of times to try to spot the state in which both stations are ready to enter FPE ON. This allows different end points to enable FPE at different times, and the verify-response mPacket exchange can happen asynchronously. Ultimately, the task will only turn FPE ON once the local station has both sent and received a response mPacket. Thanks to Voon Weifeng for implementing the core functions for detecting FPE events and sending mPackets, and for the phylink-related changes. Signed-off-by: Ong Boon Leong Co-developed-by: Voon Weifeng Signed-off-by: Voon Weifeng Co-developed-by: Tan Tee Min Signed-off-by: Tan Tee Min Co-developed-by: Mohammad Athari Bin Ismail Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 7 + drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 8 + drivers/net/ethernet/stmicro/stmmac/dwmac5.c | 49 ++++++ drivers/net/ethernet/stmicro/stmmac/dwmac5.h | 11 ++ drivers/net/ethernet/stmicro/stmmac/hwif.h | 7 + drivers/net/ethernet/stmicro/stmmac/stmmac.h | 7 + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 183 ++++++++++++++++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 39 ++++- include/linux/stmmac.h | 27 ++++ 9 files changed, 331 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 95469059dca1..d065b11b7b10 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -315,6 +315,13 @@ enum dma_irq_status { #define CORE_IRQ_RX_PATH_IN_LPI_MODE (1 << 2) #define CORE_IRQ_RX_PATH_EXIT_LPI_MODE (1 << 3) +/* FPE defines */ +#define FPE_EVENT_UNKNOWN 0 +#define FPE_EVENT_TRSP BIT(0)
+#define FPE_EVENT_TVER BIT(1) +#define FPE_EVENT_RRSP BIT(2) +#define FPE_EVENT_RVER BIT(3) + #define CORE_IRQ_MTL_RX_OVERFLOW BIT(8) /* Physical Coding Sublayer */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 29f765a246a0..95864f014ffa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -53,6 +53,10 @@ static void dwmac4_core_init(struct mac_device_info *hw, if (hw->pcs) value |= GMAC_PCS_IRQ_DEFAULT; + /* Enable FPE interrupt */ + if ((GMAC_HW_FEAT_FPESEL & readl(ioaddr + GMAC_HW_FEATURE3)) >> 26) + value |= GMAC_INT_FPE_EN; + writel(value, ioaddr + GMAC_INT_EN); } @@ -1245,6 +1249,8 @@ const struct stmmac_ops dwmac410_ops = { .config_l4_filter = dwmac4_config_l4_filter, .est_configure = dwmac5_est_configure, .fpe_configure = dwmac5_fpe_configure, + .fpe_send_mpacket = dwmac5_fpe_send_mpacket, + .fpe_irq_status = dwmac5_fpe_irq_status, .add_hw_vlan_rx_fltr = dwmac4_add_hw_vlan_rx_fltr, .del_hw_vlan_rx_fltr = dwmac4_del_hw_vlan_rx_fltr, .restore_hw_vlan_rx_fltr = dwmac4_restore_hw_vlan_rx_fltr, @@ -1294,6 +1300,8 @@ const struct stmmac_ops dwmac510_ops = { .config_l4_filter = dwmac4_config_l4_filter, .est_configure = dwmac5_est_configure, .fpe_configure = dwmac5_fpe_configure, + .fpe_send_mpacket = dwmac5_fpe_send_mpacket, + .fpe_irq_status = dwmac5_fpe_irq_status, .add_hw_vlan_rx_fltr = dwmac4_add_hw_vlan_rx_fltr, .del_hw_vlan_rx_fltr = dwmac4_del_hw_vlan_rx_fltr, .restore_hw_vlan_rx_fltr = dwmac4_restore_hw_vlan_rx_fltr, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c index 0ae85f8adf67..5b010ebfede9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c @@ -707,3 +707,52 @@ void dwmac5_fpe_configure(void __iomem *ioaddr, u32 num_txq, u32 num_rxq, value |= EFPE; writel(value, ioaddr + MAC_FPE_CTRL_STS); } + 
+int dwmac5_fpe_irq_status(void __iomem *ioaddr, struct net_device *dev) +{ + u32 value; + int status; + + status = FPE_EVENT_UNKNOWN; + + value = readl(ioaddr + MAC_FPE_CTRL_STS); + + if (value & TRSP) { + status |= FPE_EVENT_TRSP; + netdev_info(dev, "FPE: Respond mPacket is transmitted\n"); + } + + if (value & TVER) { + status |= FPE_EVENT_TVER; + netdev_info(dev, "FPE: Verify mPacket is transmitted\n"); + } + + if (value & RRSP) { + status |= FPE_EVENT_RRSP; + netdev_info(dev, "FPE: Respond mPacket is received\n"); + } + + if (value & RVER) { + status |= FPE_EVENT_RVER; + netdev_info(dev, "FPE: Verify mPacket is received\n"); + } + + return status; +} + +void dwmac5_fpe_send_mpacket(void __iomem *ioaddr, enum stmmac_mpacket_type type) +{ + u32 value; + + value = readl(ioaddr + MAC_FPE_CTRL_STS); + + if (type == MPACKET_VERIFY) { + value &= ~SRSP; + value |= SVER; + } else { + value &= ~SVER; + value |= SRSP; + } + + writel(value, ioaddr + MAC_FPE_CTRL_STS); +} diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h index 709bbfc9ae61..ff555d8b0cdf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h @@ -12,6 +12,12 @@ #define TMOUTEN BIT(0) #define MAC_FPE_CTRL_STS 0x00000234 +#define TRSP BIT(19) +#define TVER BIT(18) +#define RRSP BIT(17) +#define RVER BIT(16) +#define SRSP BIT(2) +#define SVER BIT(1) #define EFPE BIT(0) #define MAC_PPS_CONTROL 0x00000b70 @@ -128,6 +134,8 @@ #define GMAC_RXQCTRL_VFFQ_SHIFT 17 #define GMAC_RXQCTRL_VFFQE BIT(16) +#define GMAC_INT_FPE_EN BIT(17) + int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp); int dwmac5_safety_feat_irq_status(struct net_device *ndev, void __iomem *ioaddr, unsigned int asp, @@ -145,5 +153,8 @@ void dwmac5_est_irq_status(void __iomem *ioaddr, struct net_device *dev, struct stmmac_extra_stats *x, u32 txqcnt); void dwmac5_fpe_configure(void __iomem *ioaddr, u32 num_txq, u32 num_rxq, bool 
enable); +void dwmac5_fpe_send_mpacket(void __iomem *ioaddr, + enum stmmac_mpacket_type type); +int dwmac5_fpe_irq_status(void __iomem *ioaddr, struct net_device *dev); #endif /* __DWMAC5_H__ */ diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 59bf7078a754..45edac5f60db 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -397,6 +397,9 @@ struct stmmac_ops { struct stmmac_extra_stats *x, u32 txqcnt); void (*fpe_configure)(void __iomem *ioaddr, u32 num_txq, u32 num_rxq, bool enable); + void (*fpe_send_mpacket)(void __iomem *ioaddr, + enum stmmac_mpacket_type type); + int (*fpe_irq_status)(void __iomem *ioaddr, struct net_device *dev); }; #define stmmac_core_init(__priv, __args...) \ @@ -497,6 +500,10 @@ struct stmmac_ops { stmmac_do_void_callback(__priv, mac, est_irq_status, __args) #define stmmac_fpe_configure(__priv, __args...) \ stmmac_do_void_callback(__priv, mac, fpe_configure, __args) +#define stmmac_fpe_send_mpacket(__priv, __args...) \ + stmmac_do_void_callback(__priv, mac, fpe_send_mpacket, __args) +#define stmmac_fpe_irq_status(__priv, __args...) 
\ + stmmac_do_callback(__priv, mac, fpe_irq_status, __args) /* PTP and HW Timer helpers */ struct stmmac_hwtimestamp { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 375c503d2df8..4faad331a4ca 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -234,6 +234,12 @@ struct stmmac_priv { struct workqueue_struct *wq; struct work_struct service_task; + /* Workqueue for handling FPE hand-shaking */ + unsigned long fpe_task_state; + struct workqueue_struct *fpe_wq; + struct work_struct fpe_task; + char wq_name[IFNAMSIZ + 4]; + /* TC Handling */ unsigned int tc_entries_max; unsigned int tc_off_max; @@ -273,6 +279,7 @@ bool stmmac_eee_init(struct stmmac_priv *priv); int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt); int stmmac_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size); int stmmac_bus_clks_config(struct stmmac_priv *priv, bool enabled); +void stmmac_fpe_handshake(struct stmmac_priv *priv, bool enable); #if IS_ENABLED(CONFIG_STMMAC_SELFTESTS) void stmmac_selftest_run(struct net_device *dev, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 8d7015d3a537..170296820af0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -971,6 +971,21 @@ static void stmmac_mac_an_restart(struct phylink_config *config) /* Not Supported */ } +static void stmmac_fpe_link_state_handle(struct stmmac_priv *priv, bool is_up) +{ + struct stmmac_fpe_cfg *fpe_cfg = priv->plat->fpe_cfg; + enum stmmac_fpe_state *lo_state = &fpe_cfg->lo_fpe_state; + enum stmmac_fpe_state *lp_state = &fpe_cfg->lp_fpe_state; + bool *hs_enable = &fpe_cfg->hs_enable; + + if (is_up && *hs_enable) { + stmmac_fpe_send_mpacket(priv, priv->ioaddr, MPACKET_VERIFY); + } else { + *lo_state = FPE_EVENT_UNKNOWN; + *lp_state = 
FPE_EVENT_UNKNOWN; + } +} + static void stmmac_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) { @@ -981,6 +996,8 @@ static void stmmac_mac_link_down(struct phylink_config *config, priv->tx_lpi_enabled = false; stmmac_eee_init(priv); stmmac_set_eee_pls(priv, priv->hw, false); + + stmmac_fpe_link_state_handle(priv, false); } static void stmmac_mac_link_up(struct phylink_config *config, @@ -1079,6 +1096,8 @@ static void stmmac_mac_link_up(struct phylink_config *config, priv->tx_lpi_enabled = priv->eee_enabled; stmmac_set_eee_pls(priv, priv->hw, true); } + + stmmac_fpe_link_state_handle(priv, true); } static const struct phylink_mac_ops stmmac_phylink_mac_ops = { @@ -2793,6 +2812,26 @@ static void stmmac_safety_feat_configuration(struct stmmac_priv *priv) } } +static int stmmac_fpe_start_wq(struct stmmac_priv *priv) +{ + char *name; + + clear_bit(__FPE_TASK_SCHED, &priv->fpe_task_state); + + name = priv->wq_name; + sprintf(name, "%s-fpe", priv->dev->name); + + priv->fpe_wq = create_singlethread_workqueue(name); + if (!priv->fpe_wq) { + netdev_err(priv->dev, "%s: Failed to create workqueue\n", name); + + return -ENOMEM; + } + netdev_info(priv->dev, "FPE workqueue start"); + + return 0; +} + /** * stmmac_hw_setup - setup mac in a usable state. * @dev : pointer to the device structure. @@ -2929,6 +2968,13 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) /* Start the ball rolling... */ stmmac_start_all_dma(priv); + if (priv->dma_cap.fpesel) { + stmmac_fpe_start_wq(priv); + + if (priv->plat->fpe_cfg->enable) + stmmac_fpe_handshake(priv, true); + } + return 0; } @@ -3090,6 +3136,16 @@ init_phy_error: return ret; } +static void stmmac_fpe_stop_wq(struct stmmac_priv *priv) +{ + set_bit(__FPE_REMOVING, &priv->fpe_task_state); + + if (priv->fpe_wq) + destroy_workqueue(priv->fpe_wq); + + netdev_info(priv->dev, "FPE workqueue stop"); +} + /** * stmmac_release - close entry point of the driver * @dev : device pointer. 
@@ -3139,6 +3195,9 @@ static int stmmac_release(struct net_device *dev) pm_runtime_put(priv->device); + if (priv->dma_cap.fpesel) + stmmac_fpe_stop_wq(priv); + return 0; } @@ -4280,6 +4339,48 @@ static int stmmac_set_features(struct net_device *netdev, return 0; } +static void stmmac_fpe_event_status(struct stmmac_priv *priv, int status) +{ + struct stmmac_fpe_cfg *fpe_cfg = priv->plat->fpe_cfg; + enum stmmac_fpe_state *lo_state = &fpe_cfg->lo_fpe_state; + enum stmmac_fpe_state *lp_state = &fpe_cfg->lp_fpe_state; + bool *hs_enable = &fpe_cfg->hs_enable; + + if (status == FPE_EVENT_UNKNOWN || !*hs_enable) + return; + + /* If LP has sent verify mPacket, LP is FPE capable */ + if ((status & FPE_EVENT_RVER) == FPE_EVENT_RVER) { + if (*lp_state < FPE_STATE_CAPABLE) + *lp_state = FPE_STATE_CAPABLE; + + /* If user has requested FPE enable, quickly response */ + if (*hs_enable) + stmmac_fpe_send_mpacket(priv, priv->ioaddr, + MPACKET_RESPONSE); + } + + /* If Local has sent verify mPacket, Local is FPE capable */ + if ((status & FPE_EVENT_TVER) == FPE_EVENT_TVER) { + if (*lo_state < FPE_STATE_CAPABLE) + *lo_state = FPE_STATE_CAPABLE; + } + + /* If LP has sent response mPacket, LP is entering FPE ON */ + if ((status & FPE_EVENT_RRSP) == FPE_EVENT_RRSP) + *lp_state = FPE_STATE_ENTERING_ON; + + /* If Local has sent response mPacket, Local is entering FPE ON */ + if ((status & FPE_EVENT_TRSP) == FPE_EVENT_TRSP) + *lo_state = FPE_STATE_ENTERING_ON; + + if (!test_bit(__FPE_REMOVING, &priv->fpe_task_state) && + !test_and_set_bit(__FPE_TASK_SCHED, &priv->fpe_task_state) && + priv->fpe_wq) { + queue_work(priv->fpe_wq, &priv->fpe_task); + } +} + /** * stmmac_interrupt - main ISR * @irq: interrupt number. 
@@ -4318,6 +4419,13 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id) stmmac_est_irq_status(priv, priv->ioaddr, priv->dev, &priv->xstats, tx_cnt); + if (priv->dma_cap.fpesel) { + int status = stmmac_fpe_irq_status(priv, priv->ioaddr, + priv->dev); + + stmmac_fpe_event_status(priv, status); + } + /* To handle GMAC own interrupts */ if ((priv->plat->has_gmac) || xmac) { int status = stmmac_host_irq_status(priv, priv->hw, &priv->xstats); @@ -5065,6 +5173,68 @@ int stmmac_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size) return ret; } +#define SEND_VERIFY_MPAKCET_FMT "Send Verify mPacket lo_state=%d lp_state=%d\n" +static void stmmac_fpe_lp_task(struct work_struct *work) +{ + struct stmmac_priv *priv = container_of(work, struct stmmac_priv, + fpe_task); + struct stmmac_fpe_cfg *fpe_cfg = priv->plat->fpe_cfg; + enum stmmac_fpe_state *lo_state = &fpe_cfg->lo_fpe_state; + enum stmmac_fpe_state *lp_state = &fpe_cfg->lp_fpe_state; + bool *hs_enable = &fpe_cfg->hs_enable; + bool *enable = &fpe_cfg->enable; + int retries = 20; + + while (retries-- > 0) { + /* Bail out immediately if FPE handshake is OFF */ + if (*lo_state == FPE_STATE_OFF || !*hs_enable) + break; + + if (*lo_state == FPE_STATE_ENTERING_ON && + *lp_state == FPE_STATE_ENTERING_ON) { + stmmac_fpe_configure(priv, priv->ioaddr, + priv->plat->tx_queues_to_use, + priv->plat->rx_queues_to_use, + *enable); + + netdev_info(priv->dev, "configured FPE\n"); + + *lo_state = FPE_STATE_ON; + *lp_state = FPE_STATE_ON; + netdev_info(priv->dev, "!!! 
BOTH FPE stations ON\n"); + break; + } + + if ((*lo_state == FPE_STATE_CAPABLE || + *lo_state == FPE_STATE_ENTERING_ON) && + *lp_state != FPE_STATE_ON) { + netdev_info(priv->dev, SEND_VERIFY_MPAKCET_FMT, + *lo_state, *lp_state); + stmmac_fpe_send_mpacket(priv, priv->ioaddr, + MPACKET_VERIFY); + } + /* Sleep then retry */ + msleep(500); + } + + clear_bit(__FPE_TASK_SCHED, &priv->fpe_task_state); +} + +void stmmac_fpe_handshake(struct stmmac_priv *priv, bool enable) +{ + if (priv->plat->fpe_cfg->hs_enable != enable) { + if (enable) { + stmmac_fpe_send_mpacket(priv, priv->ioaddr, + MPACKET_VERIFY); + } else { + priv->plat->fpe_cfg->lo_fpe_state = FPE_STATE_OFF; + priv->plat->fpe_cfg->lp_fpe_state = FPE_STATE_OFF; + } + + priv->plat->fpe_cfg->hs_enable = enable; + } +} + /** * stmmac_dvr_probe * @device: device pointer @@ -5122,6 +5292,9 @@ int stmmac_dvr_probe(struct device *device, INIT_WORK(&priv->service_task, stmmac_service_task); + /* Initialize Link Partner FPE workqueue */ + INIT_WORK(&priv->fpe_task, stmmac_fpe_lp_task); + /* Override with kernel parameters if supplied XXX CRS XXX * this needs to have multiple instances */ @@ -5435,8 +5608,18 @@ int stmmac_suspend(struct device *dev) if (ret) return ret; } + mutex_unlock(&priv->lock); + if (priv->dma_cap.fpesel) { + /* Disable FPE */ + stmmac_fpe_configure(priv, priv->ioaddr, + priv->plat->tx_queues_to_use, + priv->plat->rx_queues_to_use, false); + + stmmac_fpe_handshake(priv, false); + } + priv->speed = SPEED_UNKNOWN; return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index b80cb2985b39..1d84ee359808 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -297,6 +297,17 @@ static int tc_init(struct stmmac_priv *priv) dev_info(priv->device, "Enabling HW TC (entries=%d, max_off=%d)\n", priv->tc_entries_max, priv->tc_off_max); + + if (!priv->plat->fpe_cfg) { + priv->plat->fpe_cfg 
= devm_kzalloc(priv->device, + sizeof(*priv->plat->fpe_cfg), + GFP_KERNEL); + if (!priv->plat->fpe_cfg) + return -ENOMEM; + } else { + memset(priv->plat->fpe_cfg, 0, sizeof(*priv->plat->fpe_cfg)); + } + return 0; } @@ -829,13 +840,10 @@ static int tc_setup_taprio(struct stmmac_priv *priv, if (fpe && !priv->dma_cap.fpesel) return -EOPNOTSUPP; - ret = stmmac_fpe_configure(priv, priv->ioaddr, - priv->plat->tx_queues_to_use, - priv->plat->rx_queues_to_use, fpe); - if (ret && fpe) { - netdev_err(priv->dev, "failed to enable Frame Preemption\n"); - return ret; - } + /* Actual FPE register configuration will be done after FPE handshake + * is success. + */ + priv->plat->fpe_cfg->enable = fpe; ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est, priv->plat->clk_ptp_rate); @@ -845,12 +853,29 @@ static int tc_setup_taprio(struct stmmac_priv *priv, } netdev_info(priv->dev, "configured EST\n"); + + if (fpe) { + stmmac_fpe_handshake(priv, true); + netdev_info(priv->dev, "start FPE handshake\n"); + } + return 0; disable: priv->plat->est->enable = false; stmmac_est_configure(priv, priv->ioaddr, priv->plat->est, priv->plat->clk_ptp_rate); + + priv->plat->fpe_cfg->enable = false; + stmmac_fpe_configure(priv, priv->ioaddr, + priv->plat->tx_queues_to_use, + priv->plat->rx_queues_to_use, + false); + netdev_info(priv->dev, "disabled FPE\n"); + + stmmac_fpe_handshake(priv, false); + netdev_info(priv->dev, "stop FPE handshake\n"); + return ret; } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 5134e802f39a..febdb43d27e5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -144,6 +144,32 @@ struct stmmac_txq_cfg { int tbs_en; }; +/* FPE link state */ +enum stmmac_fpe_state { + FPE_STATE_OFF = 0, + FPE_STATE_CAPABLE = 1, + FPE_STATE_ENTERING_ON = 2, + FPE_STATE_ON = 3, +}; + +/* FPE link-partner hand-shaking mPacket type */ +enum stmmac_mpacket_type { + MPACKET_VERIFY = 0, + MPACKET_RESPONSE = 1, +}; + +enum stmmac_fpe_task_state_t { + 
__FPE_REMOVING, + __FPE_TASK_SCHED, +}; + +struct stmmac_fpe_cfg { + bool enable; /* FPE enable */ + bool hs_enable; /* FPE handshake enable */ + enum stmmac_fpe_state lp_fpe_state; /* Link Partner FPE state */ + enum stmmac_fpe_state lo_fpe_state; /* Local station FPE state */ +}; + struct plat_stmmacenet_data { int bus_id; int phy_addr; @@ -155,6 +181,7 @@ struct plat_stmmacenet_data { struct device_node *mdio_node; struct stmmac_dma_cfg *dma_cfg; struct stmmac_est *est; + struct stmmac_fpe_cfg *fpe_cfg; int clk_csr; int has_gmac; int enh_desc; -- cgit v1.2.3 From ed3038158e7b58dcf966cb7ec4ae98d152a5b794 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:11:55 -0700 Subject: ethtool: fec: fix typo in kdoc s/porte/the port/ Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index cde753bb2093..1433d6278018 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1378,7 +1378,7 @@ struct ethtool_per_queue_op { /** * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM - * @active_fec: FEC mode which is active on porte + * @active_fec: FEC mode which is active on the port * @fec: Bitmask of supported/configured FEC modes * @rsvd: Reserved for future extensions. i.e FEC bypass feature. * -- cgit v1.2.3 From 408386817a9d32c88c9ac528749e9999d0e3f6a1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:11:56 -0700 Subject: ethtool: fec: remove long structure description Digging through the mailing list archive @autoneg was part of the first version of the RFC, this left over comment was pointed out twice in review but wasn't removed. The sentence is an exact copy-paste from pauseparam. 
Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 1433d6278018..36bf435d232c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1381,10 +1381,6 @@ struct ethtool_per_queue_op { * @active_fec: FEC mode which is active on the port * @fec: Bitmask of supported/configured FEC modes * @rsvd: Reserved for future extensions. i.e FEC bypass feature. - * - * Drivers should reject a non-zero setting of @autoneg when - * autoneogotiation is disabled (or not supported) for the link. - * */ struct ethtool_fecparam { __u32 cmd; -- cgit v1.2.3 From 240e114411e74d2ee8121643e0c67717eb7c6982 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:11:57 -0700 Subject: ethtool: fec: sanitize ethtool_fecparam->reserved struct ethtool_fecparam::reserved is never looked at by the core. Make sure it's actually 0. Unfortunately we can't return an error because old ethtool doesn't zero-initialize the structure for SET. On GET we can be more verbose, there are no in tree (ab)users. Fix up the kdoc on the structure. Remove the mention of FEC bypass. Seems like a niche thing to configure in the first place. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/uapi/linux/ethtool.h | 2 +- net/ethtool/ioctl.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 36bf435d232c..9e2682a67460 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1380,7 +1380,7 @@ struct ethtool_per_queue_op { * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM * @active_fec: FEC mode which is active on the port * @fec: Bitmask of supported/configured FEC modes - * @rsvd: Reserved for future extensions. i.e FEC bypass feature. + * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. */ struct ethtool_fecparam { __u32 cmd; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 0788cc3b3114..be3549023d89 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2568,6 +2568,9 @@ static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) if (rc) return rc; + if (WARN_ON_ONCE(fecparam.reserved)) + fecparam.reserved = 0; + if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) return -EFAULT; return 0; @@ -2583,6 +2586,8 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) return -EFAULT; + fecparam.reserved = 0; + return dev->ethtool_ops->set_fecparam(dev, &fecparam); } -- cgit v1.2.3 From d3b37fc805d9ef697451730ebdfc7e35e6c2ace8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:11:58 -0700 Subject: ethtool: fec: sanitize ethtool_fecparam->active_fec struct ethtool_fecparam::active_fec is a GET-only field, all in-tree drivers correctly ignore it on SET. Clear the field on SET to avoid any confusion. Again, we can't reject non-zero now since ethtool user space does not zero-init the param correctly. Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- include/uapi/linux/ethtool.h | 2 +- net/ethtool/ioctl.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9e2682a67460..517b68c5fcec 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1378,7 +1378,7 @@ struct ethtool_per_queue_op { /** * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM - * @active_fec: FEC mode which is active on the port + * @active_fec: FEC mode which is active on the port, GET only. * @fec: Bitmask of supported/configured FEC modes * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index be3549023d89..237ffe5440ef 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2586,6 +2586,7 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) return -EFAULT; + fecparam.active_fec = 0; fecparam.reserved = 0; return dev->ethtool_ops->set_fecparam(dev, &fecparam); -- cgit v1.2.3 From 6dbf94b264e62641d521975d0eddbeef36bacf3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Mar 2021 18:12:00 -0700 Subject: ethtool: clarify the ethtool FEC interface The definition of the FEC driver interface is quite unclear. Improve the documentation. 
This is based on current driver and user space code, as well as the discussions about the interface: RFC v1 (24 Oct 2016): https://lore.kernel.org/netdev/1477363849-36517-1-git-send-email-vidya@cumulusnetworks.com/ - this version has the autoneg field - no active_fec field - none vs off confusion is already present RFC v2 (10 Feb 2017): https://lore.kernel.org/netdev/1486727004-11316-1-git-send-email-vidya@cumulusnetworks.com/ - autoneg removed - active_fec added v1 (10 Feb 2017): https://lore.kernel.org/netdev/1486751311-42019-1-git-send-email-vidya@cumulusnetworks.com/ - no changes in the code v1 (24 Jun 2017): https://lore.kernel.org/netdev/1498331985-8525-1-git-send-email-roopa@cumulusnetworks.com/ - include in tree user v2 (27 Jul 2017): https://lore.kernel.org/netdev/1501199248-24695-1-git-send-email-roopa@cumulusnetworks.com/ Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 517b68c5fcec..f6ef7d42c7a1 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1376,11 +1376,29 @@ struct ethtool_per_queue_op { }; /** - * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters + * struct ethtool_fecparam - Ethernet Forward Error Correction parameters * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM - * @active_fec: FEC mode which is active on the port, GET only. - * @fec: Bitmask of supported/configured FEC modes + * @active_fec: FEC mode which is active on the port, single bit set, GET only. + * @fec: Bitmask of configured FEC modes. * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. + * + * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. + * FEC settings are configured by link autonegotiation whenever it's enabled. 
+ * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. + * + * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings. + * It is recommended that drivers only accept a single bit set in @fec. + * When multiple bits are set in @fec drivers may pick mode in an implementation + * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other + * FEC modes, because it's unclear whether in this case other modes constrain + * AUTO or are independent choices. + * Drivers must reject SET requests if they support none of the requested modes. + * + * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead + * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM. + * + * See enum ethtool_fec_config_bits for definition of valid bits for both + * @fec and @active_fec. */ struct ethtool_fecparam { __u32 cmd; @@ -1392,11 +1410,16 @@ struct ethtool_fecparam { /** * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration - * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported - * @ETHTOOL_FEC_AUTO: Default/Best FEC mode provided by driver + * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported. Should not + * be used together with other bits. GET only. + * @ETHTOOL_FEC_AUTO: Select default/best FEC mode automatically, usually based + * link mode and SFP parameters read from module's EEPROM. + * This bit does _not_ mean autonegotiation. 
* @ETHTOOL_FEC_OFF: No FEC Mode - * @ETHTOOL_FEC_RS: Reed-Solomon Forward Error Detection mode - * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon Forward Error Detection mode + * @ETHTOOL_FEC_RS: Reed-Solomon FEC Mode + * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon FEC Mode + * @ETHTOOL_FEC_LLRS: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet + * Consortium) */ enum ethtool_fec_config_bits { ETHTOOL_FEC_NONE_BIT, -- cgit v1.2.3 From 5d9034938720a15fa0f62db3e195c0c473c72c1b Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Thu, 18 Mar 2021 21:22:22 +0100 Subject: bpf: Fix typo 'accesible' into 'accessible' Trivial fix. Signed-off-by: Ricardo Ribalda Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210318202223.164873-8-ribalda@chromium.org --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 39dce9d3c3a5..24678d6ecbcf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,7 +56,7 @@ struct bpf_iter_seq_info { u32 seq_priv_size; }; -/* map is generic key/value storage optionally accesible by eBPF programs */ +/* map is generic key/value storage optionally accessible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ int (*map_alloc_check)(union bpf_attr *attr); -- cgit v1.2.3 From 8532f613bc78b6e0e32b486e720848d3f5569287 Mon Sep 17 00:00:00 2001 From: Ong Boon Leong Date: Fri, 26 Mar 2021 01:39:14 +0800 Subject: net: stmmac: introduce MSI Interrupt routines for mac, safety, RX & TX Now we introduce MSI interrupt service routines and hook these routines up if stmmac_open() sees valid irq line being requested:- stmmac_mac_interrupt() :- MAC (dev->irq), WOL (wol_irq), LPI (lpi_irq) stmmac_safety_interrupt() :- Safety Feat Correctible Error (sfty_ce_irq) & Uncorrectible Error (sfty_ue_irq) stmmac_msi_intr_rx() :- For all RX MSI irq (rx_irq) stmmac_msi_intr_tx() :- For all TX MSI irq (tx_irq) Each of 
IRQs will have its unique name so that we can differentiate them easily under /proc/interrupts. Signed-off-by: Ong Boon Leong Signed-off-by: Voon Weifeng Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 15 + drivers/net/ethernet/stmicro/stmmac/stmmac.h | 16 + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 437 +++++++++++++++++++--- include/linux/stmmac.h | 8 + 4 files changed, 431 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 5afb36a5c94c..c54a56b732b3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -259,6 +259,9 @@ struct stmmac_safety_stats { #define DMA_HW_FEAT_ACTPHYIF 0x70000000 /* Active/selected PHY iface */ #define DEFAULT_DMA_PBL 8 +/* MSI defines */ +#define STMMAC_MSI_VEC_MAX 32 + /* PCS status and mask defines */ #define PCS_ANE_IRQ BIT(2) /* PCS Auto-Negotiation */ #define PCS_LINK_IRQ BIT(1) /* PCS Link */ @@ -315,6 +318,18 @@ enum dma_irq_dir { DMA_DIR_RXTX = 0x3, }; +enum request_irq_err { + REQ_IRQ_ERR_ALL, + REQ_IRQ_ERR_TX, + REQ_IRQ_ERR_RX, + REQ_IRQ_ERR_SFTY_UE, + REQ_IRQ_ERR_SFTY_CE, + REQ_IRQ_ERR_LPI, + REQ_IRQ_ERR_WOL, + REQ_IRQ_ERR_MAC, + REQ_IRQ_ERR_NO, +}; + /* EEE and LPI defines */ #define CORE_IRQ_TX_PATH_IN_LPI_MODE (1 << 0) #define CORE_IRQ_TX_PATH_EXIT_LPI_MODE (1 << 1) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 4faad331a4ca..9966f6f10905 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -30,6 +30,10 @@ struct stmmac_resources { int wol_irq; int lpi_irq; int irq; + int sfty_ce_irq; + int sfty_ue_irq; + int rx_irq[MTL_MAX_RX_QUEUES]; + int tx_irq[MTL_MAX_TX_QUEUES]; }; struct stmmac_tx_info { @@ -225,6 +229,18 @@ struct stmmac_priv { void __iomem *mmcaddr; void __iomem *ptpaddr; unsigned 
long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + int sfty_ce_irq; + int sfty_ue_irq; + int rx_irq[MTL_MAX_RX_QUEUES]; + int tx_irq[MTL_MAX_TX_QUEUES]; + /*irq name */ + char int_name_mac[IFNAMSIZ + 9]; + char int_name_wol[IFNAMSIZ + 9]; + char int_name_lpi[IFNAMSIZ + 9]; + char int_name_sfty_ce[IFNAMSIZ + 10]; + char int_name_sfty_ue[IFNAMSIZ + 10]; + char int_name_rx_irq[MTL_MAX_TX_QUEUES][IFNAMSIZ + 14]; + char int_name_tx_irq[MTL_MAX_TX_QUEUES][IFNAMSIZ + 18]; #ifdef CONFIG_DEBUG_FS struct dentry *dbgfs_dir; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index abe990b9b07b..459477db455c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -105,6 +105,11 @@ module_param(chain_mode, int, 0444); MODULE_PARM_DESC(chain_mode, "To use chain instead of ring mode"); static irqreturn_t stmmac_interrupt(int irq, void *dev_id); +/* For MSI interrupts handling */ +static irqreturn_t stmmac_mac_interrupt(int irq, void *dev_id); +static irqreturn_t stmmac_safety_interrupt(int irq, void *dev_id); +static irqreturn_t stmmac_msi_intr_tx(int irq, void *data); +static irqreturn_t stmmac_msi_intr_rx(int irq, void *data); #ifdef CONFIG_DEBUG_FS static const struct net_device_ops stmmac_netdev_ops; @@ -2986,6 +2991,260 @@ static void stmmac_hw_teardown(struct net_device *dev) clk_disable_unprepare(priv->plat->clk_ptp_ref); } +static void stmmac_free_irq(struct net_device *dev, + enum request_irq_err irq_err, int irq_idx) +{ + struct stmmac_priv *priv = netdev_priv(dev); + int j; + + switch (irq_err) { + case REQ_IRQ_ERR_ALL: + irq_idx = priv->plat->tx_queues_to_use; + fallthrough; + case REQ_IRQ_ERR_TX: + for (j = irq_idx - 1; j >= 0; j--) { + if (priv->tx_irq[j] > 0) + free_irq(priv->tx_irq[j], &priv->tx_queue[j]); + } + irq_idx = priv->plat->rx_queues_to_use; + fallthrough; + case REQ_IRQ_ERR_RX: + for (j = irq_idx - 1; j >= 0; j--) { + if 
(priv->rx_irq[j] > 0) + free_irq(priv->rx_irq[j], &priv->rx_queue[j]); + } + + if (priv->sfty_ue_irq > 0 && priv->sfty_ue_irq != dev->irq) + free_irq(priv->sfty_ue_irq, dev); + fallthrough; + case REQ_IRQ_ERR_SFTY_UE: + if (priv->sfty_ce_irq > 0 && priv->sfty_ce_irq != dev->irq) + free_irq(priv->sfty_ce_irq, dev); + fallthrough; + case REQ_IRQ_ERR_SFTY_CE: + if (priv->lpi_irq > 0 && priv->lpi_irq != dev->irq) + free_irq(priv->lpi_irq, dev); + fallthrough; + case REQ_IRQ_ERR_LPI: + if (priv->wol_irq > 0 && priv->wol_irq != dev->irq) + free_irq(priv->wol_irq, dev); + fallthrough; + case REQ_IRQ_ERR_WOL: + free_irq(dev->irq, dev); + fallthrough; + case REQ_IRQ_ERR_MAC: + case REQ_IRQ_ERR_NO: + /* If MAC IRQ request error, no more IRQ to free */ + break; + } +} + +static int stmmac_request_irq_multi_msi(struct net_device *dev) +{ + enum request_irq_err irq_err = REQ_IRQ_ERR_NO; + struct stmmac_priv *priv = netdev_priv(dev); + int irq_idx = 0; + char *int_name; + int ret; + int i; + + /* For common interrupt */ + int_name = priv->int_name_mac; + sprintf(int_name, "%s:%s", dev->name, "mac"); + ret = request_irq(dev->irq, stmmac_mac_interrupt, + 0, int_name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc mac MSI %d (error: %d)\n", + __func__, dev->irq, ret); + irq_err = REQ_IRQ_ERR_MAC; + goto irq_error; + } + + /* Request the Wake IRQ in case of another line + * is used for WoL + */ + if (priv->wol_irq > 0 && priv->wol_irq != dev->irq) { + int_name = priv->int_name_wol; + sprintf(int_name, "%s:%s", dev->name, "wol"); + ret = request_irq(priv->wol_irq, + stmmac_mac_interrupt, + 0, int_name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc wol MSI %d (error: %d)\n", + __func__, priv->wol_irq, ret); + irq_err = REQ_IRQ_ERR_WOL; + goto irq_error; + } + } + + /* Request the LPI IRQ in case of another line + * is used for LPI + */ + if (priv->lpi_irq > 0 && priv->lpi_irq != dev->irq) { + int_name = priv->int_name_lpi; + 
sprintf(int_name, "%s:%s", dev->name, "lpi"); + ret = request_irq(priv->lpi_irq, + stmmac_mac_interrupt, + 0, int_name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc lpi MSI %d (error: %d)\n", + __func__, priv->lpi_irq, ret); + irq_err = REQ_IRQ_ERR_LPI; + goto irq_error; + } + } + + /* Request the Safety Feature Correctible Error line in + * case of another line is used + */ + if (priv->sfty_ce_irq > 0 && priv->sfty_ce_irq != dev->irq) { + int_name = priv->int_name_sfty_ce; + sprintf(int_name, "%s:%s", dev->name, "safety-ce"); + ret = request_irq(priv->sfty_ce_irq, + stmmac_safety_interrupt, + 0, int_name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc sfty ce MSI %d (error: %d)\n", + __func__, priv->sfty_ce_irq, ret); + irq_err = REQ_IRQ_ERR_SFTY_CE; + goto irq_error; + } + } + + /* Request the Safety Feature Uncorrectible Error line in + * case of another line is used + */ + if (priv->sfty_ue_irq > 0 && priv->sfty_ue_irq != dev->irq) { + int_name = priv->int_name_sfty_ue; + sprintf(int_name, "%s:%s", dev->name, "safety-ue"); + ret = request_irq(priv->sfty_ue_irq, + stmmac_safety_interrupt, + 0, int_name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc sfty ue MSI %d (error: %d)\n", + __func__, priv->sfty_ue_irq, ret); + irq_err = REQ_IRQ_ERR_SFTY_UE; + goto irq_error; + } + } + + /* Request Rx MSI irq */ + for (i = 0; i < priv->plat->rx_queues_to_use; i++) { + if (priv->rx_irq[i] == 0) + continue; + + int_name = priv->int_name_rx_irq[i]; + sprintf(int_name, "%s:%s-%d", dev->name, "rx", i); + ret = request_irq(priv->rx_irq[i], + stmmac_msi_intr_rx, + 0, int_name, &priv->rx_queue[i]); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc rx-%d MSI %d (error: %d)\n", + __func__, i, priv->rx_irq[i], ret); + irq_err = REQ_IRQ_ERR_RX; + irq_idx = i; + goto irq_error; + } + } + + /* Request Tx MSI irq */ + for (i = 0; i < priv->plat->tx_queues_to_use; i++) { + if (priv->tx_irq[i] == 0) + 
continue; + + int_name = priv->int_name_tx_irq[i]; + sprintf(int_name, "%s:%s-%d", dev->name, "tx", i); + ret = request_irq(priv->tx_irq[i], + stmmac_msi_intr_tx, + 0, int_name, &priv->tx_queue[i]); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: alloc tx-%d MSI %d (error: %d)\n", + __func__, i, priv->tx_irq[i], ret); + irq_err = REQ_IRQ_ERR_TX; + irq_idx = i; + goto irq_error; + } + } + + return 0; + +irq_error: + stmmac_free_irq(dev, irq_err, irq_idx); + return ret; +} + +static int stmmac_request_irq_single(struct net_device *dev) +{ + enum request_irq_err irq_err = REQ_IRQ_ERR_NO; + struct stmmac_priv *priv = netdev_priv(dev); + int ret; + + ret = request_irq(dev->irq, stmmac_interrupt, + IRQF_SHARED, dev->name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: ERROR: allocating the IRQ %d (error: %d)\n", + __func__, dev->irq, ret); + irq_err = REQ_IRQ_ERR_MAC; + return ret; + } + + /* Request the Wake IRQ in case of another line + * is used for WoL + */ + if (priv->wol_irq > 0 && priv->wol_irq != dev->irq) { + ret = request_irq(priv->wol_irq, stmmac_interrupt, + IRQF_SHARED, dev->name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: ERROR: allocating the WoL IRQ %d (%d)\n", + __func__, priv->wol_irq, ret); + irq_err = REQ_IRQ_ERR_WOL; + return ret; + } + } + + /* Request the IRQ lines */ + if (priv->lpi_irq > 0 && priv->lpi_irq != dev->irq) { + ret = request_irq(priv->lpi_irq, stmmac_interrupt, + IRQF_SHARED, dev->name, dev); + if (unlikely(ret < 0)) { + netdev_err(priv->dev, + "%s: ERROR: allocating the LPI IRQ %d (%d)\n", + __func__, priv->lpi_irq, ret); + irq_err = REQ_IRQ_ERR_LPI; + goto irq_error; + } + } + + return 0; + +irq_error: + stmmac_free_irq(dev, irq_err, 0); + return ret; +} + +static int stmmac_request_irq(struct net_device *dev) +{ + struct stmmac_priv *priv = netdev_priv(dev); + int ret; + + /* Request the IRQ lines */ + if (priv->plat->multi_msi_en) + ret = stmmac_request_irq_multi_msi(dev); + else + 
ret = stmmac_request_irq_single(dev); + + return ret; +} + /** * stmmac_open - open entry point of the driver * @dev : pointer to the device structure. @@ -3077,50 +3336,15 @@ static int stmmac_open(struct net_device *dev) /* We may have called phylink_speed_down before */ phylink_speed_up(priv->phylink); - /* Request the IRQ lines */ - ret = request_irq(dev->irq, stmmac_interrupt, - IRQF_SHARED, dev->name, dev); - if (unlikely(ret < 0)) { - netdev_err(priv->dev, - "%s: ERROR: allocating the IRQ %d (error: %d)\n", - __func__, dev->irq, ret); + ret = stmmac_request_irq(dev); + if (ret) goto irq_error; - } - - /* Request the Wake IRQ in case of another line is used for WoL */ - if (priv->wol_irq != dev->irq) { - ret = request_irq(priv->wol_irq, stmmac_interrupt, - IRQF_SHARED, dev->name, dev); - if (unlikely(ret < 0)) { - netdev_err(priv->dev, - "%s: ERROR: allocating the WoL IRQ %d (%d)\n", - __func__, priv->wol_irq, ret); - goto wolirq_error; - } - } - - /* Request the IRQ lines */ - if (priv->lpi_irq > 0) { - ret = request_irq(priv->lpi_irq, stmmac_interrupt, IRQF_SHARED, - dev->name, dev); - if (unlikely(ret < 0)) { - netdev_err(priv->dev, - "%s: ERROR: allocating the LPI IRQ %d (%d)\n", - __func__, priv->lpi_irq, ret); - goto lpiirq_error; - } - } stmmac_enable_all_queues(priv); netif_tx_start_all_queues(priv->dev); return 0; -lpiirq_error: - if (priv->wol_irq != dev->irq) - free_irq(priv->wol_irq, dev); -wolirq_error: - free_irq(dev->irq, dev); irq_error: phylink_stop(priv->phylink); @@ -3170,11 +3394,7 @@ static int stmmac_release(struct net_device *dev) hrtimer_cancel(&priv->tx_queue[chan].txtimer); /* Free the IRQ lines */ - free_irq(dev->irq, dev); - if (priv->wol_irq != dev->irq) - free_irq(priv->wol_irq, dev); - if (priv->lpi_irq > 0) - free_irq(priv->lpi_irq, dev); + stmmac_free_irq(dev, REQ_IRQ_ERR_ALL, 0); if (priv->eee_enabled) { priv->tx_path_in_lpi_mode = false; @@ -4477,15 +4697,136 @@ static irqreturn_t stmmac_interrupt(int irq, void *dev_id) 
return IRQ_HANDLED; } +static irqreturn_t stmmac_mac_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = (struct net_device *)dev_id; + struct stmmac_priv *priv = netdev_priv(dev); + + if (unlikely(!dev)) { + netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); + return IRQ_NONE; + } + + /* Check if adapter is up */ + if (test_bit(STMMAC_DOWN, &priv->state)) + return IRQ_HANDLED; + + /* To handle Common interrupts */ + stmmac_common_interrupt(priv); + + return IRQ_HANDLED; +} + +static irqreturn_t stmmac_safety_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = (struct net_device *)dev_id; + struct stmmac_priv *priv = netdev_priv(dev); + + if (unlikely(!dev)) { + netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); + return IRQ_NONE; + } + + /* Check if adapter is up */ + if (test_bit(STMMAC_DOWN, &priv->state)) + return IRQ_HANDLED; + + /* Check if a fatal error happened */ + stmmac_safety_feat_interrupt(priv); + + return IRQ_HANDLED; +} + +static irqreturn_t stmmac_msi_intr_tx(int irq, void *data) +{ + struct stmmac_tx_queue *tx_q = (struct stmmac_tx_queue *)data; + int chan = tx_q->queue_index; + struct stmmac_priv *priv; + int status; + + priv = container_of(tx_q, struct stmmac_priv, tx_queue[chan]); + + if (unlikely(!data)) { + netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); + return IRQ_NONE; + } + + /* Check if adapter is up */ + if (test_bit(STMMAC_DOWN, &priv->state)) + return IRQ_HANDLED; + + status = stmmac_napi_check(priv, chan, DMA_DIR_TX); + + if (unlikely(status & tx_hard_error_bump_tc)) { + /* Try to bump up the dma threshold on this failure */ + if (unlikely(priv->xstats.threshold != SF_DMA_MODE) && + tc <= 256) { + tc += 64; + if (priv->plat->force_thresh_dma_mode) + stmmac_set_dma_operation_mode(priv, + tc, + tc, + chan); + else + stmmac_set_dma_operation_mode(priv, + tc, + SF_DMA_MODE, + chan); + priv->xstats.threshold = tc; + } + } else if (unlikely(status == tx_hard_error)) { + 
stmmac_tx_err(priv, chan); + } + + return IRQ_HANDLED; +} + +static irqreturn_t stmmac_msi_intr_rx(int irq, void *data) +{ + struct stmmac_rx_queue *rx_q = (struct stmmac_rx_queue *)data; + int chan = rx_q->queue_index; + struct stmmac_priv *priv; + + priv = container_of(rx_q, struct stmmac_priv, rx_queue[chan]); + + if (unlikely(!data)) { + netdev_err(priv->dev, "%s: invalid dev pointer\n", __func__); + return IRQ_NONE; + } + + /* Check if adapter is up */ + if (test_bit(STMMAC_DOWN, &priv->state)) + return IRQ_HANDLED; + + stmmac_napi_check(priv, chan, DMA_DIR_RX); + + return IRQ_HANDLED; +} + #ifdef CONFIG_NET_POLL_CONTROLLER /* Polling receive - used by NETCONSOLE and other diagnostic tools * to allow network I/O with interrupts disabled. */ static void stmmac_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); - stmmac_interrupt(dev->irq, dev); - enable_irq(dev->irq); + struct stmmac_priv *priv = netdev_priv(dev); + int i; + + /* If adapter is down, do nothing */ + if (test_bit(STMMAC_DOWN, &priv->state)) + return; + + if (priv->plat->multi_msi_en) { + for (i = 0; i < priv->plat->rx_queues_to_use; i++) + stmmac_msi_intr_rx(0, &priv->rx_queue[i]); + + for (i = 0; i < priv->plat->tx_queues_to_use; i++) + stmmac_msi_intr_tx(0, &priv->tx_queue[i]); + } else { + disable_irq(dev->irq); + stmmac_interrupt(dev->irq, dev); + enable_irq(dev->irq); + } } #endif @@ -5283,6 +5624,12 @@ int stmmac_dvr_probe(struct device *device, priv->dev->irq = res->irq; priv->wol_irq = res->wol_irq; priv->lpi_irq = res->lpi_irq; + priv->sfty_ce_irq = res->sfty_ce_irq; + priv->sfty_ue_irq = res->sfty_ue_irq; + for (i = 0; i < MTL_MAX_RX_QUEUES; i++) + priv->rx_irq[i] = res->rx_irq[i]; + for (i = 0; i < MTL_MAX_TX_QUEUES; i++) + priv->tx_irq[i] = res->tx_irq[i]; if (!IS_ERR_OR_NULL(res->mac)) memcpy(priv->dev->dev_addr, res->mac, ETH_ALEN); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index febdb43d27e5..afc12b9385db 100644 --- a/include/linux/stmmac.h +++ 
b/include/linux/stmmac.h @@ -237,5 +237,13 @@ struct plat_stmmacenet_data { struct pci_dev *pdev; bool has_crossts; int int_snapshot_num; + bool multi_msi_en; + int msi_mac_vec; + int msi_wol_vec; + int msi_lpi_vec; + int msi_sfty_ce_vec; + int msi_sfty_ue_vec; + int msi_rx_base_vec; + int msi_tx_base_vec; }; #endif -- cgit v1.2.3 From 6ccf12ae111e49324b439410066e8cc359aeee6d Mon Sep 17 00:00:00 2001 From: "Wong, Vee Khee" Date: Fri, 26 Mar 2021 01:39:16 +0800 Subject: net: stmmac: use interrupt mode INTM=1 for multi-MSI For interrupt mode INTM=0, TX/RX transfer complete will trigger signal not only on sbd_perch_[tx|rx]_intr_o (Transmit/Receive Per Channel) but also on the sbd_intr_o (Common). As for multi-MSI implementation, setting interrupt mode INTM=1 is more efficient as each TX intr and RX intr (TI/RI) will be handled by TX/RX ISR without the need of calling the common MAC ISR. Updated the TX/RX NORMAL interrupts status checking process as the NIS status bit is not asserted for any RI/TI events for INTM=1. Signed-off-by: Wong, Vee Khee Co-developed-by: Voon Weifeng Signed-off-by: Voon Weifeng Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c | 7 +++++++ drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h | 3 +++ drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c | 23 +++++++++++------------ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + include/linux/stmmac.h | 1 + 5 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index 8954b85eb850..cb17f6c35e54 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -161,6 +161,13 @@ static void dwmac4_dma_init(void __iomem *ioaddr, value |= DMA_SYS_BUS_EAME; writel(value, ioaddr + DMA_SYS_BUS_MODE); + + if (dma_cfg->multi_msi_en) { + value = readl(ioaddr + DMA_BUS_MODE); + value &= ~DMA_BUS_MODE_INTM_MASK; + value |= (DMA_BUS_MODE_INTM_MODE1 << DMA_BUS_MODE_INTM_SHIFT); + writel(value, ioaddr + DMA_BUS_MODE); + } } static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h index 5c0c53832adb..05481eb13ba6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h @@ -25,6 +25,9 @@ #define DMA_TBS_CTRL 0x00001050 /* DMA Bus Mode bitmap */ +#define DMA_BUS_MODE_INTM_MASK GENMASK(17, 16) +#define DMA_BUS_MODE_INTM_SHIFT 16 +#define DMA_BUS_MODE_INTM_MODE1 0x1 #define DMA_BUS_MODE_SFT_RESET BIT(0) /* DMA SYS Bus Mode bitmap */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c index 3fa602dabf49..e63270267578 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_lib.c @@ -166,20 +166,19 @@ int dwmac4_dma_interrupt(void __iomem *ioaddr, } } /* TX/RX NORMAL interrupts */ - if (likely(intr_status & 
DMA_CHAN_STATUS_NIS)) { + if (likely(intr_status & DMA_CHAN_STATUS_NIS)) x->normal_irq_n++; - if (likely(intr_status & DMA_CHAN_STATUS_RI)) { - x->rx_normal_irq_n++; - ret |= handle_rx; - } - if (likely(intr_status & (DMA_CHAN_STATUS_TI | - DMA_CHAN_STATUS_TBU))) { - x->tx_normal_irq_n++; - ret |= handle_tx; - } - if (unlikely(intr_status & DMA_CHAN_STATUS_ERI)) - x->rx_early_irq++; + if (likely(intr_status & DMA_CHAN_STATUS_RI)) { + x->rx_normal_irq_n++; + ret |= handle_rx; + } + if (likely(intr_status & (DMA_CHAN_STATUS_TI | + DMA_CHAN_STATUS_TBU))) { + x->tx_normal_irq_n++; + ret |= handle_tx; } + if (unlikely(intr_status & DMA_CHAN_STATUS_ERI)) + x->rx_early_irq++; writel(intr_status & intr_en, ioaddr + DMA_CHAN_STATUS(chan)); return ret; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 459477db455c..f4fa5402cd64 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5620,6 +5620,7 @@ int stmmac_dvr_probe(struct device *device, priv->plat = plat_dat; priv->ioaddr = res->addr; priv->dev->base_addr = (unsigned long)res->addr; + priv->plat->dma_cfg->multi_msi_en = priv->plat->multi_msi_en; priv->dev->irq = res->irq; priv->wol_irq = res->wol_irq; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index afc12b9385db..e338ef7abc00 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -96,6 +96,7 @@ struct stmmac_dma_cfg { int mixed_burst; bool aal; bool eame; + bool multi_msi_en; }; #define AXI_BLEN 7 -- cgit v1.2.3 From cb9444130662c6c13022579c861098f212db2562 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Mar 2021 11:08:13 -0700 Subject: sysctl: add proc_dou8vec_minmax() Networking has many sysctls that could fit in one u8. This patch adds proc_dou8vec_minmax() for this purpose. Note that the .extra1 and .extra2 fields are pointing to integers, because it makes conversions easier. 
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- fs/proc/proc_sysctl.c | 6 +++++ include/linux/sysctl.h | 2 ++ kernel/sysctl.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) (limited to 'include') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 984e42f8cb11..7256b8962e3c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1108,6 +1108,11 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) err |= sysctl_err(path, table, "array not allowed"); } + if (table->proc_handler == proc_dou8vec_minmax) { + if (table->maxlen != sizeof(u8)) + err |= sysctl_err(path, table, "array not allowed"); + } + return err; } @@ -1123,6 +1128,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table) (table->proc_handler == proc_douintvec) || (table->proc_handler == proc_douintvec_minmax) || (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dou8vec_minmax) || (table->proc_handler == proc_dointvec_jiffies) || (table->proc_handler == proc_dointvec_userhz_jiffies) || (table->proc_handler == proc_dointvec_ms_jiffies) || diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 51298a4f4623..d99ca99837de 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -53,6 +53,8 @@ int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +int proc_dou8vec_minmax(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 62fbd09b5dc1..90d2892ef6a3 100644 --- a/kernel/sysctl.c +++ 
b/kernel/sysctl.c @@ -1034,6 +1034,65 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, &param); } +/** + * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(u8) unsigned chars + * values from/to the user buffer, treated as an ASCII string. Negative + * strings are not allowed. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success or an error on write when the range check fails. + */ +int proc_dou8vec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table tmp; + unsigned int min = 0, max = 255U, val; + u8 *data = table->data; + struct do_proc_douintvec_minmax_conv_param param = { + .min = &min, + .max = &max, + }; + int res; + + /* Do not support arrays yet. 
*/ + if (table->maxlen != sizeof(u8)) + return -EINVAL; + + if (table->extra1) { + min = *(unsigned int *) table->extra1; + if (min > 255U) + return -EINVAL; + } + if (table->extra2) { + max = *(unsigned int *) table->extra2; + if (max > 255U) + return -EINVAL; + } + + tmp = *table; + + tmp.maxlen = sizeof(val); + tmp.data = &val; + val = *data; + res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, + do_proc_douintvec_minmax_conv, &param); + if (res) + return res; + if (write) + *data = val; + return 0; +} +EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); + static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) @@ -1582,6 +1641,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_dou8vec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { -- cgit v1.2.3 From 4b6bbf17d4e1939afa72821879fc033d725e9491 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Mar 2021 11:08:14 -0700 Subject: ipv4: shrink netns_ipv4 with sysctl conversions These sysctls that can fit in one byte instead of one int are converted to save space and thus reduce cache line misses. - icmp_echo_ignore_all, icmp_echo_ignore_broadcasts, - icmp_ignore_bogus_error_responses, icmp_errors_use_inbound_ifaddr - tcp_ecn, tcp_ecn_fallback - ip_default_ttl, ip_no_pmtu_disc, ip_fwd_use_pmtu - ip_nonlocal_bind, ip_autobind_reuse - ip_dynaddr, ip_early_demux, raw_l3mdev_accept - nexthop_compat_mode, fwmark_reflect Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 32 +++++++++++------------ net/ipv4/sysctl_net_ipv4.c | 64 +++++++++++++++++++++++----------------------- 2 files changed, 48 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9e3cb2722b80..7b572d468fde 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -83,36 +83,36 @@ struct netns_ipv4 { struct xt_table *nat_table; #endif - int sysctl_icmp_echo_ignore_all; - int sysctl_icmp_echo_ignore_broadcasts; - int sysctl_icmp_ignore_bogus_error_responses; + u8 sysctl_icmp_echo_ignore_all; + u8 sysctl_icmp_echo_ignore_broadcasts; + u8 sysctl_icmp_ignore_bogus_error_responses; + u8 sysctl_icmp_errors_use_inbound_ifaddr; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; - int sysctl_icmp_errors_use_inbound_ifaddr; struct local_ports ip_local_ports; - int sysctl_tcp_ecn; - int sysctl_tcp_ecn_fallback; + u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_fallback; - int sysctl_ip_default_ttl; - int sysctl_ip_no_pmtu_disc; - int sysctl_ip_fwd_use_pmtu; + u8 sysctl_ip_default_ttl; + u8 sysctl_ip_no_pmtu_disc; + u8 sysctl_ip_fwd_use_pmtu; int sysctl_ip_fwd_update_priority; - int sysctl_ip_nonlocal_bind; - int sysctl_ip_autobind_reuse; + u8 sysctl_ip_nonlocal_bind; + u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? 
*/ - int sysctl_ip_dynaddr; - int sysctl_ip_early_demux; + u8 sysctl_ip_dynaddr; + u8 sysctl_ip_early_demux; #ifdef CONFIG_NET_L3_MASTER_DEV - int sysctl_raw_l3mdev_accept; + u8 sysctl_raw_l3mdev_accept; #endif int sysctl_tcp_early_demux; int sysctl_udp_early_demux; - int sysctl_nexthop_compat_mode; + u8 sysctl_nexthop_compat_mode; - int sysctl_fwmark_reflect; + u8 sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_tcp_l3mdev_accept; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index f55095d3ed16..e5ff17526603 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -595,30 +595,30 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "icmp_echo_ignore_all", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_echo_ignore_broadcasts", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_errors_use_inbound_ifaddr", .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "icmp_ratelimit", @@ -645,9 +645,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "raw_l3mdev_accept", .data = &init_net.ipv4.sysctl_raw_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = 
proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -655,30 +655,30 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_ecn", .data = &init_net.ipv4.sysctl_tcp_ecn, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_dynaddr", .data = &init_net.ipv4.sysctl_ip_dynaddr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_early_demux", .data = &init_net.ipv4.sysctl_ip_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "udp_early_demux", @@ -697,18 +697,18 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "nexthop_compat_mode", .data = &init_net.ipv4.sysctl_nexthop_compat_mode, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = &ip_ttl_min, .extra2 = &ip_ttl_max, }, @@ -729,16 +729,16 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_use_pmtu", .data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu, - 
.maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_forward_update_priority", @@ -752,25 +752,25 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "ip_nonlocal_bind", .data = &init_net.ipv4.sysctl_ip_nonlocal_bind, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_autobind_reuse", .data = &init_net.ipv4.sysctl_ip_autobind_reuse, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fwmark_reflect", .data = &init_net.ipv4.sysctl_fwmark_reflect, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fwmark_accept", -- cgit v1.2.3 From 1c69dedc8fa7c9684d48dc89994b4e0aceeae588 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Mar 2021 11:08:15 -0700 Subject: ipv4: convert ip_forward_update_priority sysctl to u8 This sysctl uses ip_fwd_update_priority() helper, so the conversion needs to change it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7b572d468fde..d2c0a6592ff6 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -98,7 +98,7 @@ struct netns_ipv4 { u8 sysctl_ip_default_ttl; u8 sysctl_ip_no_pmtu_disc; u8 sysctl_ip_fwd_use_pmtu; - int sysctl_ip_fwd_update_priority; + u8 sysctl_ip_fwd_update_priority; u8 sysctl_ip_nonlocal_bind; u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? 
*/ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e5ff17526603..713e0c0c91e9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -209,7 +209,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, net = container_of(table->data, struct net, ipv4.sysctl_ip_fwd_update_priority); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE, net); @@ -743,7 +743,7 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "ip_forward_update_priority", .data = &init_net.ipv4.sysctl_ip_fwd_update_priority, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = ipv4_fwd_update_priority, .extra1 = SYSCTL_ZERO, -- cgit v1.2.3 From 2932bcda070d9a02548e57119b1ada8f018c40b5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Mar 2021 11:08:16 -0700 Subject: inet: convert tcp_early_demux and udp_early_demux to u8 For these sysctls, their dedicated helpers have to use proc_dou8vec_minmax(). Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 4 ++-- net/ipv4/sysctl_net_ipv4.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d2c0a6592ff6..00f250ee4419 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -107,8 +107,8 @@ struct netns_ipv4 { #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_raw_l3mdev_accept; #endif - int sysctl_tcp_early_demux; - int sysctl_udp_early_demux; + u8 sysctl_tcp_early_demux; + u8 sysctl_udp_early_demux; u8 sysctl_nexthop_compat_mode; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 713e0c0c91e9..510a32635612 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -389,7 +389,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, { int ret = 0; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && !ret) { int enabled = init_net.ipv4.sysctl_tcp_early_demux; @@ -405,7 +405,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, { int ret = 0; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && !ret) { int enabled = init_net.ipv4.sysctl_udp_early_demux; @@ -683,14 +683,14 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "udp_early_demux", .data = &init_net.ipv4.sysctl_udp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_udp_early_demux }, { .procname = "tcp_early_demux", .data = &init_net.ipv4.sysctl_tcp_early_demux, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_tcp_early_demux }, -- cgit v1.2.3 From 4ecc1baf362c5df2dcabe242511e38ee28486545 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Mar 2021 11:08:17 -0700 Subject: tcp: convert elligible sysctls to u8 Many tcp sysctls are either 
bools or small ints that can fit into u8. Reducing space taken by sysctls can save few cache line misses when sending/receiving data while cpu caches are empty, for example after cpu idle period. This is hard to measure with typical network performance tests, but after this patch, struct netns_ipv4 has shrunk by three cache lines. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 68 +++++++++++------------ net/ipv4/sysctl_net_ipv4.c | 136 ++++++++++++++++++++++----------------------- 2 files changed, 102 insertions(+), 102 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 00f250ee4419..d377266d133f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -113,11 +113,11 @@ struct netns_ipv4 { u8 sysctl_nexthop_compat_mode; u8 sysctl_fwmark_reflect; - int sysctl_tcp_fwmark_accept; + u8 sysctl_tcp_fwmark_accept; #ifdef CONFIG_NET_L3_MASTER_DEV - int sysctl_tcp_l3mdev_accept; + u8 sysctl_tcp_l3mdev_accept; #endif - int sysctl_tcp_mtu_probing; + u8 sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; int sysctl_tcp_min_snd_mss; @@ -125,46 +125,47 @@ struct netns_ipv4 { u32 sysctl_tcp_probe_interval; int sysctl_tcp_keepalive_time; - int sysctl_tcp_keepalive_probes; int sysctl_tcp_keepalive_intvl; + u8 sysctl_tcp_keepalive_probes; - int sysctl_tcp_syn_retries; - int sysctl_tcp_synack_retries; - int sysctl_tcp_syncookies; + u8 sysctl_tcp_syn_retries; + u8 sysctl_tcp_synack_retries; + u8 sysctl_tcp_syncookies; int sysctl_tcp_reordering; - int sysctl_tcp_retries1; - int sysctl_tcp_retries2; - int sysctl_tcp_orphan_retries; + u8 sysctl_tcp_retries1; + u8 sysctl_tcp_retries2; + u8 sysctl_tcp_orphan_retries; + u8 sysctl_tcp_tw_reuse; int sysctl_tcp_fin_timeout; unsigned int sysctl_tcp_notsent_lowat; - int sysctl_tcp_tw_reuse; - int sysctl_tcp_sack; - int sysctl_tcp_window_scaling; - int sysctl_tcp_timestamps; - int 
sysctl_tcp_early_retrans; - int sysctl_tcp_recovery; - int sysctl_tcp_thin_linear_timeouts; - int sysctl_tcp_slow_start_after_idle; - int sysctl_tcp_retrans_collapse; - int sysctl_tcp_stdurg; - int sysctl_tcp_rfc1337; - int sysctl_tcp_abort_on_overflow; - int sysctl_tcp_fack; + u8 sysctl_tcp_sack; + u8 sysctl_tcp_window_scaling; + u8 sysctl_tcp_timestamps; + u8 sysctl_tcp_early_retrans; + u8 sysctl_tcp_recovery; + u8 sysctl_tcp_thin_linear_timeouts; + u8 sysctl_tcp_slow_start_after_idle; + u8 sysctl_tcp_retrans_collapse; + u8 sysctl_tcp_stdurg; + u8 sysctl_tcp_rfc1337; + u8 sysctl_tcp_abort_on_overflow; + u8 sysctl_tcp_fack; /* obsolete */ int sysctl_tcp_max_reordering; - int sysctl_tcp_dsack; - int sysctl_tcp_app_win; int sysctl_tcp_adv_win_scale; - int sysctl_tcp_frto; - int sysctl_tcp_nometrics_save; - int sysctl_tcp_no_ssthresh_metrics_save; - int sysctl_tcp_moderate_rcvbuf; - int sysctl_tcp_tso_win_divisor; - int sysctl_tcp_workaround_signed_windows; + u8 sysctl_tcp_dsack; + u8 sysctl_tcp_app_win; + u8 sysctl_tcp_frto; + u8 sysctl_tcp_nometrics_save; + u8 sysctl_tcp_no_ssthresh_metrics_save; + u8 sysctl_tcp_moderate_rcvbuf; + u8 sysctl_tcp_tso_win_divisor; + u8 sysctl_tcp_workaround_signed_windows; int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; - int sysctl_tcp_min_tso_segs; int sysctl_tcp_min_rtt_wlen; - int sysctl_tcp_autocorking; + u8 sysctl_tcp_min_tso_segs; + u8 sysctl_tcp_autocorking; + u8 sysctl_tcp_reflect_tos; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; @@ -182,7 +183,6 @@ struct netns_ipv4 { unsigned int sysctl_tcp_fastopen_blackhole_timeout; atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; - int sysctl_tcp_reflect_tos; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 510a32635612..442ff4be1bde 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ 
-775,17 +775,17 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_fwmark_accept", .data = &init_net.ipv4.sysctl_tcp_fwmark_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "tcp_l3mdev_accept", .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -793,9 +793,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_mtu_probing", .data = &init_net.ipv4.sysctl_tcp_mtu_probing, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_base_mss", @@ -897,9 +897,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_keepalive_probes", .data = &init_net.ipv4.sysctl_tcp_keepalive_probes, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_keepalive_intvl", @@ -911,26 +911,26 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_syn_retries", .data = &init_net.ipv4.sysctl_tcp_syn_retries, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = &tcp_syn_retries_min, .extra2 = &tcp_syn_retries_max }, { .procname = "tcp_synack_retries", .data = &init_net.ipv4.sysctl_tcp_synack_retries, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, #ifdef CONFIG_SYN_COOKIES { .procname = "tcp_syncookies", .data = &init_net.ipv4.sysctl_tcp_syncookies, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = 
proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, #endif { @@ -943,24 +943,24 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_retries1", .data = &init_net.ipv4.sysctl_tcp_retries1, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra2 = &tcp_retr1_max }, { .procname = "tcp_retries2", .data = &init_net.ipv4.sysctl_tcp_retries2, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_orphan_retries", .data = &init_net.ipv4.sysctl_tcp_orphan_retries, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fin_timeout", @@ -979,9 +979,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_tw_reuse", .data = &init_net.ipv4.sysctl_tcp_tw_reuse, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, @@ -1067,88 +1067,88 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_sack", .data = &init_net.ipv4.sysctl_tcp_sack, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_window_scaling", .data = &init_net.ipv4.sysctl_tcp_window_scaling, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_timestamps", .data = &init_net.ipv4.sysctl_tcp_timestamps, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_early_retrans", .data = &init_net.ipv4.sysctl_tcp_early_retrans, - .maxlen = sizeof(int), + .maxlen = 
sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &four, }, { .procname = "tcp_recovery", .data = &init_net.ipv4.sysctl_tcp_recovery, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_thin_linear_timeouts", .data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_slow_start_after_idle", .data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_retrans_collapse", .data = &init_net.ipv4.sysctl_tcp_retrans_collapse, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_stdurg", .data = &init_net.ipv4.sysctl_tcp_stdurg, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_rfc1337", .data = &init_net.ipv4.sysctl_tcp_rfc1337, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_abort_on_overflow", .data = &init_net.ipv4.sysctl_tcp_abort_on_overflow, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_fack", .data = &init_net.ipv4.sysctl_tcp_fack, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_max_reordering", @@ -1160,16 +1160,16 @@ static struct ctl_table 
ipv4_net_table[] = { { .procname = "tcp_dsack", .data = &init_net.ipv4.sysctl_tcp_dsack, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_app_win", .data = &init_net.ipv4.sysctl_tcp_app_win, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_adv_win_scale", @@ -1183,46 +1183,46 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_frto", .data = &init_net.ipv4.sysctl_tcp_frto, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_metrics_save", .data = &init_net.ipv4.sysctl_tcp_nometrics_save, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_no_ssthresh_metrics_save", .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_tso_win_divisor", .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_workaround_signed_windows", .data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "tcp_limit_output_bytes", 
@@ -1241,9 +1241,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_min_tso_segs", .data = &init_net.ipv4.sysctl_tcp_min_tso_segs, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &gso_max_segs, }, @@ -1259,9 +1259,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_autocorking", .data = &init_net.ipv4.sysctl_tcp_autocorking, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -1332,9 +1332,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_reflect_tos", .data = &init_net.ipv4.sysctl_tcp_reflect_tos, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -- cgit v1.2.3 From b910eaaaa4b89976ef02e5d6448f3f73dc671d91 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 22 Mar 2021 22:51:46 -0700 Subject: bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper Jiri Olsa reported a bug ([1]) in kernel where cgroup local storage pointer may be NULL in bpf_get_local_storage() helper. There are two issues uncovered by this bug: (1). kprobe or tracepoint prog incorrectly sets cgroup local storage before prog run, (2). due to change from preempt_disable to migrate_disable, preemption is possible and percpu storage might be overwritten by other tasks. This issue (1) is fixed in [2]. This patch tried to address issue (2). 
The following shows how things can go wrong: task 1: bpf_cgroup_storage_set() for percpu local storage preemption happens task 2: bpf_cgroup_storage_set() for percpu local storage preemption happens task 1: run bpf program task 1 will effectively use the percpu local storage setting by task 2 which will be either NULL or incorrect ones. Instead of just one common local storage per cpu, this patch fixed the issue by permitting 8 local storages per cpu and each local storage is identified by a task_struct pointer. This way, we allow at most 8 nested preemption between bpf_cgroup_storage_set() and bpf_cgroup_storage_unset(). The percpu local storage slot is released (calling bpf_cgroup_storage_unset()) by the same task after bpf program finished running. bpf_test_run() is also fixed to use the new bpf_cgroup_storage_set() interface. The patch is tested on top of [2] with reproducer in [1]. Without this patch, kernel will emit error in 2-3 minutes. With this patch, after one hour, still no error. 
[1] https://lore.kernel.org/bpf/CAKH8qBuXCfUz=w8L+Fj74OaUpbosO29niYwTki7e3Ag044_aww@mail.gmail.com/T [2] https://lore.kernel.org/bpf/20210309185028.3763817-1-yhs@fb.com Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Roman Gushchin Link: https://lore.kernel.org/bpf/20210323055146.3334476-1-yhs@fb.com --- include/linux/bpf-cgroup.h | 57 +++++++++++++++++++++++++++++++++++++++------- include/linux/bpf.h | 22 ++++++++++++++---- kernel/bpf/helpers.c | 15 ++++++++---- kernel/bpf/local_storage.c | 5 ++-- net/bpf/test_run.c | 6 ++++- 5 files changed, 86 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c42e02b4d84b..6a29fe11485d 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -20,14 +20,25 @@ struct bpf_sock_ops_kern; struct bpf_cgroup_storage; struct ctl_table; struct ctl_table_header; +struct task_struct; #ifdef CONFIG_CGROUP_BPF extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; #define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) -DECLARE_PER_CPU(struct bpf_cgroup_storage*, - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +#define BPF_CGROUP_STORAGE_NEST_MAX 8 + +struct bpf_cgroup_storage_info { + struct task_struct *task; + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; +}; + +/* For each cpu, permit maximum BPF_CGROUP_STORAGE_NEST_MAX number of tasks + * to use bpf cgroup storage simultaneously. 
+ */ +DECLARE_PER_CPU(struct bpf_cgroup_storage_info, + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); #define for_each_cgroup_storage_type(stype) \ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) @@ -161,13 +172,42 @@ static inline enum bpf_cgroup_storage_type cgroup_storage_type( return BPF_CGROUP_STORAGE_SHARED; } -static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage - *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) +static inline int bpf_cgroup_storage_set(struct bpf_cgroup_storage + *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { enum bpf_cgroup_storage_type stype; + int i, err = 0; + + preempt_disable(); + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != NULL)) + continue; + + this_cpu_write(bpf_cgroup_storage_info[i].task, current); + for_each_cgroup_storage_type(stype) + this_cpu_write(bpf_cgroup_storage_info[i].storage[stype], + storage[stype]); + goto out; + } + err = -EBUSY; + WARN_ON_ONCE(1); + +out: + preempt_enable(); + return err; +} + +static inline void bpf_cgroup_storage_unset(void) +{ + int i; + + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + continue; - for_each_cgroup_storage_type(stype) - this_cpu_write(bpf_cgroup_storage[stype], storage[stype]); + this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); + return; + } } struct bpf_cgroup_storage * @@ -448,8 +488,9 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } -static inline void bpf_cgroup_storage_set( - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} +static inline int bpf_cgroup_storage_set( + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) { return 0; } +static inline void bpf_cgroup_storage_unset(void) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct 
bpf_cgroup_storage *bpf_cgroup_storage_alloc( diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 24678d6ecbcf..5a0801b420ca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1106,6 +1106,13 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, /* BPF program asks to set CN on the packet. */ #define BPF_RET_SET_CN (1 << 0) +/* For BPF_PROG_RUN_ARRAY_FLAGS and __BPF_PROG_RUN_ARRAY, + * if bpf_cgroup_storage_set() failed, the rest of programs + * will not execute. This should be a really rare scenario + * as it requires BPF_CGROUP_STORAGE_NEST_MAX number of + * preemptions all between bpf_cgroup_storage_set() and + * bpf_cgroup_storage_unset() on the same cpu. + */ #define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ ({ \ struct bpf_prog_array_item *_item; \ @@ -1118,10 +1125,12 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, _array = rcu_dereference(array); \ _item = &_array->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ - bpf_cgroup_storage_set(_item->cgroup_storage); \ + if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ + break; \ func_ret = func(_prog, ctx); \ _ret &= (func_ret & 1); \ *(ret_flags) |= (func_ret >> 1); \ + bpf_cgroup_storage_unset(); \ _item++; \ } \ rcu_read_unlock(); \ @@ -1142,9 +1151,14 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, goto _out; \ _item = &_array->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ - if (set_cg_storage) \ - bpf_cgroup_storage_set(_item->cgroup_storage); \ - _ret &= func(_prog, ctx); \ + if (!set_cg_storage) { \ + _ret &= func(_prog, ctx); \ + } else { \ + if (unlikely(bpf_cgroup_storage_set(_item->cgroup_storage))) \ + break; \ + _ret &= func(_prog, ctx); \ + bpf_cgroup_storage_unset(); \ + } \ _item++; \ } \ _out: \ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 074800226327..f306611c4ddf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -382,8 +382,8 @@ const struct bpf_func_proto 
bpf_get_current_ancestor_cgroup_id_proto = { }; #ifdef CONFIG_CGROUP_BPF -DECLARE_PER_CPU(struct bpf_cgroup_storage*, - bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +DECLARE_PER_CPU(struct bpf_cgroup_storage_info, + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) { @@ -392,10 +392,17 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) * verifier checks that its value is correct. */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage *storage; + struct bpf_cgroup_storage *storage = NULL; void *ptr; + int i; - storage = this_cpu_read(bpf_cgroup_storage[stype]); + for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { + if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + continue; + + storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); + break; + } if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 2d4f9ac12377..bd11db9774c3 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,10 +9,11 @@ #include #include -DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); - #ifdef CONFIG_CGROUP_BPF +DEFINE_PER_CPU(struct bpf_cgroup_storage_info, + bpf_cgroup_storage_info[BPF_CGROUP_STORAGE_NEST_MAX]); + #include "../cgroup/cgroup-internal.h" #define LOCAL_STORAGE_CREATE_FLAG_MASK \ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 0abdd67f44b1..4aabf71cd95d 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -106,12 +106,16 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, bpf_test_timer_enter(&t); do { - bpf_cgroup_storage_set(storage); + ret = bpf_cgroup_storage_set(storage); + if (ret) + break; if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = BPF_PROG_RUN(prog, ctx); + + 
bpf_cgroup_storage_unset(); } while (bpf_test_timer_continue(&t, repeat, &ret, time)); bpf_test_timer_leave(&t); -- cgit v1.2.3 From fcb8d0d7587e0f2b7439d6c14a380fd17a450f96 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 25 Mar 2021 15:06:02 +0800 Subject: bpf: struct sock is declared twice in bpf_sk_storage header struct sock has been declared twice, therefore remove the duplicate. Signed-off-by: Wan Jiabing Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210325070602.858024-1-wanjiabing@vivo.com --- include/net/bpf_sk_storage.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 0e85713f56df..2926f1f00d65 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -27,7 +27,6 @@ struct bpf_local_storage_elem; struct bpf_sk_storage_diag; struct sk_buff; struct nlattr; -struct sock; #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); -- cgit v1.2.3 From ad1cd7856d870e5861ef80fbf3e4b0d68bb82a69 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 26 Mar 2021 13:22:21 -0700 Subject: ethtool: fec: add note about reuse of reserved struct ethtool_fecparam::reserved can't be used in SET, because ethtool user space doesn't zero-initialize the structure. Make this clear. Suggested-by: Andrew Lunn Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f6ef7d42c7a1..9a47c3efd8ca 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1382,6 +1382,10 @@ struct ethtool_per_queue_op { * @fec: Bitmask of configured FEC modes. * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET. 
* + * Note that @reserved was never validated on input and ethtool user space + * left it uninitialized when calling SET. Hence going forward it can only be + * used to return a value to userspace with GET. + * * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS. * FEC settings are configured by link autonegotiation whenever it's enabled. * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode. -- cgit v1.2.3 From d04feecaf1543e538e856166e494daebe808d1fe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 26 Mar 2021 13:22:23 -0700 Subject: ethtool: document the enum values not defines kdoc does not have good support for documenting defines, and we can't abuse the enum documentation because it generates warnings. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9a47c3efd8ca..868b513d4f54 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1414,16 +1414,16 @@ struct ethtool_fecparam { /** * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration - * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported. Should not - * be used together with other bits. GET only. - * @ETHTOOL_FEC_AUTO: Select default/best FEC mode automatically, usually based - * link mode and SFP parameters read from module's EEPROM. - * This bit does _not_ mean autonegotiation. - * @ETHTOOL_FEC_OFF: No FEC Mode - * @ETHTOOL_FEC_RS: Reed-Solomon FEC Mode - * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon FEC Mode - * @ETHTOOL_FEC_LLRS: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet - * Consortium) + * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not + * be used together with other bits. GET only. 
+ * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually + * based link mode and SFP parameters read from module's + * EEPROM. This bit does _not_ mean autonegotiation. + * @ETHTOOL_FEC_OFF_BIT: No FEC Mode + * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode + * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode + * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet + * Consortium) */ enum ethtool_fec_config_bits { ETHTOOL_FEC_NONE_BIT, -- cgit v1.2.3 From 2d9a93b4902be6a5504b5941dd15e9cd776aadca Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 25 Mar 2021 16:16:51 +0000 Subject: mld: convert from timer to delayed work mcast.c has several timers for delaying works. Timer's expire handler is working under atomic context so it can't use sleepable things such as GFP_KERNEL, mutex, etc. In order to use sleepable APIs, it converts from timers to delayed work. But there are some critical sections, which is used by both process and BH context. So that it still uses spin_lock_bh() and rwlock. Suggested-by: Cong Wang Signed-off-by: Taehee Yoo Signed-off-by: David S. 
Miller --- include/net/if_inet6.h | 8 +-- net/ipv6/mcast.c | 140 ++++++++++++++++++++++++++++--------------------- 2 files changed, 83 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 8bf5906073bc..af5244c9ca5c 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -120,7 +120,7 @@ struct ifmcaddr6 { unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; - struct timer_list mca_timer; + struct delayed_work mca_work; unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; @@ -179,9 +179,9 @@ struct inet6_dev { unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; - struct timer_list mc_gq_timer; /* general query timer */ - struct timer_list mc_ifc_timer; /* interface change timer */ - struct timer_list mc_dad_timer; /* dad complete mc timer */ + struct delayed_work mc_gq_work; /* general query work */ + struct delayed_work mc_ifc_work; /* interface change work */ + struct delayed_work mc_dad_work; /* dad complete mc work */ struct ifacaddr6 *ac_list; rwlock_t lock; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 6c8604390266..692a6dec8959 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include @@ -42,6 +41,7 @@ #include #include #include +#include #include #include @@ -67,14 +67,13 @@ static int __mld2_query_bugs[] __attribute__((__unused__)) = { BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4) }; +static struct workqueue_struct *mld_wq; static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; static void igmp6_join_group(struct ifmcaddr6 *ma); static void igmp6_leave_group(struct ifmcaddr6 *ma); -static void igmp6_timer_handler(struct timer_list *t); +static void mld_mca_work(struct work_struct *work); -static void mld_gq_timer_expire(struct timer_list *t); -static void mld_ifc_timer_expire(struct timer_list *t); static void 
mld_ifc_event(struct inet6_dev *idev); static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); @@ -713,7 +712,7 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) igmp6_leave_group(mc); spin_lock_bh(&mc->mca_lock); - if (del_timer(&mc->mca_timer)) + if (cancel_delayed_work(&mc->mca_work)) refcount_dec(&mc->mca_refcnt); spin_unlock_bh(&mc->mca_lock); } @@ -854,7 +853,7 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, if (!mc) return NULL; - timer_setup(&mc->mca_timer, igmp6_timer_handler, 0); + INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work); mc->mca_addr = *addr; mc->idev = idev; /* reference taken by caller */ @@ -1027,48 +1026,48 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, return rv; } -static void mld_gq_start_timer(struct inet6_dev *idev) +static void mld_gq_start_work(struct inet6_dev *idev) { unsigned long tv = prandom_u32() % idev->mc_maxdelay; idev->mc_gq_running = 1; - if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) + if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) in6_dev_hold(idev); } -static void mld_gq_stop_timer(struct inet6_dev *idev) +static void mld_gq_stop_work(struct inet6_dev *idev) { idev->mc_gq_running = 0; - if (del_timer(&idev->mc_gq_timer)) + if (cancel_delayed_work(&idev->mc_gq_work)) __in6_dev_put(idev); } -static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay) +static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = prandom_u32() % delay; - if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) + if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); } -static void mld_ifc_stop_timer(struct inet6_dev *idev) +static void mld_ifc_stop_work(struct inet6_dev *idev) { idev->mc_ifc_count = 0; - if (del_timer(&idev->mc_ifc_timer)) + if (cancel_delayed_work(&idev->mc_ifc_work)) __in6_dev_put(idev); } 
-static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay) +static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = prandom_u32() % delay; - if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2)) + if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); } -static void mld_dad_stop_timer(struct inet6_dev *idev) +static void mld_dad_stop_work(struct inet6_dev *idev) { - if (del_timer(&idev->mc_dad_timer)) + if (cancel_delayed_work(&idev->mc_dad_work)) __in6_dev_put(idev); } @@ -1080,21 +1079,20 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; - /* Do not start timer for these addresses */ + /* Do not start work for these addresses */ if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) return; - if (del_timer(&ma->mca_timer)) { + if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); - delay = ma->mca_timer.expires - jiffies; + delay = ma->mca_work.timer.expires - jiffies; } if (delay >= resptime) delay = prandom_u32() % resptime; - ma->mca_timer.expires = jiffies + delay; - if (!mod_timer(&ma->mca_timer, jiffies + delay)) + if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING; } @@ -1305,10 +1303,10 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, if (v1_query) mld_set_v1_mode(idev); - /* cancel MLDv2 report timer */ - mld_gq_stop_timer(idev); - /* cancel the interface change timer */ - mld_ifc_stop_timer(idev); + /* cancel MLDv2 report work */ + mld_gq_stop_work(idev); + /* cancel the interface change work */ + mld_ifc_stop_work(idev); /* clear deleted report items */ mld_clear_delrec(idev); @@ -1398,7 +1396,7 @@ int igmp6_event_query(struct sk_buff *skb) if (mlh2->mld2q_nsrcs) return -EINVAL; /* no sources allowed */ - mld_gq_start_timer(idev); + 
mld_gq_start_work(idev); return 0; } /* mark sources to include, if group & source-specific */ @@ -1482,14 +1480,14 @@ int igmp6_event_report(struct sk_buff *skb) return -ENODEV; /* - * Cancel the timer for this group + * Cancel the work for this group */ read_lock_bh(&idev->lock); for (ma = idev->mc_list; ma; ma = ma->next) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { spin_lock(&ma->mca_lock); - if (del_timer(&ma->mca_timer)) + if (cancel_delayed_work(&ma->mca_work)) refcount_dec(&ma->mca_refcnt); ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING); spin_unlock(&ma->mca_lock); @@ -2103,21 +2101,23 @@ void ipv6_mc_dad_complete(struct inet6_dev *idev) mld_send_initial_cr(idev); idev->mc_dad_count--; if (idev->mc_dad_count) - mld_dad_start_timer(idev, - unsolicited_report_interval(idev)); + mld_dad_start_work(idev, + unsolicited_report_interval(idev)); } } -static void mld_dad_timer_expire(struct timer_list *t) +static void mld_dad_work(struct work_struct *work) { - struct inet6_dev *idev = from_timer(idev, t, mc_dad_timer); + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_dad_work); mld_send_initial_cr(idev); if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) - mld_dad_start_timer(idev, - unsolicited_report_interval(idev)); + mld_dad_start_work(idev, + unsolicited_report_interval(idev)); } in6_dev_put(idev); } @@ -2416,12 +2416,12 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) delay = prandom_u32() % unsolicited_report_interval(ma->idev); spin_lock_bh(&ma->mca_lock); - if (del_timer(&ma->mca_timer)) { + if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); - delay = ma->mca_timer.expires - jiffies; + delay = ma->mca_work.timer.expires - jiffies; } - if (!mod_timer(&ma->mca_timer, jiffies + delay)) + if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; spin_unlock_bh(&ma->mca_lock); 
@@ -2458,25 +2458,29 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma) } } -static void mld_gq_timer_expire(struct timer_list *t) +static void mld_gq_work(struct work_struct *work) { - struct inet6_dev *idev = from_timer(idev, t, mc_gq_timer); + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_gq_work); idev->mc_gq_running = 0; mld_send_report(idev, NULL); in6_dev_put(idev); } -static void mld_ifc_timer_expire(struct timer_list *t) +static void mld_ifc_work(struct work_struct *work) { - struct inet6_dev *idev = from_timer(idev, t, mc_ifc_timer); + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_ifc_work); mld_send_cr(idev); if (idev->mc_ifc_count) { idev->mc_ifc_count--; if (idev->mc_ifc_count) - mld_ifc_start_timer(idev, - unsolicited_report_interval(idev)); + mld_ifc_start_work(idev, + unsolicited_report_interval(idev)); } in6_dev_put(idev); } @@ -2486,22 +2490,23 @@ static void mld_ifc_event(struct inet6_dev *idev) if (mld_in_v1_mode(idev)) return; idev->mc_ifc_count = idev->mc_qrv; - mld_ifc_start_timer(idev, 1); + mld_ifc_start_work(idev, 1); } -static void igmp6_timer_handler(struct timer_list *t) +static void mld_mca_work(struct work_struct *work) { - struct ifmcaddr6 *ma = from_timer(ma, t, mca_timer); + struct ifmcaddr6 *ma = container_of(to_delayed_work(work), + struct ifmcaddr6, mca_work); if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); - spin_lock(&ma->mca_lock); + spin_lock_bh(&ma->mca_lock); ma->mca_flags |= MAF_LAST_REPORTER; ma->mca_flags &= ~MAF_TIMER_RUNNING; - spin_unlock(&ma->mca_lock); + spin_unlock_bh(&ma->mca_lock); ma_put(ma); } @@ -2537,12 +2542,12 @@ void ipv6_mc_down(struct inet6_dev *idev) for (i = idev->mc_list; i; i = i->next) igmp6_group_dropped(i); - /* Should stop timer after group drop. 
or we will - * start timer again in mld_ifc_event() + /* Should stop work after group drop. or we will + * start work again in mld_ifc_event() */ - mld_ifc_stop_timer(idev); - mld_gq_stop_timer(idev); - mld_dad_stop_timer(idev); + mld_ifc_stop_work(idev); + mld_gq_stop_work(idev); + mld_dad_stop_work(idev); read_unlock_bh(&idev->lock); } @@ -2579,11 +2584,11 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) write_lock_bh(&idev->lock); spin_lock_init(&idev->mc_lock); idev->mc_gq_running = 0; - timer_setup(&idev->mc_gq_timer, mld_gq_timer_expire, 0); + INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work); idev->mc_tomb = NULL; idev->mc_ifc_count = 0; - timer_setup(&idev->mc_ifc_timer, mld_ifc_timer_expire, 0); - timer_setup(&idev->mc_dad_timer, mld_dad_timer_expire, 0); + INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work); + INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work); ipv6_mc_reset(idev); write_unlock_bh(&idev->lock); } @@ -2596,7 +2601,7 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) { struct ifmcaddr6 *i; - /* Deactivate timers */ + /* Deactivate works */ ipv6_mc_down(idev); mld_clear_delrec(idev); @@ -2763,7 +2768,7 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v) &im->mca_addr, im->mca_users, im->mca_flags, (im->mca_flags&MAF_TIMER_RUNNING) ? 
- jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0); + jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0); return 0; } @@ -3002,7 +3007,19 @@ static struct pernet_operations igmp6_net_ops = { int __init igmp6_init(void) { - return register_pernet_subsys(&igmp6_net_ops); + int err; + + err = register_pernet_subsys(&igmp6_net_ops); + if (err) + return err; + + mld_wq = create_workqueue("mld"); + if (!mld_wq) { + unregister_pernet_subsys(&igmp6_net_ops); + return -ENOMEM; + } + + return err; } int __init igmp6_late_init(void) @@ -3013,6 +3030,7 @@ int __init igmp6_late_init(void) void igmp6_cleanup(void) { unregister_pernet_subsys(&igmp6_net_ops); + destroy_workqueue(mld_wq); } void igmp6_late_cleanup(void) -- cgit v1.2.3 From cf2ce339b401bc53ee131f0ce38bae32a949925e Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 25 Mar 2021 16:16:52 +0000 Subject: mld: get rid of inet6_dev->mc_lock The purpose of mc_lock is to protect inet6_dev->mc_tomb. But mc_tomb is already protected by RTNL and all functions, which manipulate mc_tomb are called under RTNL. So, mc_lock is not needed. Furthermore, it is spinlock so the critical section is atomic. In order to reduce atomic context, it should be removed. Suggested-by: Cong Wang Signed-off-by: Taehee Yoo Signed-off-by: David S. 
Miller --- include/net/if_inet6.h | 1 - net/ipv6/mcast.c | 9 --------- 2 files changed, 10 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index af5244c9ca5c..1080d2248304 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -167,7 +167,6 @@ struct inet6_dev { struct ifmcaddr6 *mc_list; struct ifmcaddr6 *mc_tomb; - spinlock_t mc_lock; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 692a6dec8959..35962aa3cc22 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -752,10 +752,8 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) } spin_unlock_bh(&im->mca_lock); - spin_lock_bh(&idev->mc_lock); pmc->next = idev->mc_tomb; idev->mc_tomb = pmc; - spin_unlock_bh(&idev->mc_lock); } static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) @@ -764,7 +762,6 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) struct ip6_sf_list *psf; struct in6_addr *pmca = &im->mca_addr; - spin_lock_bh(&idev->mc_lock); pmc_prev = NULL; for (pmc = idev->mc_tomb; pmc; pmc = pmc->next) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) @@ -777,7 +774,6 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) else idev->mc_tomb = pmc->next; } - spin_unlock_bh(&idev->mc_lock); spin_lock_bh(&im->mca_lock); if (pmc) { @@ -801,10 +797,8 @@ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; - spin_lock_bh(&idev->mc_lock); pmc = idev->mc_tomb; idev->mc_tomb = NULL; - spin_unlock_bh(&idev->mc_lock); for (; pmc; pmc = nextpmc) { nextpmc = pmc->next; @@ -1907,7 +1901,6 @@ static void mld_send_cr(struct inet6_dev *idev) int type, dtype; read_lock_bh(&idev->lock); - spin_lock(&idev->mc_lock); /* deleted MCA's */ pmc_prev = NULL; @@ -1941,7 +1934,6 @@ static void mld_send_cr(struct inet6_dev *idev) } else pmc_prev = pmc; } - 
spin_unlock(&idev->mc_lock); /* change recs */ for (pmc = idev->mc_list; pmc; pmc = pmc->next) { @@ -2582,7 +2574,6 @@ void ipv6_mc_up(struct inet6_dev *idev) void ipv6_mc_init_dev(struct inet6_dev *idev) { write_lock_bh(&idev->lock); - spin_lock_init(&idev->mc_lock); idev->mc_gq_running = 0; INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work); idev->mc_tomb = NULL; -- cgit v1.2.3 From 882ba1f73c06831f2a21044ebd8864c485ac04f2 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 25 Mar 2021 16:16:53 +0000 Subject: mld: convert ipv6_mc_socklist->sflist to RCU The sflist has been protected by rwlock so that the critical section is atomic context. In order to switch this context, changing locking is needed. The sflist actually already protected by RTNL So if it's converted to use RCU, its control path context can be switched to sleepable. Suggested-by: Cong Wang Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/net/if_inet6.h | 4 ++-- net/ipv6/mcast.c | 52 +++++++++++++++++++++----------------------------- 2 files changed, 24 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 1080d2248304..062294aeeb6d 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -78,6 +78,7 @@ struct inet6_ifaddr { struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; + struct rcu_head rcu; struct in6_addr sl_addr[]; }; @@ -91,8 +92,7 @@ struct ipv6_mc_socklist { int ifindex; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist __rcu *next; - rwlock_t sflock; - struct ip6_sf_socklist *sflist; + struct ip6_sf_socklist __rcu *sflist; struct rcu_head rcu; }; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 35962aa3cc22..9da55d23a13c 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -178,8 +178,7 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = mode; - rwlock_init(&mc_lst->sflock); - 
mc_lst->sflist = NULL; + RCU_INIT_POINTER(mc_lst->sflist, NULL); /* * now add/increase the group membership on the device @@ -335,7 +334,6 @@ int ip6_mc_source(int add, int omode, struct sock *sk, struct net *net = sock_net(sk); int i, j, rv; int leavegroup = 0; - int pmclocked = 0; int err; source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; @@ -364,7 +362,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, goto done; } /* if a source filter was set, must be the same mode as before */ - if (pmc->sflist) { + if (rcu_access_pointer(pmc->sflist)) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; @@ -376,10 +374,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, pmc->sfmode = omode; } - write_lock(&pmc->sflock); - pmclocked = 1; - - psl = pmc->sflist; + psl = rtnl_dereference(pmc->sflist); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ @@ -429,9 +424,11 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; - sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + kfree_rcu(psl, rcu); } - pmc->sflist = psl = newpsl; + psl = newpsl; + rcu_assign_pointer(pmc->sflist, psl); } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { @@ -447,8 +444,6 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: - if (pmclocked) - write_unlock(&pmc->sflock); read_unlock_bh(&idev->lock); rcu_read_unlock(); if (leavegroup) @@ -526,17 +521,16 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); } - write_lock(&pmc->sflock); - psl = pmc->sflist; + psl = rtnl_dereference(pmc->sflist); if (psl) { (void) ip6_mc_del_src(idev, group, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); - sock_kfree_s(sk, 
psl, IP6_SFLSIZE(psl->sl_max)); + atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + kfree_rcu(psl, rcu); } else (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); - pmc->sflist = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); pmc->sfmode = gsf->gf_fmode; - write_unlock(&pmc->sflock); err = 0; done: read_unlock_bh(&idev->lock); @@ -585,16 +579,14 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, if (!pmc) /* must have a prior join */ goto done; gsf->gf_fmode = pmc->sfmode; - psl = pmc->sflist; + psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; read_unlock_bh(&idev->lock); rcu_read_unlock(); copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - /* changes to psl require the socket lock, and a write lock - * on pmc->sflock. We have the socket lock so reading here is safe. - */ + for (i = 0; i < copycount; i++, p++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -630,8 +622,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, rcu_read_unlock(); return np->mc_all; } - read_lock(&mc->sflock); - psl = mc->sflist; + psl = rcu_dereference(mc->sflist); if (!psl) { rv = mc->sfmode == MCAST_EXCLUDE; } else { @@ -646,7 +637,6 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = false; } - read_unlock(&mc->sflock); rcu_read_unlock(); return rv; @@ -2422,19 +2412,21 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, struct inet6_dev *idev) { + struct ip6_sf_socklist *psl; int err; - write_lock_bh(&iml->sflock); - if (!iml->sflist) { + psl = rtnl_dereference(iml->sflist); + + if (!psl) { /* any-source empty exclude case */ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); } else { err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, - iml->sflist->sl_count, iml->sflist->sl_addr, 0); - 
sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + psl->sl_count, psl->sl_addr, 0); + RCU_INIT_POINTER(iml->sflist, NULL); + atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + kfree_rcu(psl, rcu); } - write_unlock_bh(&iml->sflock); return err; } -- cgit v1.2.3 From 4b200e398953c237c86d32bf26d4cb2a96556a6f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 25 Mar 2021 16:16:54 +0000 Subject: mld: convert ip6_sf_list to RCU The ip6_sf_list has been protected by mca_lock(spin_lock) so that the critical section is atomic context. In order to switch this context, changing locking is needed. The ip6_sf_list actually already protected by RTNL So if it's converted to use RCU, its control path context can be switched to sleepable. But It doesn't remove mca_lock yet because ifmcaddr6 isn't converted to RCU yet. So, It's not fully converted to the sleepable context. Suggested-by: Cong Wang Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/net/if_inet6.h | 7 +- net/ipv6/mcast.c | 200 +++++++++++++++++++++++++++++++------------------ 2 files changed, 130 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 062294aeeb6d..7875a3208426 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -97,12 +97,13 @@ struct ipv6_mc_socklist { }; struct ip6_sf_list { - struct ip6_sf_list *sf_next; + struct ip6_sf_list __rcu *sf_next; struct in6_addr sf_addr; unsigned long sf_count[2]; /* include/exclude counts */ unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. 
left to send */ + struct rcu_head rcu; }; #define MAF_TIMER_RUNNING 0x01 @@ -115,8 +116,8 @@ struct ifmcaddr6 { struct in6_addr mca_addr; struct inet6_dev *idev; struct ifmcaddr6 *next; - struct ip6_sf_list *mca_sources; - struct ip6_sf_list *mca_tomb; + struct ip6_sf_list __rcu *mca_sources; + struct ip6_sf_list __rcu *mca_tomb; unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9da55d23a13c..bc0fb4815c97 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -113,10 +113,25 @@ int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; */ #define for_each_pmc_rcu(np, pmc) \ - for (pmc = rcu_dereference(np->ipv6_mc_list); \ - pmc != NULL; \ + for (pmc = rcu_dereference((np)->ipv6_mc_list); \ + pmc; \ pmc = rcu_dereference(pmc->next)) +#define for_each_psf_rtnl(mc, psf) \ + for (psf = rtnl_dereference((mc)->mca_sources); \ + psf; \ + psf = rtnl_dereference(psf->sf_next)) + +#define for_each_psf_rcu(mc, psf) \ + for (psf = rcu_dereference((mc)->mca_sources); \ + psf; \ + psf = rcu_dereference(psf->sf_next)) + +#define for_each_psf_tomb(mc, psf) \ + for (psf = rtnl_dereference((mc)->mca_tomb); \ + psf; \ + psf = rtnl_dereference(psf->sf_next)) + static int unsolicited_report_interval(struct inet6_dev *idev) { int iv; @@ -734,10 +749,14 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) if (pmc->mca_sfmode == MCAST_INCLUDE) { struct ip6_sf_list *psf; - pmc->mca_tomb = im->mca_tomb; - pmc->mca_sources = im->mca_sources; - im->mca_tomb = im->mca_sources = NULL; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) + rcu_assign_pointer(pmc->mca_tomb, + rtnl_dereference(im->mca_tomb)); + rcu_assign_pointer(pmc->mca_sources, + rtnl_dereference(im->mca_sources)); + RCU_INIT_POINTER(im->mca_tomb, NULL); + RCU_INIT_POINTER(im->mca_sources, NULL); + + for_each_psf_rtnl(pmc, psf) psf->sf_crcount = pmc->mca_crcount; } spin_unlock_bh(&im->mca_lock); @@ -748,9 +767,9 @@ static 
void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { - struct ifmcaddr6 *pmc, *pmc_prev; - struct ip6_sf_list *psf; + struct ip6_sf_list *psf, *sources, *tomb; struct in6_addr *pmca = &im->mca_addr; + struct ifmcaddr6 *pmc, *pmc_prev; pmc_prev = NULL; for (pmc = idev->mc_tomb; pmc; pmc = pmc->next) { @@ -769,9 +788,16 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) if (pmc) { im->idev = pmc->idev; if (im->mca_sfmode == MCAST_INCLUDE) { - swap(im->mca_tomb, pmc->mca_tomb); - swap(im->mca_sources, pmc->mca_sources); - for (psf = im->mca_sources; psf; psf = psf->sf_next) + tomb = rcu_replace_pointer(im->mca_tomb, + rtnl_dereference(pmc->mca_tomb), + lockdep_rtnl_is_held()); + rcu_assign_pointer(pmc->mca_tomb, tomb); + + sources = rcu_replace_pointer(im->mca_sources, + rtnl_dereference(pmc->mca_sources), + lockdep_rtnl_is_held()); + rcu_assign_pointer(pmc->mca_sources, sources); + for_each_psf_rtnl(im, psf) psf->sf_crcount = idev->mc_qrv; } else { im->mca_crcount = idev->mc_qrv; @@ -803,12 +829,12 @@ static void mld_clear_delrec(struct inet6_dev *idev) struct ip6_sf_list *psf, *psf_next; spin_lock_bh(&pmc->mca_lock); - psf = pmc->mca_tomb; - pmc->mca_tomb = NULL; + psf = rtnl_dereference(pmc->mca_tomb); + RCU_INIT_POINTER(pmc->mca_tomb, NULL); spin_unlock_bh(&pmc->mca_lock); for (; psf; psf = psf_next) { - psf_next = psf->sf_next; - kfree(psf); + psf_next = rtnl_dereference(psf->sf_next); + kfree_rcu(psf, rcu); } } read_unlock_bh(&idev->lock); @@ -990,7 +1016,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, struct ip6_sf_list *psf; spin_lock_bh(&mc->mca_lock); - for (psf = mc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_rcu(mc, psf) { if (ipv6_addr_equal(&psf->sf_addr, src_addr)) break; } @@ -1089,7 +1115,7 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, int i, scount; scount = 0; - for (psf 
= pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_rcu(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { @@ -1122,7 +1148,7 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, /* mark INCLUDE-mode sources */ scount = 0; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_rcu(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { @@ -1532,7 +1558,7 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) struct ip6_sf_list *psf; int scount = 0; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_rtnl(pmc, psf) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; @@ -1707,14 +1733,16 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, - int type, int gdeleted, int sdeleted, int crsend) + int type, int gdeleted, int sdeleted, + int crsend) { + struct ip6_sf_list *psf, *psf_prev, *psf_next; + int scount, stotal, first, isquery, truncate; + struct ip6_sf_list __rcu **psf_list; struct inet6_dev *idev = pmc->idev; struct net_device *dev = idev->dev; - struct mld2_report *pmr; struct mld2_grec *pgr = NULL; - struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; - int scount, stotal, first, isquery, truncate; + struct mld2_report *pmr; unsigned int mtu; if (pmc->mca_flags & MAF_NOREPORT) @@ -1733,7 +1761,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; - if (!*psf_list) + if (!rcu_access_pointer(*psf_list)) goto empty_source; pmr = skb ? 
(struct mld2_report *)skb_transport_header(skb) : NULL; @@ -1749,10 +1777,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } first = 1; psf_prev = NULL; - for (psf = *psf_list; psf; psf = psf_next) { + for (psf = rtnl_dereference(*psf_list); + psf; + psf = psf_next) { struct in6_addr *psrc; - psf_next = psf->sf_next; + psf_next = rtnl_dereference(psf->sf_next); if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; @@ -1799,10 +1829,12 @@ decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) - psf_prev->sf_next = psf->sf_next; + rcu_assign_pointer(psf_prev->sf_next, + rtnl_dereference(psf->sf_next)); else - *psf_list = psf->sf_next; - kfree(psf); + rcu_assign_pointer(*psf_list, + rtnl_dereference(psf->sf_next)); + kfree_rcu(psf, rcu); continue; } } @@ -1866,21 +1898,26 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) /* * remove zero-count source records from a source filter list */ -static void mld_clear_zeros(struct ip6_sf_list **ppsf) +static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf) { struct ip6_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; - for (psf = *ppsf; psf; psf = psf_next) { - psf_next = psf->sf_next; + for (psf = rtnl_dereference(*ppsf); + psf; + psf = psf_next) { + psf_next = rtnl_dereference(psf->sf_next); if (psf->sf_crcount == 0) { if (psf_prev) - psf_prev->sf_next = psf->sf_next; + rcu_assign_pointer(psf_prev->sf_next, + rtnl_dereference(psf->sf_next)); else - *ppsf = psf->sf_next; - kfree(psf); - } else + rcu_assign_pointer(*ppsf, + rtnl_dereference(psf->sf_next)); + kfree_rcu(psf, rcu); + } else { psf_prev = psf; + } } } @@ -1913,8 +1950,9 @@ static void mld_send_cr(struct inet6_dev *idev) mld_clear_zeros(&pmc->mca_sources); } } - if (pmc->mca_crcount == 0 && !pmc->mca_tomb && - !pmc->mca_sources) { + if (pmc->mca_crcount == 0 && + !rcu_access_pointer(pmc->mca_tomb) && + 
!rcu_access_pointer(pmc->mca_sources)) { if (pmc_prev) pmc_prev->next = pmc_next; else @@ -2111,7 +2149,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, int rv = 0; psf_prev = NULL; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_rtnl(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; @@ -2126,17 +2164,22 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, /* no more filters for this source */ if (psf_prev) - psf_prev->sf_next = psf->sf_next; + rcu_assign_pointer(psf_prev->sf_next, + rtnl_dereference(psf->sf_next)); else - pmc->mca_sources = psf->sf_next; + rcu_assign_pointer(pmc->mca_sources, + rtnl_dereference(psf->sf_next)); + if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; - psf->sf_next = pmc->mca_tomb; - pmc->mca_tomb = psf; + rcu_assign_pointer(psf->sf_next, + rtnl_dereference(pmc->mca_tomb)); + rcu_assign_pointer(pmc->mca_tomb, psf); rv = 1; - } else - kfree(psf); + } else { + kfree_rcu(psf, rcu); + } } return rv; } @@ -2188,7 +2231,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_sfmode = MCAST_INCLUDE; pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) + for_each_psf_rtnl(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(pmc->idev); } else if (sf_setstate(pmc) || changerec) @@ -2207,7 +2250,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, struct ip6_sf_list *psf, *psf_prev; psf_prev = NULL; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_rtnl(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; @@ -2219,9 +2262,10 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, psf->sf_addr = *psfsrc; if (psf_prev) { - psf_prev->sf_next = psf; - } else - pmc->mca_sources = psf; + rcu_assign_pointer(psf_prev->sf_next, psf); + } 
else { + rcu_assign_pointer(pmc->mca_sources, psf); + } } psf->sf_count[sfmode]++; return 0; @@ -2232,13 +2276,15 @@ static void sf_markstate(struct ifmcaddr6 *pmc) struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) + for_each_psf_rtnl(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; - } else + } else { psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; + } + } } static int sf_setstate(struct ifmcaddr6 *pmc) @@ -2249,7 +2295,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) int new_in, rv; rv = 0; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) { + for_each_psf_rtnl(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; @@ -2259,8 +2305,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) if (!psf->sf_oldin) { struct ip6_sf_list *prev = NULL; - for (dpsf = pmc->mca_tomb; dpsf; - dpsf = dpsf->sf_next) { + for_each_psf_tomb(pmc, dpsf) { if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; @@ -2268,10 +2313,12 @@ static int sf_setstate(struct ifmcaddr6 *pmc) } if (dpsf) { if (prev) - prev->sf_next = dpsf->sf_next; + rcu_assign_pointer(prev->sf_next, + rtnl_dereference(dpsf->sf_next)); else - pmc->mca_tomb = dpsf->sf_next; - kfree(dpsf); + rcu_assign_pointer(pmc->mca_tomb, + rtnl_dereference(dpsf->sf_next)); + kfree_rcu(dpsf, rcu); } psf->sf_crcount = qrv; rv++; @@ -2282,7 +2329,8 @@ static int sf_setstate(struct ifmcaddr6 *pmc) * add or update "delete" records if an active filter * is now inactive */ - for (dpsf = pmc->mca_tomb; dpsf; dpsf = dpsf->sf_next) + + for_each_psf_tomb(pmc, dpsf) if (ipv6_addr_equal(&dpsf->sf_addr, &psf->sf_addr)) break; @@ -2291,9 +2339,9 @@ static int sf_setstate(struct ifmcaddr6 *pmc) if (!dpsf) continue; *dpsf = *psf; - /* pmc->mca_lock held by callers */ - dpsf->sf_next = 
pmc->mca_tomb; - pmc->mca_tomb = dpsf; + rcu_assign_pointer(dpsf->sf_next, + rtnl_dereference(pmc->mca_tomb)); + rcu_assign_pointer(pmc->mca_tomb, dpsf); } dpsf->sf_crcount = qrv; rv++; @@ -2356,7 +2404,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for (psf = pmc->mca_sources; psf; psf = psf->sf_next) + for_each_psf_rtnl(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(idev); } else if (sf_setstate(pmc)) @@ -2370,16 +2418,20 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; - for (psf = pmc->mca_tomb; psf; psf = nextpsf) { - nextpsf = psf->sf_next; - kfree(psf); + for (psf = rtnl_dereference(pmc->mca_tomb); + psf; + psf = nextpsf) { + nextpsf = rtnl_dereference(psf->sf_next); + kfree_rcu(psf, rcu); } - pmc->mca_tomb = NULL; - for (psf = pmc->mca_sources; psf; psf = nextpsf) { - nextpsf = psf->sf_next; - kfree(psf); + RCU_INIT_POINTER(pmc->mca_tomb, NULL); + for (psf = rtnl_dereference(pmc->mca_sources); + psf; + psf = nextpsf) { + nextpsf = rtnl_dereference(psf->sf_next); + kfree_rcu(psf, rcu); } - pmc->mca_sources = NULL; + RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; pmc->mca_sfcount[MCAST_EXCLUDE] = 1; @@ -2789,7 +2841,7 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) im = idev->mc_list; if (likely(im)) { spin_lock_bh(&im->mca_lock); - psf = im->mca_sources; + psf = rcu_dereference(im->mca_sources); if (likely(psf)) { state->im = im; state->idev = idev; @@ -2806,7 +2858,7 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); - psf = psf->sf_next; + psf = rcu_dereference(psf->sf_next); while (!psf) { spin_unlock_bh(&state->im->mca_lock); state->im = state->im->next; @@ -2828,7 +2880,7 @@ static struct ip6_sf_list 
*igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s if (!state->im) break; spin_lock_bh(&state->im->mca_lock); - psf = state->im->mca_sources; + psf = rcu_dereference(state->im->mca_sources); } out: return psf; -- cgit v1.2.3 From 88e2ca3080947fe22eb520c1f8231e79a105d011 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 25 Mar 2021 16:16:55 +0000 Subject: mld: convert ifmcaddr6 to RCU The ifmcaddr6 has been protected by inet6_dev->lock(rwlock) so that the critical section is atomic context. In order to switch this context, changing locking is needed. The ifmcaddr6 actually already protected by RTNL So if it's converted to use RCU, its control path context can be switched to sleepable. Suggested-by: Cong Wang Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_main.c | 6 +- include/net/if_inet6.h | 7 +- net/batman-adv/multicast.c | 6 +- net/ipv6/addrconf.c | 9 +- net/ipv6/addrconf_core.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipv6/mcast.c | 296 +++++++++++++++++----------------------- 7 files changed, 140 insertions(+), 188 deletions(-) (limited to 'include') diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 35b42275a06c..d308ff744a29 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -1098,8 +1098,9 @@ walk_ipv6: tmp.disp_flag = QETH_DISP_ADDR_ADD; tmp.is_multicast = 1; - read_lock_bh(&in6_dev->lock); - for (im6 = in6_dev->mc_list; im6 != NULL; im6 = im6->next) { + for (im6 = rtnl_dereference(in6_dev->mc_list); + im6; + im6 = rtnl_dereference(im6->next)) { tmp.u.a6.addr = im6->mca_addr; ipm = qeth_l3_find_addr_by_ip(card, &tmp); @@ -1117,7 +1118,6 @@ walk_ipv6: qeth_l3_ipaddr_hash(ipm)); } - read_unlock_bh(&in6_dev->lock); out: return 0; diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 7875a3208426..521158e05c18 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -115,7 +115,7 @@ struct ip6_sf_list { struct ifmcaddr6 { 
struct in6_addr mca_addr; struct inet6_dev *idev; - struct ifmcaddr6 *next; + struct ifmcaddr6 __rcu *next; struct ip6_sf_list __rcu *mca_sources; struct ip6_sf_list __rcu *mca_tomb; unsigned int mca_sfmode; @@ -128,6 +128,7 @@ struct ifmcaddr6 { spinlock_t mca_lock; unsigned long mca_cstamp; unsigned long mca_tstamp; + struct rcu_head rcu; }; /* Anycast stuff */ @@ -166,8 +167,8 @@ struct inet6_dev { struct list_head addr_list; - struct ifmcaddr6 *mc_list; - struct ifmcaddr6 *mc_tomb; + struct ifmcaddr6 __rcu *mc_list; + struct ifmcaddr6 __rcu *mc_tomb; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 28166402d30c..1d63c8cbbfe7 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -454,8 +454,9 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, return 0; } - read_lock_bh(&in6_dev->lock); - for (pmc6 = in6_dev->mc_list; pmc6; pmc6 = pmc6->next) { + for (pmc6 = rcu_dereference(in6_dev->mc_list); + pmc6; + pmc6 = rcu_dereference(pmc6->next)) { if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; @@ -484,7 +485,6 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, hlist_add_head(&new->list, mcast_list); ret++; } - read_unlock_bh(&in6_dev->lock); rcu_read_unlock(); return ret; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f2337fb756ac..b502f78d5091 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5107,17 +5107,20 @@ next: break; } case MULTICAST_ADDR: + read_unlock_bh(&idev->lock); fillargs->event = RTM_GETMULTICAST; /* multicast address */ - for (ifmca = idev->mc_list; ifmca; - ifmca = ifmca->next, ip_idx++) { + for (ifmca = rcu_dereference(idev->mc_list); + ifmca; + ifmca = rcu_dereference(ifmca->next), ip_idx++) { if (ip_idx < s_ip_idx) continue; err = inet6_fill_ifmcaddr(skb, ifmca, fillargs); if (err < 0) break; } + read_lock_bh(&idev->lock); break; case 
ANYCAST_ADDR: fillargs->event = RTM_GETANYCAST; @@ -6093,10 +6096,8 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) { - rcu_read_lock_bh(); if (likely(ifp->idev->dead == 0)) __ipv6_ifa_notify(event, ifp); - rcu_read_unlock_bh(); } #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index c70c192bc91b..a36626afbc02 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -250,7 +250,7 @@ void in6_dev_finish_destroy(struct inet6_dev *idev) struct net_device *dev = idev->dev; WARN_ON(!list_empty(&idev->addr_list)); - WARN_ON(idev->mc_list); + WARN_ON(rcu_access_pointer(idev->mc_list)); WARN_ON(timer_pending(&idev->rs_timer)); #ifdef NET_REFCNT_DEBUG diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 802f5111805a..3c9bacffc9c3 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -222,7 +222,7 @@ lookup_protocol: inet->mc_loop = 1; inet->mc_ttl = 1; inet->mc_index = 0; - inet->mc_list = NULL; + RCU_INIT_POINTER(inet->mc_list, NULL); inet->rcv_tos = 0; if (net->ipv4.sysctl_ip_no_pmtu_disc) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index bc0fb4815c97..75541cf53153 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -112,6 +112,11 @@ int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; * socket join on multicast group */ +#define for_each_pmc_rtnl(np, pmc) \ + for (pmc = rtnl_dereference((np)->ipv6_mc_list); \ + pmc; \ + pmc = rtnl_dereference(pmc->next)) + #define for_each_pmc_rcu(np, pmc) \ for (pmc = rcu_dereference((np)->ipv6_mc_list); \ pmc; \ @@ -132,6 +137,21 @@ int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; psf; \ psf = rtnl_dereference(psf->sf_next)) +#define for_each_mc_rtnl(idev, mc) \ + for (mc = rtnl_dereference((idev)->mc_list); \ + mc; \ + mc = rtnl_dereference(mc->next)) + +#define for_each_mc_rcu(idev, mc) \ + for (mc = rcu_dereference((idev)->mc_list); \ + mc; \ + mc = 
rcu_dereference(mc->next)) + +#define for_each_mc_tomb(idev, mc) \ + for (mc = rtnl_dereference((idev)->mc_tomb); \ + mc; \ + mc = rtnl_dereference(mc->next)) + static int unsolicited_report_interval(struct inet6_dev *idev) { int iv; @@ -158,15 +178,11 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, if (!ipv6_addr_is_multicast(addr)) return -EINVAL; - rcu_read_lock(); - for_each_pmc_rcu(np, mc_lst) { + for_each_pmc_rtnl(np, mc_lst) { if ((ifindex == 0 || mc_lst->ifindex == ifindex) && - ipv6_addr_equal(&mc_lst->addr, addr)) { - rcu_read_unlock(); + ipv6_addr_equal(&mc_lst->addr, addr)) return -EADDRINUSE; - } } - rcu_read_unlock(); mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); @@ -268,10 +284,9 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) } EXPORT_SYMBOL(ipv6_sock_mc_drop); -/* called with rcu_read_lock() */ -static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, - const struct in6_addr *group, - int ifindex) +static struct inet6_dev *ip6_mc_find_dev_rtnl(struct net *net, + const struct in6_addr *group, + int ifindex) { struct net_device *dev = NULL; struct inet6_dev *idev = NULL; @@ -283,19 +298,17 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, dev = rt->dst.dev; ip6_rt_put(rt); } - } else - dev = dev_get_by_index_rcu(net, ifindex); + } else { + dev = __dev_get_by_index(net, ifindex); + } if (!dev) return NULL; idev = __in6_dev_get(dev); if (!idev) return NULL; - read_lock_bh(&idev->lock); - if (idev->dead) { - read_unlock_bh(&idev->lock); + if (idev->dead) return NULL; - } return idev; } @@ -357,16 +370,13 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - rcu_read_lock(); - idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface); - if (!idev) { - rcu_read_unlock(); + idev = ip6_mc_find_dev_rtnl(net, group, pgsr->gsr_interface); + if (!idev) return -ENODEV; - } err = -EADDRNOTAVAIL; - 
for_each_pmc_rcu(inet6, pmc) { + for_each_pmc_rtnl(inet6, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) @@ -459,8 +469,6 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: - read_unlock_bh(&idev->lock); - rcu_read_unlock(); if (leavegroup) err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; @@ -486,13 +494,9 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; - rcu_read_lock(); - idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); - - if (!idev) { - rcu_read_unlock(); + idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface); + if (!idev) return -ENODEV; - } err = 0; @@ -501,7 +505,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, goto done; } - for_each_pmc_rcu(inet6, pmc) { + for_each_pmc_rtnl(inet6, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(&pmc->addr, group)) @@ -548,8 +552,6 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, pmc->sfmode = gsf->gf_fmode; err = 0; done: - read_unlock_bh(&idev->lock); - rcu_read_unlock(); if (leavegroup) err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group); return err; @@ -571,13 +573,9 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - rcu_read_lock(); - idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface); - - if (!idev) { - rcu_read_unlock(); + idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface); + if (!idev) return -ENODEV; - } err = -EADDRNOTAVAIL; /* changes to the ipv6_mc_list require the socket lock and @@ -585,19 +583,18 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, * so reading the list is safe. 
*/ - for_each_pmc_rcu(inet6, pmc) { + for_each_pmc_rtnl(inet6, pmc) { if (pmc->ifindex != gsf->gf_interface) continue; if (ipv6_addr_equal(group, &pmc->addr)) break; } if (!pmc) /* must have a prior join */ - goto done; + return err; + gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; - read_unlock_bh(&idev->lock); - rcu_read_unlock(); copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; @@ -614,10 +611,6 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, return -EFAULT; } return 0; -done: - read_unlock_bh(&idev->lock); - rcu_read_unlock(); - return err; } bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, @@ -761,8 +754,8 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) } spin_unlock_bh(&im->mca_lock); - pmc->next = idev->mc_tomb; - idev->mc_tomb = pmc; + rcu_assign_pointer(pmc->next, idev->mc_tomb); + rcu_assign_pointer(idev->mc_tomb, pmc); } static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) @@ -772,16 +765,16 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) struct ifmcaddr6 *pmc, *pmc_prev; pmc_prev = NULL; - for (pmc = idev->mc_tomb; pmc; pmc = pmc->next) { + for_each_mc_tomb(idev, pmc) { if (ipv6_addr_equal(&pmc->mca_addr, pmca)) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) - pmc_prev->next = pmc->next; + rcu_assign_pointer(pmc_prev->next, pmc->next); else - idev->mc_tomb = pmc->next; + rcu_assign_pointer(idev->mc_tomb, pmc->next); } spin_lock_bh(&im->mca_lock); @@ -804,7 +797,7 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) } in6_dev_put(pmc->idev); ip6_mc_clear_src(pmc); - kfree(pmc); + kfree_rcu(pmc, rcu); } spin_unlock_bh(&im->mca_lock); } @@ -813,19 +806,18 @@ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; - pmc = idev->mc_tomb; - idev->mc_tomb = NULL; + pmc = rtnl_dereference(idev->mc_tomb); + 
RCU_INIT_POINTER(idev->mc_tomb, NULL); for (; pmc; pmc = nextpmc) { - nextpmc = pmc->next; + nextpmc = rtnl_dereference(pmc->next); ip6_mc_clear_src(pmc); in6_dev_put(pmc->idev); - kfree(pmc); + kfree_rcu(pmc, rcu); } /* clear dead sources, too */ - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + for_each_mc_rtnl(idev, pmc) { struct ip6_sf_list *psf, *psf_next; spin_lock_bh(&pmc->mca_lock); @@ -837,7 +829,6 @@ static void mld_clear_delrec(struct inet6_dev *idev) kfree_rcu(psf, rcu); } } - read_unlock_bh(&idev->lock); } static void mca_get(struct ifmcaddr6 *mc) @@ -849,7 +840,7 @@ static void ma_put(struct ifmcaddr6 *mc) { if (refcount_dec_and_test(&mc->mca_refcnt)) { in6_dev_put(mc->idev); - kfree(mc); + kfree_rcu(mc, rcu); } } @@ -900,17 +891,14 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, if (!idev) return -EINVAL; - write_lock_bh(&idev->lock); if (idev->dead) { - write_unlock_bh(&idev->lock); in6_dev_put(idev); return -ENODEV; } - for (mc = idev->mc_list; mc; mc = mc->next) { + for_each_mc_rtnl(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; - write_unlock_bh(&idev->lock); ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0); in6_dev_put(idev); return 0; @@ -919,19 +907,14 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, mc = mca_alloc(idev, addr, mode); if (!mc) { - write_unlock_bh(&idev->lock); in6_dev_put(idev); return -ENOMEM; } - mc->next = idev->mc_list; - idev->mc_list = mc; + rcu_assign_pointer(mc->next, idev->mc_list); + rcu_assign_pointer(idev->mc_list, mc); - /* Hold this for the code below before we unlock, - * it is already exposed via idev->mc_list. 
- */ mca_get(mc); - write_unlock_bh(&idev->lock); mld_del_delrec(idev, mc); igmp6_group_added(mc); @@ -950,16 +933,16 @@ EXPORT_SYMBOL(ipv6_dev_mc_inc); */ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { - struct ifmcaddr6 *ma, **map; + struct ifmcaddr6 *ma, __rcu **map; ASSERT_RTNL(); - write_lock_bh(&idev->lock); - for (map = &idev->mc_list; (ma = *map) != NULL; map = &ma->next) { + for (map = &idev->mc_list; + (ma = rtnl_dereference(*map)); + map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { if (--ma->mca_users == 0) { *map = ma->next; - write_unlock_bh(&idev->lock); igmp6_group_dropped(ma); ip6_mc_clear_src(ma); @@ -967,11 +950,9 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) ma_put(ma); return 0; } - write_unlock_bh(&idev->lock); return 0; } } - write_unlock_bh(&idev->lock); return -ENOENT; } @@ -1006,8 +987,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - for (mc = idev->mc_list; mc; mc = mc->next) { + for_each_mc_rcu(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, group)) break; } @@ -1030,7 +1010,6 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, } else rv = true; /* don't filter unspecified source */ } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); return rv; @@ -1082,9 +1061,8 @@ static void mld_dad_stop_work(struct inet6_dev *idev) } /* - * IGMP handling (alias multicast ICMPv6 messages) + * IGMP handling (alias multicast ICMPv6 messages) */ - static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { unsigned long delay = resptime; @@ -1422,15 +1400,14 @@ int igmp6_event_query(struct sk_buff *skb) return -EINVAL; } - read_lock_bh(&idev->lock); if (group_type == IPV6_ADDR_ANY) { - for (ma = idev->mc_list; ma; ma = ma->next) { + for_each_mc_rcu(idev, ma) { spin_lock_bh(&ma->mca_lock); 
igmp6_group_queried(ma, max_delay); spin_unlock_bh(&ma->mca_lock); } } else { - for (ma = idev->mc_list; ma; ma = ma->next) { + for_each_mc_rcu(idev, ma) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; spin_lock_bh(&ma->mca_lock); @@ -1452,7 +1429,6 @@ int igmp6_event_query(struct sk_buff *skb) break; } } - read_unlock_bh(&idev->lock); return 0; } @@ -1493,18 +1469,17 @@ int igmp6_event_report(struct sk_buff *skb) * Cancel the work for this group */ - read_lock_bh(&idev->lock); - for (ma = idev->mc_list; ma; ma = ma->next) { + for_each_mc_rcu(idev, ma) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { spin_lock(&ma->mca_lock); if (cancel_delayed_work(&ma->mca_work)) refcount_dec(&ma->mca_refcnt); - ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING); + ma->mca_flags &= ~(MAF_LAST_REPORTER | + MAF_TIMER_RUNNING); spin_unlock(&ma->mca_lock); break; } } - read_unlock_bh(&idev->lock); return 0; } @@ -1868,9 +1843,8 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) struct sk_buff *skb = NULL; int type; - read_lock_bh(&idev->lock); if (!pmc) { - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + for_each_mc_rtnl(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) continue; spin_lock_bh(&pmc->mca_lock); @@ -1890,7 +1864,6 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) skb = add_grec(skb, pmc, type, 0, 0, 0); spin_unlock_bh(&pmc->mca_lock); } - read_unlock_bh(&idev->lock); if (skb) mld_sendpack(skb); } @@ -1927,12 +1900,12 @@ static void mld_send_cr(struct inet6_dev *idev) struct sk_buff *skb = NULL; int type, dtype; - read_lock_bh(&idev->lock); - /* deleted MCA's */ pmc_prev = NULL; - for (pmc = idev->mc_tomb; pmc; pmc = pmc_next) { - pmc_next = pmc->next; + for (pmc = rtnl_dereference(idev->mc_tomb); + pmc; + pmc = pmc_next) { + pmc_next = rtnl_dereference(pmc->next); if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; @@ -1954,17 +1927,17 @@ static 
void mld_send_cr(struct inet6_dev *idev) !rcu_access_pointer(pmc->mca_tomb) && !rcu_access_pointer(pmc->mca_sources)) { if (pmc_prev) - pmc_prev->next = pmc_next; + rcu_assign_pointer(pmc_prev->next, pmc_next); else - idev->mc_tomb = pmc_next; + rcu_assign_pointer(idev->mc_tomb, pmc_next); in6_dev_put(pmc->idev); - kfree(pmc); + kfree_rcu(pmc, rcu); } else pmc_prev = pmc; } /* change recs */ - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + for_each_mc_rtnl(idev, pmc) { spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; @@ -1987,7 +1960,6 @@ static void mld_send_cr(struct inet6_dev *idev) } spin_unlock_bh(&pmc->mca_lock); } - read_unlock_bh(&idev->lock); if (!skb) return; (void) mld_sendpack(skb); @@ -2099,8 +2071,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev) return; skb = NULL; - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + for_each_mc_rtnl(idev, pmc) { spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; @@ -2109,7 +2080,6 @@ static void mld_send_initial_cr(struct inet6_dev *idev) skb = add_grec(skb, pmc, type, 0, 0, 1); spin_unlock_bh(&pmc->mca_lock); } - read_unlock_bh(&idev->lock); if (skb) mld_sendpack(skb); } @@ -2132,7 +2102,9 @@ static void mld_dad_work(struct work_struct *work) struct inet6_dev, mc_dad_work); + rtnl_lock(); mld_send_initial_cr(idev); + rtnl_unlock(); if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) @@ -2194,24 +2166,22 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + + for_each_mc_rtnl(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } - if (!pmc) { - /* MCA not found?? 
bug */ - read_unlock_bh(&idev->lock); + if (!pmc) return -ESRCH; - } spin_lock_bh(&pmc->mca_lock); + sf_markstate(pmc); if (!delta) { if (!pmc->mca_sfcount[sfmode]) { spin_unlock_bh(&pmc->mca_lock); - read_unlock_bh(&idev->lock); return -EINVAL; } + pmc->mca_sfcount[sfmode]--; } err = 0; @@ -2237,7 +2207,6 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, } else if (sf_setstate(pmc) || changerec) mld_ifc_event(pmc->idev); spin_unlock_bh(&pmc->mca_lock); - read_unlock_bh(&idev->lock); return err; } @@ -2363,16 +2332,13 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) { + + for_each_mc_rtnl(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } - if (!pmc) { - /* MCA not found?? bug */ - read_unlock_bh(&idev->lock); + if (!pmc) return -ESRCH; - } spin_lock_bh(&pmc->mca_lock); sf_markstate(pmc); @@ -2407,10 +2373,10 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, for_each_psf_rtnl(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(idev); - } else if (sf_setstate(pmc)) + } else if (sf_setstate(pmc)) { mld_ifc_event(idev); + } spin_unlock_bh(&pmc->mca_lock); - read_unlock_bh(&idev->lock); return err; } @@ -2485,9 +2451,10 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, static void igmp6_leave_group(struct ifmcaddr6 *ma) { if (mld_in_v1_mode(ma->idev)) { - if (ma->mca_flags & MAF_LAST_REPORTER) + if (ma->mca_flags & MAF_LAST_REPORTER) { igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REDUCTION); + } } else { mld_add_delrec(ma->idev, ma); mld_ifc_event(ma->idev); @@ -2500,8 +2467,12 @@ static void mld_gq_work(struct work_struct *work) struct inet6_dev, mc_gq_work); - idev->mc_gq_running = 0; + rtnl_lock(); mld_send_report(idev, NULL); + rtnl_unlock(); + + idev->mc_gq_running = 0; + in6_dev_put(idev); } @@ -2511,7 +2482,10 @@ static 
void mld_ifc_work(struct work_struct *work) struct inet6_dev, mc_ifc_work); + rtnl_lock(); mld_send_cr(idev); + rtnl_unlock(); + if (idev->mc_ifc_count) { idev->mc_ifc_count--; if (idev->mc_ifc_count) @@ -2525,6 +2499,7 @@ static void mld_ifc_event(struct inet6_dev *idev) { if (mld_in_v1_mode(idev)) return; + idev->mc_ifc_count = idev->mc_qrv; mld_ifc_start_work(idev, 1); } @@ -2534,10 +2509,12 @@ static void mld_mca_work(struct work_struct *work) struct ifmcaddr6 *ma = container_of(to_delayed_work(work), struct ifmcaddr6, mca_work); + rtnl_lock(); if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); + rtnl_unlock(); spin_lock_bh(&ma->mca_lock); ma->mca_flags |= MAF_LAST_REPORTER; @@ -2554,10 +2531,8 @@ void ipv6_mc_unmap(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ - read_lock_bh(&idev->lock); - for (i = idev->mc_list; i; i = i->next) + for_each_mc_rtnl(idev, i) igmp6_group_dropped(i); - read_unlock_bh(&idev->lock); } void ipv6_mc_remap(struct inet6_dev *idev) @@ -2572,10 +2547,7 @@ void ipv6_mc_down(struct inet6_dev *idev) struct ifmcaddr6 *i; /* Withdraw multicast list */ - - read_lock_bh(&idev->lock); - - for (i = idev->mc_list; i; i = i->next) + for_each_mc_rtnl(idev, i) igmp6_group_dropped(i); /* Should stop work after group drop. or we will @@ -2584,7 +2556,6 @@ void ipv6_mc_down(struct inet6_dev *idev) mld_ifc_stop_work(idev); mld_gq_stop_work(idev); mld_dad_stop_work(idev); - read_unlock_bh(&idev->lock); } static void ipv6_mc_reset(struct inet6_dev *idev) @@ -2604,28 +2575,24 @@ void ipv6_mc_up(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ - read_lock_bh(&idev->lock); ipv6_mc_reset(idev); - for (i = idev->mc_list; i; i = i->next) { + for_each_mc_rtnl(idev, i) { mld_del_delrec(idev, i); igmp6_group_added(i); } - read_unlock_bh(&idev->lock); } /* IPv6 device initialization. 
*/ void ipv6_mc_init_dev(struct inet6_dev *idev) { - write_lock_bh(&idev->lock); idev->mc_gq_running = 0; INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work); - idev->mc_tomb = NULL; + RCU_INIT_POINTER(idev->mc_tomb, NULL); idev->mc_ifc_count = 0; INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work); INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work); ipv6_mc_reset(idev); - write_unlock_bh(&idev->lock); } /* @@ -2650,16 +2617,12 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) if (idev->cnf.forwarding) __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters); - write_lock_bh(&idev->lock); - while ((i = idev->mc_list) != NULL) { - idev->mc_list = i->next; + while ((i = rtnl_dereference(idev->mc_list))) { + rcu_assign_pointer(idev->mc_list, rtnl_dereference(i->next)); - write_unlock_bh(&idev->lock); ip6_mc_clear_src(i); ma_put(i); - write_lock_bh(&idev->lock); } - write_unlock_bh(&idev->lock); } static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) @@ -2669,12 +2632,11 @@ static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) ASSERT_RTNL(); if (mld_in_v1_mode(idev)) { - read_lock_bh(&idev->lock); - for (pmc = idev->mc_list; pmc; pmc = pmc->next) + for_each_mc_rtnl(idev, pmc) igmp6_join_group(pmc); - read_unlock_bh(&idev->lock); - } else + } else { mld_send_report(idev, NULL); + } } static int ipv6_mc_netdev_event(struct notifier_block *this, @@ -2721,13 +2683,12 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) idev = __in6_dev_get(state->dev); if (!idev) continue; - read_lock_bh(&idev->lock); - im = idev->mc_list; + + im = rcu_dereference(idev->mc_list); if (im) { state->idev = idev; break; } - read_unlock_bh(&idev->lock); } return im; } @@ -2736,11 +2697,8 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); - im = im->next; + im = rcu_dereference(im->next); while (!im) { - if (likely(state->idev)) - read_unlock_bh(&state->idev->lock); - 
state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; @@ -2749,8 +2707,7 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; - read_lock_bh(&state->idev->lock); - im = state->idev->mc_list; + im = rcu_dereference(state->idev->mc_list); } return im; } @@ -2784,10 +2741,8 @@ static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) { struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); - if (likely(state->idev)) { - read_unlock_bh(&state->idev->lock); + if (likely(state->idev)) state->idev = NULL; - } state->dev = NULL; rcu_read_unlock(); } @@ -2802,7 +2757,7 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v) state->dev->ifindex, state->dev->name, &im->mca_addr, im->mca_users, im->mca_flags, - (im->mca_flags&MAF_TIMER_RUNNING) ? + (im->mca_flags & MAF_TIMER_RUNNING) ? jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0); return 0; } @@ -2837,8 +2792,8 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) idev = __in6_dev_get(state->dev); if (unlikely(idev == NULL)) continue; - read_lock_bh(&idev->lock); - im = idev->mc_list; + + im = rcu_dereference(idev->mc_list); if (likely(im)) { spin_lock_bh(&im->mca_lock); psf = rcu_dereference(im->mca_sources); @@ -2849,7 +2804,6 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) } spin_unlock_bh(&im->mca_lock); } - read_unlock_bh(&idev->lock); } return psf; } @@ -2861,11 +2815,8 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s psf = rcu_dereference(psf->sf_next); while (!psf) { spin_unlock_bh(&state->im->mca_lock); - state->im = state->im->next; + state->im = rcu_dereference(state->im->next); while (!state->im) { - if (likely(state->idev)) - read_unlock_bh(&state->idev->lock); - state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; @@ -2874,8 
+2825,7 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s state->idev = __in6_dev_get(state->dev); if (!state->idev) continue; - read_lock_bh(&state->idev->lock); - state->im = state->idev->mc_list; + state->im = rcu_dereference(state->idev->mc_list); } if (!state->im) break; @@ -2917,14 +2867,14 @@ static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + if (likely(state->im)) { spin_unlock_bh(&state->im->mca_lock); state->im = NULL; } - if (likely(state->idev)) { - read_unlock_bh(&state->idev->lock); + if (likely(state->idev)) state->idev = NULL; - } + state->dev = NULL; rcu_read_unlock(); } -- cgit v1.2.3 From f185de28d9ae6c978135993769352e523ee8df06 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 25 Mar 2021 16:16:56 +0000 Subject: mld: add new workqueues for process mld events When query/report packets are received, mld module processes them. But they are processed under BH context so it couldn't use sleepable functions. So, in order to switch context, the two workqueues are added which processes query and report event. In the struct inet6_dev, mc_{query | report}_queue are added so it is per-interface queue. And mc_{query | report}_work are workqueue structure. When the query or report event is received, skb is queued to proper queue and worker function is scheduled immediately. Workqueues and queues are protected by spinlock, which is mc_{query | report}_lock, and worker functions are protected by RTNL. Suggested-by: Cong Wang Signed-off-by: Taehee Yoo Signed-off-by: David S. 
Miller --- include/net/if_inet6.h | 9 +- include/net/mld.h | 3 + net/ipv6/icmp.c | 4 +- net/ipv6/mcast.c | 280 ++++++++++++++++++++++++++++++++++--------------- 4 files changed, 210 insertions(+), 86 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 521158e05c18..882e0f88756f 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -125,7 +125,6 @@ struct ifmcaddr6 { unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; - spinlock_t mca_lock; unsigned long mca_cstamp; unsigned long mca_tstamp; struct rcu_head rcu; @@ -183,6 +182,14 @@ struct inet6_dev { struct delayed_work mc_gq_work; /* general query work */ struct delayed_work mc_ifc_work; /* interface change work */ struct delayed_work mc_dad_work; /* dad complete mc work */ + struct delayed_work mc_query_work; /* mld query work */ + struct delayed_work mc_report_work; /* mld report work */ + + struct sk_buff_head mc_query_queue; /* mld query queue */ + struct sk_buff_head mc_report_queue; /* mld report queue */ + + spinlock_t mc_query_lock; /* mld query queue lock */ + spinlock_t mc_report_lock; /* mld query report lock */ struct ifacaddr6 *ac_list; rwlock_t lock; diff --git a/include/net/mld.h b/include/net/mld.h index 496bddb59942..c07359808493 100644 --- a/include/net/mld.h +++ b/include/net/mld.h @@ -92,6 +92,9 @@ struct mld2_query { #define MLD_EXP_MIN_LIMIT 32768UL #define MLDV1_MRD_MAX_COMPAT (MLD_EXP_MIN_LIMIT - 1) +#define MLD_MAX_QUEUE 8 +#define MLD_MAX_SKBS 32 + static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) { /* RFC3810, 5.1.3. 
Maximum Response Code */ diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index fd1f896115c1..29d38d6b55fb 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -944,11 +944,11 @@ static int icmpv6_rcv(struct sk_buff *skb) case ICMPV6_MGM_QUERY: igmp6_event_query(skb); - break; + return 0; case ICMPV6_MGM_REPORT: igmp6_event_report(skb); - break; + return 0; case ICMPV6_MGM_REDUCTION: case ICMPV6_NI_QUERY: diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 75541cf53153..3ad754388933 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -439,7 +439,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (psl) count += psl->sl_max; - newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC); + newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -517,7 +517,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, } if (gsf->gf_numsrc) { newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc), - GFP_ATOMIC); + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -659,13 +659,11 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) IPV6_ADDR_SCOPE_LINKLOCAL) return; - spin_lock_bh(&mc->mca_lock); if (!(mc->mca_flags&MAF_LOADED)) { mc->mca_flags |= MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } - spin_unlock_bh(&mc->mca_lock); if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) return; @@ -695,24 +693,20 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) IPV6_ADDR_SCOPE_LINKLOCAL) return; - spin_lock_bh(&mc->mca_lock); if (mc->mca_flags&MAF_LOADED) { mc->mca_flags &= ~MAF_LOADED; if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } - spin_unlock_bh(&mc->mca_lock); if (mc->mca_flags & MAF_NOREPORT) return; if (!mc->idev->dead) igmp6_leave_group(mc); - spin_lock_bh(&mc->mca_lock); if (cancel_delayed_work(&mc->mca_work)) refcount_dec(&mc->mca_refcnt); - spin_unlock_bh(&mc->mca_lock); } /* @@ -728,12 +722,10 @@ static 
void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ - pmc = kzalloc(sizeof(*pmc), GFP_ATOMIC); + pmc = kzalloc(sizeof(*pmc), GFP_KERNEL); if (!pmc) return; - spin_lock_bh(&im->mca_lock); - spin_lock_init(&pmc->mca_lock); pmc->idev = im->idev; in6_dev_hold(idev); pmc->mca_addr = im->mca_addr; @@ -752,7 +744,6 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) for_each_psf_rtnl(pmc, psf) psf->sf_crcount = pmc->mca_crcount; } - spin_unlock_bh(&im->mca_lock); rcu_assign_pointer(pmc->next, idev->mc_tomb); rcu_assign_pointer(idev->mc_tomb, pmc); @@ -777,7 +768,6 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) rcu_assign_pointer(idev->mc_tomb, pmc->next); } - spin_lock_bh(&im->mca_lock); if (pmc) { im->idev = pmc->idev; if (im->mca_sfmode == MCAST_INCLUDE) { @@ -799,7 +789,6 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) ip6_mc_clear_src(pmc); kfree_rcu(pmc, rcu); } - spin_unlock_bh(&im->mca_lock); } static void mld_clear_delrec(struct inet6_dev *idev) @@ -820,10 +809,8 @@ static void mld_clear_delrec(struct inet6_dev *idev) for_each_mc_rtnl(idev, pmc) { struct ip6_sf_list *psf, *psf_next; - spin_lock_bh(&pmc->mca_lock); psf = rtnl_dereference(pmc->mca_tomb); RCU_INIT_POINTER(pmc->mca_tomb, NULL); - spin_unlock_bh(&pmc->mca_lock); for (; psf; psf = psf_next) { psf_next = rtnl_dereference(psf->sf_next); kfree_rcu(psf, rcu); @@ -831,6 +818,26 @@ static void mld_clear_delrec(struct inet6_dev *idev) } } +static void mld_clear_query(struct inet6_dev *idev) +{ + struct sk_buff *skb; + + spin_lock_bh(&idev->mc_query_lock); + while ((skb = __skb_dequeue(&idev->mc_query_queue))) + kfree_skb(skb); + spin_unlock_bh(&idev->mc_query_lock); +} + +static void mld_clear_report(struct inet6_dev *idev) +{ + struct sk_buff *skb; + + spin_lock_bh(&idev->mc_report_lock); + while ((skb = 
__skb_dequeue(&idev->mc_report_queue))) + kfree_skb(skb); + spin_unlock_bh(&idev->mc_report_lock); +} + static void mca_get(struct ifmcaddr6 *mc) { refcount_inc(&mc->mca_refcnt); @@ -850,7 +857,7 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, { struct ifmcaddr6 *mc; - mc = kzalloc(sizeof(*mc), GFP_ATOMIC); + mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return NULL; @@ -862,7 +869,6 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, /* mca_stamp should be updated upon changes */ mc->mca_cstamp = mc->mca_tstamp = jiffies; refcount_set(&mc->mca_refcnt, 1); - spin_lock_init(&mc->mca_lock); mc->mca_sfmode = mode; mc->mca_sfcount[mode] = 1; @@ -995,7 +1001,6 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, if (src_addr && !ipv6_addr_any(src_addr)) { struct ip6_sf_list *psf; - spin_lock_bh(&mc->mca_lock); for_each_psf_rcu(mc, psf) { if (ipv6_addr_equal(&psf->sf_addr, src_addr)) break; @@ -1006,7 +1011,6 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, mc->mca_sfcount[MCAST_EXCLUDE]; else rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; - spin_unlock_bh(&mc->mca_lock); } else rv = true; /* don't filter unspecified source */ } @@ -1060,6 +1064,20 @@ static void mld_dad_stop_work(struct inet6_dev *idev) __in6_dev_put(idev); } +static void mld_query_stop_work(struct inet6_dev *idev) +{ + spin_lock_bh(&idev->mc_query_lock); + if (cancel_delayed_work(&idev->mc_query_work)) + __in6_dev_put(idev); + spin_unlock_bh(&idev->mc_query_lock); +} + +static void mld_report_stop_work(struct inet6_dev *idev) +{ + if (cancel_delayed_work_sync(&idev->mc_report_work)) + __in6_dev_put(idev); +} + /* * IGMP handling (alias multicast ICMPv6 messages) */ @@ -1093,7 +1111,7 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, int i, scount; scount = 0; - for_each_psf_rcu(pmc, psf) { + for_each_psf_rtnl(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { @@ -1126,7 +1144,7 @@ 
static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, /* mark INCLUDE-mode sources */ scount = 0; - for_each_psf_rcu(pmc, psf) { + for_each_psf_rtnl(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { @@ -1317,19 +1335,42 @@ static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, /* called with rcu_read_lock() */ int igmp6_event_query(struct sk_buff *skb) +{ + struct inet6_dev *idev = __in6_dev_get(skb->dev); + + if (!idev) + return -EINVAL; + + if (idev->dead) { + kfree_skb(skb); + return -ENODEV; + } + + spin_lock_bh(&idev->mc_query_lock); + if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { + __skb_queue_tail(&idev->mc_query_queue, skb); + if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) + in6_dev_hold(idev); + } + spin_unlock_bh(&idev->mc_query_lock); + + return 0; +} + +static void __mld_query_work(struct sk_buff *skb) { struct mld2_query *mlh2 = NULL; - struct ifmcaddr6 *ma; const struct in6_addr *group; unsigned long max_delay; struct inet6_dev *idev; + struct ifmcaddr6 *ma; struct mld_msg *mld; int group_type; int mark = 0; int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) - return -EINVAL; + goto out; /* compute payload length excluding extension headers */ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); @@ -1346,11 +1387,11 @@ int igmp6_event_query(struct sk_buff *skb) ipv6_hdr(skb)->hop_limit != 1 || !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) - return -EINVAL; + goto out; idev = __in6_dev_get(skb->dev); if (!idev) - return 0; + goto out; mld = (struct mld_msg *)icmp6_hdr(skb); group = &mld->mld_mca; @@ -1358,59 +1399,56 @@ int igmp6_event_query(struct sk_buff *skb) if (group_type != IPV6_ADDR_ANY && !(group_type&IPV6_ADDR_MULTICAST)) - return -EINVAL; + goto out; if (len < MLD_V1_QUERY_LEN) { - return -EINVAL; + goto out; } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) { err = mld_process_v1(idev, 
mld, &max_delay, len == MLD_V1_QUERY_LEN); if (err < 0) - return err; + goto out; } else if (len >= MLD_V2_QUERY_LEN_MIN) { int srcs_offset = sizeof(struct mld2_query) - sizeof(struct icmp6hdr); if (!pskb_may_pull(skb, srcs_offset)) - return -EINVAL; + goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); err = mld_process_v2(idev, mlh2, &max_delay); if (err < 0) - return err; + goto out; if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) - return -EINVAL; /* no sources allowed */ + goto out; /* no sources allowed */ mld_gq_start_work(idev); - return 0; + goto out; } /* mark sources to include, if group & source-specific */ if (mlh2->mld2q_nsrcs != 0) { if (!pskb_may_pull(skb, srcs_offset + ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr))) - return -EINVAL; + goto out; mlh2 = (struct mld2_query *)skb_transport_header(skb); mark = 1; } } else { - return -EINVAL; + goto out; } if (group_type == IPV6_ADDR_ANY) { - for_each_mc_rcu(idev, ma) { - spin_lock_bh(&ma->mca_lock); + for_each_mc_rtnl(idev, ma) { igmp6_group_queried(ma, max_delay); - spin_unlock_bh(&ma->mca_lock); } } else { - for_each_mc_rcu(idev, ma) { + for_each_mc_rtnl(idev, ma) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; - spin_lock_bh(&ma->mca_lock); if (ma->mca_flags & MAF_TIMER_RUNNING) { /* gsquery <- gsquery && mark */ if (!mark) @@ -1425,16 +1463,72 @@ int igmp6_event_query(struct sk_buff *skb) if (!(ma->mca_flags & MAF_GSQUERY) || mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs)) igmp6_group_queried(ma, max_delay); - spin_unlock_bh(&ma->mca_lock); break; } } - return 0; +out: + consume_skb(skb); +} + +static void mld_query_work(struct work_struct *work) +{ + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_query_work); + struct sk_buff_head q; + struct sk_buff *skb; + bool rework = false; + int cnt = 0; + + skb_queue_head_init(&q); + + spin_lock_bh(&idev->mc_query_lock); + while ((skb = 
__skb_dequeue(&idev->mc_query_queue))) { + __skb_queue_tail(&q, skb); + + if (++cnt >= MLD_MAX_QUEUE) { + rework = true; + schedule_delayed_work(&idev->mc_query_work, 0); + break; + } + } + spin_unlock_bh(&idev->mc_query_lock); + + rtnl_lock(); + while ((skb = __skb_dequeue(&q))) + __mld_query_work(skb); + rtnl_unlock(); + + if (!rework) + in6_dev_put(idev); } /* called with rcu_read_lock() */ int igmp6_event_report(struct sk_buff *skb) +{ + struct inet6_dev *idev = __in6_dev_get(skb->dev); + + if (!idev) + return -EINVAL; + + if (idev->dead) { + kfree_skb(skb); + return -ENODEV; + } + + spin_lock_bh(&idev->mc_report_lock); + if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { + __skb_queue_tail(&idev->mc_report_queue, skb); + if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) + in6_dev_hold(idev); + } + spin_unlock_bh(&idev->mc_report_lock); + + return 0; +} + +static void __mld_report_work(struct sk_buff *skb) { struct ifmcaddr6 *ma; struct inet6_dev *idev; @@ -1443,15 +1537,15 @@ int igmp6_event_report(struct sk_buff *skb) /* Our own report looped back. Ignore it. 
*/ if (skb->pkt_type == PACKET_LOOPBACK) - return 0; + goto out; /* send our report if the MC router may not have heard this report */ if (skb->pkt_type != PACKET_MULTICAST && skb->pkt_type != PACKET_BROADCAST) - return 0; + goto out; if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) - return -EINVAL; + goto out; mld = (struct mld_msg *)icmp6_hdr(skb); @@ -1459,28 +1553,60 @@ int igmp6_event_report(struct sk_buff *skb) addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); if (addr_type != IPV6_ADDR_ANY && !(addr_type&IPV6_ADDR_LINKLOCAL)) - return -EINVAL; + goto out; idev = __in6_dev_get(skb->dev); if (!idev) - return -ENODEV; + goto out; /* * Cancel the work for this group */ - for_each_mc_rcu(idev, ma) { + for_each_mc_rtnl(idev, ma) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { - spin_lock(&ma->mca_lock); if (cancel_delayed_work(&ma->mca_work)) refcount_dec(&ma->mca_refcnt); ma->mca_flags &= ~(MAF_LAST_REPORTER | MAF_TIMER_RUNNING); - spin_unlock(&ma->mca_lock); break; } } - return 0; + +out: + consume_skb(skb); +} + +static void mld_report_work(struct work_struct *work) +{ + struct inet6_dev *idev = container_of(to_delayed_work(work), + struct inet6_dev, + mc_report_work); + struct sk_buff_head q; + struct sk_buff *skb; + bool rework = false; + int cnt = 0; + + skb_queue_head_init(&q); + spin_lock_bh(&idev->mc_report_lock); + while ((skb = __skb_dequeue(&idev->mc_report_queue))) { + __skb_queue_tail(&q, skb); + + if (++cnt >= MLD_MAX_QUEUE) { + rework = true; + schedule_delayed_work(&idev->mc_report_work, 0); + break; + } + } + spin_unlock_bh(&idev->mc_report_lock); + + rtnl_lock(); + while ((skb = __skb_dequeue(&q))) + __mld_report_work(skb); + rtnl_unlock(); + + if (!rework) + in6_dev_put(idev); } static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, @@ -1847,22 +1973,18 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) for_each_mc_rtnl(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) 
continue; - spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); - spin_unlock_bh(&pmc->mca_lock); } } else { - spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_MODE_IS_EXCLUDE; else type = MLD2_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0, 0); - spin_unlock_bh(&pmc->mca_lock); } if (skb) mld_sendpack(skb); @@ -1938,7 +2060,6 @@ static void mld_send_cr(struct inet6_dev *idev) /* change recs */ for_each_mc_rtnl(idev, pmc) { - spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_ALLOW_NEW_SOURCES; @@ -1958,7 +2079,6 @@ static void mld_send_cr(struct inet6_dev *idev) skb = add_grec(skb, pmc, type, 0, 0, 0); pmc->mca_crcount--; } - spin_unlock_bh(&pmc->mca_lock); } if (!skb) return; @@ -2072,13 +2192,11 @@ static void mld_send_initial_cr(struct inet6_dev *idev) skb = NULL; for_each_mc_rtnl(idev, pmc) { - spin_lock_bh(&pmc->mca_lock); if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else type = MLD2_ALLOW_NEW_SOURCES; skb = add_grec(skb, pmc, type, 0, 0, 1); - spin_unlock_bh(&pmc->mca_lock); } if (skb) mld_sendpack(skb); @@ -2104,13 +2222,13 @@ static void mld_dad_work(struct work_struct *work) rtnl_lock(); mld_send_initial_cr(idev); - rtnl_unlock(); if (idev->mc_dad_count) { idev->mc_dad_count--; if (idev->mc_dad_count) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } + rtnl_unlock(); in6_dev_put(idev); } @@ -2173,12 +2291,10 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, } if (!pmc) return -ESRCH; - spin_lock_bh(&pmc->mca_lock); sf_markstate(pmc); if (!delta) { if (!pmc->mca_sfcount[sfmode]) { - spin_unlock_bh(&pmc->mca_lock); return -EINVAL; } @@ -2206,7 +2322,6 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, mld_ifc_event(pmc->idev); } else if 
(sf_setstate(pmc) || changerec) mld_ifc_event(pmc->idev); - spin_unlock_bh(&pmc->mca_lock); return err; } @@ -2225,7 +2340,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, psf_prev = psf; } if (!psf) { - psf = kzalloc(sizeof(*psf), GFP_ATOMIC); + psf = kzalloc(sizeof(*psf), GFP_KERNEL); if (!psf) return -ENOBUFS; @@ -2304,7 +2419,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) &psf->sf_addr)) break; if (!dpsf) { - dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); + dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL); if (!dpsf) continue; *dpsf = *psf; @@ -2339,7 +2454,6 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, } if (!pmc) return -ESRCH; - spin_lock_bh(&pmc->mca_lock); sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; @@ -2376,7 +2490,6 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, } else if (sf_setstate(pmc)) { mld_ifc_event(idev); } - spin_unlock_bh(&pmc->mca_lock); return err; } @@ -2415,7 +2528,6 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) delay = prandom_u32() % unsolicited_report_interval(ma->idev); - spin_lock_bh(&ma->mca_lock); if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); delay = ma->mca_work.timer.expires - jiffies; @@ -2424,7 +2536,6 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; - spin_unlock_bh(&ma->mca_lock); } static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, @@ -2469,9 +2580,8 @@ static void mld_gq_work(struct work_struct *work) rtnl_lock(); mld_send_report(idev, NULL); - rtnl_unlock(); - idev->mc_gq_running = 0; + rtnl_unlock(); in6_dev_put(idev); } @@ -2484,7 +2594,6 @@ static void mld_ifc_work(struct work_struct *work) rtnl_lock(); mld_send_cr(idev); - rtnl_unlock(); if (idev->mc_ifc_count) { idev->mc_ifc_count--; @@ -2492,6 +2601,7 @@ static void 
mld_ifc_work(struct work_struct *work) mld_ifc_start_work(idev, unsolicited_report_interval(idev)); } + rtnl_unlock(); in6_dev_put(idev); } @@ -2514,12 +2624,10 @@ static void mld_mca_work(struct work_struct *work) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); - rtnl_unlock(); - - spin_lock_bh(&ma->mca_lock); ma->mca_flags |= MAF_LAST_REPORTER; ma->mca_flags &= ~MAF_TIMER_RUNNING; - spin_unlock_bh(&ma->mca_lock); + rtnl_unlock(); + ma_put(ma); } @@ -2553,6 +2661,9 @@ void ipv6_mc_down(struct inet6_dev *idev) /* Should stop work after group drop. or we will * start work again in mld_ifc_event() */ + synchronize_net(); + mld_query_stop_work(idev); + mld_report_stop_work(idev); mld_ifc_stop_work(idev); mld_gq_stop_work(idev); mld_dad_stop_work(idev); @@ -2592,6 +2703,12 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) idev->mc_ifc_count = 0; INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work); INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work); + INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work); + INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work); + skb_queue_head_init(&idev->mc_query_queue); + skb_queue_head_init(&idev->mc_report_queue); + spin_lock_init(&idev->mc_query_lock); + spin_lock_init(&idev->mc_report_lock); ipv6_mc_reset(idev); } @@ -2606,6 +2723,8 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) /* Deactivate works */ ipv6_mc_down(idev); mld_clear_delrec(idev); + mld_clear_query(idev); + mld_clear_report(idev); /* Delete all-nodes address. 
*/ /* We cannot call ipv6_dev_mc_dec() directly, our caller in @@ -2795,14 +2914,12 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) im = rcu_dereference(idev->mc_list); if (likely(im)) { - spin_lock_bh(&im->mca_lock); psf = rcu_dereference(im->mca_sources); if (likely(psf)) { state->im = im; state->idev = idev; break; } - spin_unlock_bh(&im->mca_lock); } } return psf; @@ -2814,7 +2931,6 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s psf = rcu_dereference(psf->sf_next); while (!psf) { - spin_unlock_bh(&state->im->mca_lock); state->im = rcu_dereference(state->im->next); while (!state->im) { state->dev = next_net_device_rcu(state->dev); @@ -2829,7 +2945,6 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s } if (!state->im) break; - spin_lock_bh(&state->im->mca_lock); psf = rcu_dereference(state->im->mca_sources); } out: @@ -2868,10 +2983,8 @@ static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) { struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); - if (likely(state->im)) { - spin_unlock_bh(&state->im->mca_lock); + if (likely(state->im)) state->im = NULL; - } if (likely(state->idev)) state->idev = NULL; @@ -2955,6 +3068,7 @@ static int __net_init igmp6_net_init(struct net *net) } inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1; + net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL; err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); -- cgit v1.2.3 From 63ed8de4be81b699ca727e9f8e3344bd487806d7 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 25 Mar 2021 16:16:57 +0000 Subject: mld: add mc_lock for protecting per-interface mld data The purpose of this lock is to avoid a bottleneck in the query/report event handler logic. By previous patches, almost all mld data is protected by RTNL. So, the query and report event handler, which is data path logic acquires RTNL too. 
Therefore, if a lot of query and report events are received, it holds RTNL for a long time. This creates a control-plane bottleneck because of the RTNL usage. In order to avoid this bottleneck, mc_lock is added. mc_lock protects only per-interface mld data, and per-interface mld data is what the query/report event handler logic uses. So, rtnl_lock is no longer needed in the query/report event handler logic. Therefore, the bottleneck is eliminated by mc_lock. Suggested-by: Cong Wang Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/net/if_inet6.h | 1 + net/ipv6/mcast.c | 309 ++++++++++++++++++++++++++++++------------------- 2 files changed, 194 insertions(+), 116 deletions(-) (limited to 'include') diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index 882e0f88756f..71bb4cc4d05d 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -190,6 +190,7 @@ struct inet6_dev { spinlock_t mc_query_lock; /* mld query queue lock */ spinlock_t mc_report_lock; /* mld query report lock */ + struct mutex mc_lock; /* mld global lock */ struct ifacaddr6 *ac_list; rwlock_t lock; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 3ad754388933..49b0cebfdcdc 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -111,6 +111,8 @@ int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; /* * socket join on multicast group */ +#define mc_dereference(e, idev) \ + rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock)) #define for_each_pmc_rtnl(np, pmc) \ for (pmc = rtnl_dereference((np)->ipv6_mc_list); \ @@ -122,10 +124,10 @@ int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; pmc; \ pmc = rcu_dereference(pmc->next)) -#define for_each_psf_rtnl(mc, psf) \ - for (psf = rtnl_dereference((mc)->mca_sources); \ +#define for_each_psf_mclock(mc, psf) \ + for (psf = mc_dereference((mc)->mca_sources, mc->idev); \ psf; \ - psf = rtnl_dereference(psf->sf_next)) + psf = mc_dereference(psf->sf_next, mc->idev)) #define for_each_psf_rcu(mc, psf) \ for (psf = 
rcu_dereference((mc)->mca_sources); \ @@ -133,14 +135,14 @@ int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; psf = rcu_dereference(psf->sf_next)) #define for_each_psf_tomb(mc, psf) \ - for (psf = rtnl_dereference((mc)->mca_tomb); \ + for (psf = mc_dereference((mc)->mca_tomb, mc->idev); \ psf; \ - psf = rtnl_dereference(psf->sf_next)) + psf = mc_dereference(psf->sf_next, mc->idev)) -#define for_each_mc_rtnl(idev, mc) \ - for (mc = rtnl_dereference((idev)->mc_list); \ +#define for_each_mc_mclock(idev, mc) \ + for (mc = mc_dereference((idev)->mc_list, idev); \ mc; \ - mc = rtnl_dereference(mc->next)) + mc = mc_dereference(mc->next, idev)) #define for_each_mc_rcu(idev, mc) \ for (mc = rcu_dereference((idev)->mc_list); \ @@ -148,9 +150,9 @@ int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT; mc = rcu_dereference(mc->next)) #define for_each_mc_tomb(idev, mc) \ - for (mc = rtnl_dereference((idev)->mc_tomb); \ + for (mc = mc_dereference((idev)->mc_tomb, idev); \ mc; \ - mc = rtnl_dereference(mc->next)) + mc = mc_dereference(mc->next, idev)) static int unsolicited_report_interval(struct inet6_dev *idev) { @@ -268,11 +270,12 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); - (void) ip6_mc_leave_src(sk, mc_lst, idev); + ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); - } else - (void) ip6_mc_leave_src(sk, mc_lst, NULL); + } else { + ip6_mc_leave_src(sk, mc_lst, NULL); + } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); kfree_rcu(mc_lst, rcu); @@ -329,11 +332,12 @@ void __ipv6_sock_mc_close(struct sock *sk) if (dev) { struct inet6_dev *idev = __in6_dev_get(dev); - (void) ip6_mc_leave_src(sk, mc_lst, idev); + ip6_mc_leave_src(sk, mc_lst, idev); if (idev) __ipv6_dev_mc_dec(idev, &mc_lst->addr); - } else - (void) ip6_mc_leave_src(sk, mc_lst, NULL); + } else { + ip6_mc_leave_src(sk, mc_lst, NULL); + } atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc); 
kfree_rcu(mc_lst, rcu); @@ -376,6 +380,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, err = -EADDRNOTAVAIL; + mutex_lock(&idev->mc_lock); for_each_pmc_rtnl(inet6, pmc) { if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) continue; @@ -469,6 +474,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, /* update the interface list */ ip6_mc_add_src(idev, group, omode, 1, source, 1); done: + mutex_unlock(&idev->mc_lock); if (leavegroup) err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group); return err; @@ -529,25 +535,33 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } + mutex_lock(&idev->mc_lock); err = ip6_mc_add_src(idev, group, gsf->gf_fmode, - newpsl->sl_count, newpsl->sl_addr, 0); + newpsl->sl_count, newpsl->sl_addr, 0); if (err) { + mutex_unlock(&idev->mc_lock); sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); goto done; } + mutex_unlock(&idev->mc_lock); } else { newpsl = NULL; - (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); + mutex_lock(&idev->mc_lock); + ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0); + mutex_unlock(&idev->mc_lock); } + mutex_lock(&idev->mc_lock); psl = rtnl_dereference(pmc->sflist); if (psl) { - (void) ip6_mc_del_src(idev, group, pmc->sfmode, - psl->sl_count, psl->sl_addr, 0); + ip6_mc_del_src(idev, group, pmc->sfmode, + psl->sl_count, psl->sl_addr, 0); atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); kfree_rcu(psl, rcu); - } else - (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + } else { + ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + } + mutex_unlock(&idev->mc_lock); rcu_assign_pointer(pmc->sflist, newpsl); pmc->sfmode = gsf->gf_fmode; err = 0; @@ -650,6 +664,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, return rv; } +/* called with mc_lock */ static void igmp6_group_added(struct ifmcaddr6 *mc) { struct net_device *dev = 
mc->idev->dev; @@ -684,6 +699,7 @@ static void igmp6_group_added(struct ifmcaddr6 *mc) mld_ifc_event(mc->idev); } +/* called with mc_lock */ static void igmp6_group_dropped(struct ifmcaddr6 *mc) { struct net_device *dev = mc->idev->dev; @@ -711,6 +727,7 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc) /* * deleted ifmcaddr6 manipulation + * called with mc_lock */ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { @@ -735,13 +752,13 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) struct ip6_sf_list *psf; rcu_assign_pointer(pmc->mca_tomb, - rtnl_dereference(im->mca_tomb)); + mc_dereference(im->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_sources, - rtnl_dereference(im->mca_sources)); + mc_dereference(im->mca_sources, idev)); RCU_INIT_POINTER(im->mca_tomb, NULL); RCU_INIT_POINTER(im->mca_sources, NULL); - for_each_psf_rtnl(pmc, psf) + for_each_psf_mclock(pmc, psf) psf->sf_crcount = pmc->mca_crcount; } @@ -749,6 +766,7 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) rcu_assign_pointer(idev->mc_tomb, pmc); } +/* called with mc_lock */ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) { struct ip6_sf_list *psf, *sources, *tomb; @@ -772,15 +790,15 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) im->idev = pmc->idev; if (im->mca_sfmode == MCAST_INCLUDE) { tomb = rcu_replace_pointer(im->mca_tomb, - rtnl_dereference(pmc->mca_tomb), - lockdep_rtnl_is_held()); + mc_dereference(pmc->mca_tomb, pmc->idev), + lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_tomb, tomb); sources = rcu_replace_pointer(im->mca_sources, - rtnl_dereference(pmc->mca_sources), - lockdep_rtnl_is_held()); + mc_dereference(pmc->mca_sources, pmc->idev), + lockdep_is_held(&im->idev->mc_lock)); rcu_assign_pointer(pmc->mca_sources, sources); - for_each_psf_rtnl(im, psf) + for_each_psf_mclock(im, psf) psf->sf_crcount = idev->mc_qrv; } else { im->mca_crcount 
= idev->mc_qrv; @@ -791,28 +809,29 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) } } +/* called with mc_lock */ static void mld_clear_delrec(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *nextpmc; - pmc = rtnl_dereference(idev->mc_tomb); + pmc = mc_dereference(idev->mc_tomb, idev); RCU_INIT_POINTER(idev->mc_tomb, NULL); for (; pmc; pmc = nextpmc) { - nextpmc = rtnl_dereference(pmc->next); + nextpmc = mc_dereference(pmc->next, idev); ip6_mc_clear_src(pmc); in6_dev_put(pmc->idev); kfree_rcu(pmc, rcu); } /* clear dead sources, too */ - for_each_mc_rtnl(idev, pmc) { + for_each_mc_mclock(idev, pmc) { struct ip6_sf_list *psf, *psf_next; - psf = rtnl_dereference(pmc->mca_tomb); + psf = mc_dereference(pmc->mca_tomb, idev); RCU_INIT_POINTER(pmc->mca_tomb, NULL); for (; psf; psf = psf_next) { - psf_next = rtnl_dereference(psf->sf_next); + psf_next = mc_dereference(psf->sf_next, idev); kfree_rcu(psf, rcu); } } @@ -851,6 +870,7 @@ static void ma_put(struct ifmcaddr6 *mc) } } +/* called with mc_lock */ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, const struct in6_addr *addr, unsigned int mode) @@ -902,10 +922,12 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, return -ENODEV; } - for_each_mc_rtnl(idev, mc) { + mutex_lock(&idev->mc_lock); + for_each_mc_mclock(idev, mc) { if (ipv6_addr_equal(&mc->mca_addr, addr)) { mc->mca_users++; ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0); + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return 0; } @@ -913,6 +935,7 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, mc = mca_alloc(idev, addr, mode); if (!mc) { + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); return -ENOMEM; } @@ -924,6 +947,7 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, mld_del_delrec(idev, mc); igmp6_group_added(mc); + mutex_unlock(&idev->mc_lock); ma_put(mc); return 0; } @@ -935,7 +959,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) 
EXPORT_SYMBOL(ipv6_dev_mc_inc); /* - * device multicast group del + * device multicast group del */ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) { @@ -943,8 +967,9 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) ASSERT_RTNL(); + mutex_lock(&idev->mc_lock); for (map = &idev->mc_list; - (ma = rtnl_dereference(*map)); + (ma = mc_dereference(*map, idev)); map = &ma->next) { if (ipv6_addr_equal(&ma->mca_addr, addr)) { if (--ma->mca_users == 0) { @@ -952,14 +977,17 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) igmp6_group_dropped(ma); ip6_mc_clear_src(ma); + mutex_unlock(&idev->mc_lock); ma_put(ma); return 0; } + mutex_unlock(&idev->mc_lock); return 0; } } + mutex_unlock(&idev->mc_lock); return -ENOENT; } @@ -1019,6 +1047,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, return rv; } +/* called with mc_lock */ static void mld_gq_start_work(struct inet6_dev *idev) { unsigned long tv = prandom_u32() % idev->mc_maxdelay; @@ -1028,6 +1057,7 @@ static void mld_gq_start_work(struct inet6_dev *idev) in6_dev_hold(idev); } +/* called with mc_lock */ static void mld_gq_stop_work(struct inet6_dev *idev) { idev->mc_gq_running = 0; @@ -1035,6 +1065,7 @@ static void mld_gq_stop_work(struct inet6_dev *idev) __in6_dev_put(idev); } +/* called with mc_lock */ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = prandom_u32() % delay; @@ -1043,6 +1074,7 @@ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) in6_dev_hold(idev); } +/* called with mc_lock */ static void mld_ifc_stop_work(struct inet6_dev *idev) { idev->mc_ifc_count = 0; @@ -1050,6 +1082,7 @@ static void mld_ifc_stop_work(struct inet6_dev *idev) __in6_dev_put(idev); } +/* called with mc_lock */ static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { unsigned long tv = prandom_u32() % delay; @@ -1080,6 +1113,7 @@ 
static void mld_report_stop_work(struct inet6_dev *idev) /* * IGMP handling (alias multicast ICMPv6 messages) + * called with mc_lock */ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) { @@ -1103,7 +1137,9 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) ma->mca_flags |= MAF_TIMER_RUNNING; } -/* mark EXCLUDE-mode sources */ +/* mark EXCLUDE-mode sources + * called with mc_lock + */ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { @@ -1111,7 +1147,7 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, int i, scount; scount = 0; - for_each_psf_rtnl(pmc, psf) { + for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { @@ -1132,6 +1168,7 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs, return true; } +/* called with mc_lock */ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, const struct in6_addr *srcs) { @@ -1144,7 +1181,7 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, /* mark INCLUDE-mode sources */ scount = 0; - for_each_psf_rtnl(pmc, psf) { + for_each_psf_mclock(pmc, psf) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { @@ -1370,7 +1407,7 @@ static void __mld_query_work(struct sk_buff *skb) int len, err; if (!pskb_may_pull(skb, sizeof(struct in6_addr))) - goto out; + goto kfree_skb; /* compute payload length excluding extension headers */ len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); @@ -1387,11 +1424,11 @@ static void __mld_query_work(struct sk_buff *skb) ipv6_hdr(skb)->hop_limit != 1 || !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) || IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD)) - goto out; + goto kfree_skb; - idev = __in6_dev_get(skb->dev); + idev = in6_dev_get(skb->dev); if (!idev) - goto out; + goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); group = &mld->mld_mca; @@ -1442,11 +1479,11 @@ static void 
__mld_query_work(struct sk_buff *skb) } if (group_type == IPV6_ADDR_ANY) { - for_each_mc_rtnl(idev, ma) { + for_each_mc_mclock(idev, ma) { igmp6_group_queried(ma, max_delay); } } else { - for_each_mc_rtnl(idev, ma) { + for_each_mc_mclock(idev, ma) { if (!ipv6_addr_equal(group, &ma->mca_addr)) continue; if (ma->mca_flags & MAF_TIMER_RUNNING) { @@ -1468,6 +1505,8 @@ static void __mld_query_work(struct sk_buff *skb) } out: + in6_dev_put(idev); +kfree_skb: consume_skb(skb); } @@ -1495,10 +1534,10 @@ static void mld_query_work(struct work_struct *work) } spin_unlock_bh(&idev->mc_query_lock); - rtnl_lock(); + mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_query_work(skb); - rtnl_unlock(); + mutex_unlock(&idev->mc_lock); if (!rework) in6_dev_put(idev); @@ -1530,22 +1569,22 @@ int igmp6_event_report(struct sk_buff *skb) static void __mld_report_work(struct sk_buff *skb) { - struct ifmcaddr6 *ma; struct inet6_dev *idev; + struct ifmcaddr6 *ma; struct mld_msg *mld; int addr_type; /* Our own report looped back. Ignore it. 
*/ if (skb->pkt_type == PACKET_LOOPBACK) - goto out; + goto kfree_skb; /* send our report if the MC router may not have heard this report */ if (skb->pkt_type != PACKET_MULTICAST && skb->pkt_type != PACKET_BROADCAST) - goto out; + goto kfree_skb; if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr))) - goto out; + goto kfree_skb; mld = (struct mld_msg *)icmp6_hdr(skb); @@ -1553,17 +1592,17 @@ static void __mld_report_work(struct sk_buff *skb) addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr); if (addr_type != IPV6_ADDR_ANY && !(addr_type&IPV6_ADDR_LINKLOCAL)) - goto out; + goto kfree_skb; - idev = __in6_dev_get(skb->dev); + idev = in6_dev_get(skb->dev); if (!idev) - goto out; + goto kfree_skb; /* * Cancel the work for this group */ - for_each_mc_rtnl(idev, ma) { + for_each_mc_mclock(idev, ma) { if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) { if (cancel_delayed_work(&ma->mca_work)) refcount_dec(&ma->mca_refcnt); @@ -1573,7 +1612,8 @@ static void __mld_report_work(struct sk_buff *skb) } } -out: + in6_dev_put(idev); +kfree_skb: consume_skb(skb); } @@ -1600,10 +1640,10 @@ static void mld_report_work(struct work_struct *work) } spin_unlock_bh(&idev->mc_report_lock); - rtnl_lock(); + mutex_lock(&idev->mc_lock); while ((skb = __skb_dequeue(&q))) __mld_report_work(skb); - rtnl_unlock(); + mutex_unlock(&idev->mc_lock); if (!rework) in6_dev_put(idev); @@ -1659,7 +1699,7 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) struct ip6_sf_list *psf; int scount = 0; - for_each_psf_rtnl(pmc, psf) { + for_each_psf_mclock(pmc, psf) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; @@ -1833,6 +1873,7 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, #define AVAILABLE(skb) ((skb) ? 
skb_availroom(skb) : 0) +/* called with mc_lock */ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted, int crsend) @@ -1878,12 +1919,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, } first = 1; psf_prev = NULL; - for (psf = rtnl_dereference(*psf_list); + for (psf = mc_dereference(*psf_list, idev); psf; psf = psf_next) { struct in6_addr *psrc; - psf_next = rtnl_dereference(psf->sf_next); + psf_next = mc_dereference(psf->sf_next, idev); if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) { psf_prev = psf; @@ -1931,10 +1972,10 @@ decrease_sf_crcount: if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, - rtnl_dereference(psf->sf_next)); + mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*psf_list, - rtnl_dereference(psf->sf_next)); + mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); continue; } @@ -1964,13 +2005,14 @@ empty_source: return skb; } +/* called with mc_lock */ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) { struct sk_buff *skb = NULL; int type; if (!pmc) { - for_each_mc_rtnl(idev, pmc) { + for_each_mc_mclock(idev, pmc) { if (pmc->mca_flags & MAF_NOREPORT) continue; if (pmc->mca_sfcount[MCAST_EXCLUDE]) @@ -1992,23 +2034,24 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) /* * remove zero-count source records from a source filter list + * called with mc_lock */ -static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf) +static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev) { struct ip6_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; - for (psf = rtnl_dereference(*ppsf); + for (psf = mc_dereference(*ppsf, idev); psf; psf = psf_next) { - psf_next = rtnl_dereference(psf->sf_next); + psf_next = mc_dereference(psf->sf_next, idev); if (psf->sf_crcount == 0) { if (psf_prev) 
rcu_assign_pointer(psf_prev->sf_next, - rtnl_dereference(psf->sf_next)); + mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(*ppsf, - rtnl_dereference(psf->sf_next)); + mc_dereference(psf->sf_next, idev)); kfree_rcu(psf, rcu); } else { psf_prev = psf; @@ -2016,6 +2059,7 @@ static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf) } } +/* called with mc_lock */ static void mld_send_cr(struct inet6_dev *idev) { struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; @@ -2024,10 +2068,10 @@ static void mld_send_cr(struct inet6_dev *idev) /* deleted MCA's */ pmc_prev = NULL; - for (pmc = rtnl_dereference(idev->mc_tomb); + for (pmc = mc_dereference(idev->mc_tomb, idev); pmc; pmc = pmc_next) { - pmc_next = rtnl_dereference(pmc->next); + pmc_next = mc_dereference(pmc->next, idev); if (pmc->mca_sfmode == MCAST_INCLUDE) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_BLOCK_OLD_SOURCES; @@ -2041,8 +2085,8 @@ static void mld_send_cr(struct inet6_dev *idev) } pmc->mca_crcount--; if (pmc->mca_crcount == 0) { - mld_clear_zeros(&pmc->mca_tomb); - mld_clear_zeros(&pmc->mca_sources); + mld_clear_zeros(&pmc->mca_tomb, idev); + mld_clear_zeros(&pmc->mca_sources, idev); } } if (pmc->mca_crcount == 0 && @@ -2059,7 +2103,7 @@ static void mld_send_cr(struct inet6_dev *idev) } /* change recs */ - for_each_mc_rtnl(idev, pmc) { + for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { type = MLD2_BLOCK_OLD_SOURCES; dtype = MLD2_ALLOW_NEW_SOURCES; @@ -2181,6 +2225,7 @@ err_out: goto out; } +/* called with mc_lock */ static void mld_send_initial_cr(struct inet6_dev *idev) { struct sk_buff *skb; @@ -2191,7 +2236,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev) return; skb = NULL; - for_each_mc_rtnl(idev, pmc) { + for_each_mc_mclock(idev, pmc) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) type = MLD2_CHANGE_TO_EXCLUDE; else @@ -2204,6 +2249,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev) void ipv6_mc_dad_complete(struct inet6_dev *idev) { + 
mutex_lock(&idev->mc_lock); idev->mc_dad_count = idev->mc_qrv; if (idev->mc_dad_count) { mld_send_initial_cr(idev); @@ -2212,6 +2258,7 @@ void ipv6_mc_dad_complete(struct inet6_dev *idev) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } + mutex_unlock(&idev->mc_lock); } static void mld_dad_work(struct work_struct *work) @@ -2219,8 +2266,7 @@ static void mld_dad_work(struct work_struct *work) struct inet6_dev *idev = container_of(to_delayed_work(work), struct inet6_dev, mc_dad_work); - - rtnl_lock(); + mutex_lock(&idev->mc_lock); mld_send_initial_cr(idev); if (idev->mc_dad_count) { idev->mc_dad_count--; @@ -2228,10 +2274,11 @@ static void mld_dad_work(struct work_struct *work) mld_dad_start_work(idev, unsolicited_report_interval(idev)); } - rtnl_unlock(); + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } +/* called with mc_lock */ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) { @@ -2239,7 +2286,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, int rv = 0; psf_prev = NULL; - for_each_psf_rtnl(pmc, psf) { + for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; @@ -2255,16 +2302,16 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, /* no more filters for this source */ if (psf_prev) rcu_assign_pointer(psf_prev->sf_next, - rtnl_dereference(psf->sf_next)); + mc_dereference(psf->sf_next, idev)); else rcu_assign_pointer(pmc->mca_sources, - rtnl_dereference(psf->sf_next)); + mc_dereference(psf->sf_next, idev)); if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && !mld_in_v1_mode(idev)) { psf->sf_crcount = idev->mc_qrv; rcu_assign_pointer(psf->sf_next, - rtnl_dereference(pmc->mca_tomb)); + mc_dereference(pmc->mca_tomb, idev)); rcu_assign_pointer(pmc->mca_tomb, psf); rv = 1; } else { @@ -2274,6 +2321,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, return rv; } +/* called with mc_lock */ static int 
ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, int delta) @@ -2285,7 +2333,7 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; - for_each_mc_rtnl(idev, pmc) { + for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } @@ -2294,9 +2342,8 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, sf_markstate(pmc); if (!delta) { - if (!pmc->mca_sfcount[sfmode]) { + if (!pmc->mca_sfcount[sfmode]) return -EINVAL; - } pmc->mca_sfcount[sfmode]--; } @@ -2317,16 +2364,19 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_sfmode = MCAST_INCLUDE; pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for_each_psf_rtnl(pmc, psf) + for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(pmc->idev); - } else if (sf_setstate(pmc) || changerec) + } else if (sf_setstate(pmc) || changerec) { mld_ifc_event(pmc->idev); + } + return err; } /* * Add multicast single-source filter to the interface list + * called with mc_lock */ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, const struct in6_addr *psfsrc) @@ -2334,7 +2384,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, struct ip6_sf_list *psf, *psf_prev; psf_prev = NULL; - for_each_psf_rtnl(pmc, psf) { + for_each_psf_mclock(pmc, psf) { if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) break; psf_prev = psf; @@ -2355,12 +2405,13 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, return 0; } +/* called with mc_lock */ static void sf_markstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf; int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; - for_each_psf_rtnl(pmc, psf) { + for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && @@ -2371,6 +2422,7 @@ static void 
sf_markstate(struct ifmcaddr6 *pmc) } } +/* called with mc_lock */ static int sf_setstate(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *dpsf; @@ -2379,7 +2431,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) int new_in, rv; rv = 0; - for_each_psf_rtnl(pmc, psf) { + for_each_psf_mclock(pmc, psf) { if (pmc->mca_sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; @@ -2398,10 +2450,12 @@ static int sf_setstate(struct ifmcaddr6 *pmc) if (dpsf) { if (prev) rcu_assign_pointer(prev->sf_next, - rtnl_dereference(dpsf->sf_next)); + mc_dereference(dpsf->sf_next, + pmc->idev)); else rcu_assign_pointer(pmc->mca_tomb, - rtnl_dereference(dpsf->sf_next)); + mc_dereference(dpsf->sf_next, + pmc->idev)); kfree_rcu(dpsf, rcu); } psf->sf_crcount = qrv; @@ -2424,7 +2478,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) continue; *dpsf = *psf; rcu_assign_pointer(dpsf->sf_next, - rtnl_dereference(pmc->mca_tomb)); + mc_dereference(pmc->mca_tomb, pmc->idev)); rcu_assign_pointer(pmc->mca_tomb, dpsf); } dpsf->sf_crcount = qrv; @@ -2436,6 +2490,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc) /* * Add multicast source filter list to the interface list + * called with mc_lock */ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int sfmode, int sfcount, const struct in6_addr *psfsrc, @@ -2448,7 +2503,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, if (!idev) return -ENODEV; - for_each_mc_rtnl(idev, pmc) { + for_each_mc_mclock(idev, pmc) { if (ipv6_addr_equal(pmca, &pmc->mca_addr)) break; } @@ -2484,7 +2539,7 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, pmc->mca_crcount = idev->mc_qrv; idev->mc_ifc_count = pmc->mca_crcount; - for_each_psf_rtnl(pmc, psf) + for_each_psf_mclock(pmc, psf) psf->sf_crcount = 0; mld_ifc_event(idev); } else if (sf_setstate(pmc)) { @@ -2493,21 +2548,22 @@ static int ip6_mc_add_src(struct inet6_dev *idev, 
const struct in6_addr *pmca, return err; } +/* called with mc_lock */ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) { struct ip6_sf_list *psf, *nextpsf; - for (psf = rtnl_dereference(pmc->mca_tomb); + for (psf = mc_dereference(pmc->mca_tomb, pmc->idev); psf; psf = nextpsf) { - nextpsf = rtnl_dereference(psf->sf_next); + nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_tomb, NULL); - for (psf = rtnl_dereference(pmc->mca_sources); + for (psf = mc_dereference(pmc->mca_sources, pmc->idev); psf; psf = nextpsf) { - nextpsf = rtnl_dereference(psf->sf_next); + nextpsf = mc_dereference(psf->sf_next, pmc->idev); kfree_rcu(psf, rcu); } RCU_INIT_POINTER(pmc->mca_sources, NULL); @@ -2516,7 +2572,7 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) pmc->mca_sfcount[MCAST_EXCLUDE] = 1; } - +/* called with mc_lock */ static void igmp6_join_group(struct ifmcaddr6 *ma) { unsigned long delay; @@ -2546,19 +2602,27 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, psl = rtnl_dereference(iml->sflist); + if (idev) + mutex_lock(&idev->mc_lock); + if (!psl) { /* any-source empty exclude case */ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); } else { err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, - psl->sl_count, psl->sl_addr, 0); + psl->sl_count, psl->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } + + if (idev) + mutex_unlock(&idev->mc_lock); + return err; } +/* called with mc_lock */ static void igmp6_leave_group(struct ifmcaddr6 *ma) { if (mld_in_v1_mode(ma->idev)) { @@ -2578,10 +2642,10 @@ static void mld_gq_work(struct work_struct *work) struct inet6_dev, mc_gq_work); - rtnl_lock(); + mutex_lock(&idev->mc_lock); mld_send_report(idev, NULL); idev->mc_gq_running = 0; - rtnl_unlock(); + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } @@ -2592,7 +2656,7 @@ static void mld_ifc_work(struct 
work_struct *work) struct inet6_dev, mc_ifc_work); - rtnl_lock(); + mutex_lock(&idev->mc_lock); mld_send_cr(idev); if (idev->mc_ifc_count) { @@ -2601,10 +2665,11 @@ static void mld_ifc_work(struct work_struct *work) mld_ifc_start_work(idev, unsolicited_report_interval(idev)); } - rtnl_unlock(); + mutex_unlock(&idev->mc_lock); in6_dev_put(idev); } +/* called with mc_lock */ static void mld_ifc_event(struct inet6_dev *idev) { if (mld_in_v1_mode(idev)) @@ -2619,14 +2684,14 @@ static void mld_mca_work(struct work_struct *work) struct ifmcaddr6 *ma = container_of(to_delayed_work(work), struct ifmcaddr6, mca_work); - rtnl_lock(); + mutex_lock(&ma->idev->mc_lock); if (mld_in_v1_mode(ma->idev)) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); else mld_send_report(ma->idev, ma); ma->mca_flags |= MAF_LAST_REPORTER; ma->mca_flags &= ~MAF_TIMER_RUNNING; - rtnl_unlock(); + mutex_unlock(&ma->idev->mc_lock); ma_put(ma); } @@ -2639,8 +2704,10 @@ void ipv6_mc_unmap(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ - for_each_mc_rtnl(idev, i) + mutex_lock(&idev->mc_lock); + for_each_mc_mclock(idev, i) igmp6_group_dropped(i); + mutex_unlock(&idev->mc_lock); } void ipv6_mc_remap(struct inet6_dev *idev) @@ -2649,14 +2716,15 @@ void ipv6_mc_remap(struct inet6_dev *idev) } /* Device going down */ - void ipv6_mc_down(struct inet6_dev *idev) { struct ifmcaddr6 *i; + mutex_lock(&idev->mc_lock); /* Withdraw multicast list */ - for_each_mc_rtnl(idev, i) + for_each_mc_mclock(idev, i) igmp6_group_dropped(i); + mutex_unlock(&idev->mc_lock); /* Should stop work after group drop. 
or we will * start work again in mld_ifc_event() @@ -2687,10 +2755,12 @@ void ipv6_mc_up(struct inet6_dev *idev) /* Install multicast list, except for all-nodes (already installed) */ ipv6_mc_reset(idev); - for_each_mc_rtnl(idev, i) { + mutex_lock(&idev->mc_lock); + for_each_mc_mclock(idev, i) { mld_del_delrec(idev, i); igmp6_group_added(i); } + mutex_unlock(&idev->mc_lock); } /* IPv6 device initialization. */ @@ -2709,6 +2779,7 @@ void ipv6_mc_init_dev(struct inet6_dev *idev) skb_queue_head_init(&idev->mc_report_queue); spin_lock_init(&idev->mc_query_lock); spin_lock_init(&idev->mc_report_lock); + mutex_init(&idev->mc_lock); ipv6_mc_reset(idev); } @@ -2722,7 +2793,9 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) /* Deactivate works */ ipv6_mc_down(idev); + mutex_lock(&idev->mc_lock); mld_clear_delrec(idev); + mutex_unlock(&idev->mc_lock); mld_clear_query(idev); mld_clear_report(idev); @@ -2736,12 +2809,14 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) if (idev->cnf.forwarding) __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters); - while ((i = rtnl_dereference(idev->mc_list))) { - rcu_assign_pointer(idev->mc_list, rtnl_dereference(i->next)); + mutex_lock(&idev->mc_lock); + while ((i = mc_dereference(idev->mc_list, idev))) { + rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev)); ip6_mc_clear_src(i); ma_put(i); } + mutex_unlock(&idev->mc_lock); } static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) @@ -2750,12 +2825,14 @@ static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) ASSERT_RTNL(); + mutex_lock(&idev->mc_lock); if (mld_in_v1_mode(idev)) { - for_each_mc_rtnl(idev, pmc) + for_each_mc_mclock(idev, pmc) igmp6_join_group(pmc); } else { mld_send_report(idev, NULL); } + mutex_unlock(&idev->mc_lock); } static int ipv6_mc_netdev_event(struct notifier_block *this, -- cgit v1.2.3 From e16301fbe1837c9594f9c1957c28fd1bb18fbd15 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:51:30 -0700 Subject: bpf: Simplify 
freeing logic in linfo and jited_linfo This patch simplifies the linfo freeing logic by combining "bpf_prog_free_jited_linfo()" and "bpf_prog_free_unused_jited_linfo()" into the new "bpf_prog_jit_attempt_done()". It is prep work for the kernel function call support. In a later patch, freeing the kernel function call descriptors will also be done in "bpf_prog_jit_attempt_done()". "bpf_prog_free_linfo()" is removed since it is only called by "__bpf_prog_put_noref()". kvfree() is called directly instead. It also takes this chance to s/kcalloc/kvcalloc/ for the jited_linfo allocation. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015130.1544323-1-kafai@fb.com --- include/linux/filter.h | 3 +-- kernel/bpf/core.c | 35 ++++++++++++----------------------- kernel/bpf/syscall.c | 3 ++- kernel/bpf/verifier.c | 4 ++-- 4 files changed, 17 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index b2b85b2cad8e..0d9c710eb050 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -877,8 +877,7 @@ void bpf_prog_free_linfo(struct bpf_prog *prog); void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); -void bpf_prog_free_jited_linfo(struct bpf_prog *prog); -void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); +void bpf_prog_jit_attempt_done(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 75244ecb2389..a35eb3d7b126 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -143,25 +143,22 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) if (!prog->aux->nr_linfo || !prog->jit_requested) return 0; - prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, - 
sizeof(*prog->aux->jited_linfo), - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo, + sizeof(*prog->aux->jited_linfo), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!prog->aux->jited_linfo) return -ENOMEM; return 0; } -void bpf_prog_free_jited_linfo(struct bpf_prog *prog) +void bpf_prog_jit_attempt_done(struct bpf_prog *prog) { - kfree(prog->aux->jited_linfo); - prog->aux->jited_linfo = NULL; -} - -void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog) -{ - if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0]) - bpf_prog_free_jited_linfo(prog); + if (prog->aux->jited_linfo && + (!prog->jited || !prog->aux->jited_linfo[0])) { + kvfree(prog->aux->jited_linfo); + prog->aux->jited_linfo = NULL; + } } /* The jit engine is responsible to provide an array @@ -217,12 +214,6 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; } -void bpf_prog_free_linfo(struct bpf_prog *prog) -{ - bpf_prog_free_jited_linfo(prog); - kvfree(prog->aux->linfo); -} - struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { @@ -1866,15 +1857,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) return fp; fp = bpf_int_jit_compile(fp); - if (!fp->jited) { - bpf_prog_free_jited_linfo(fp); + bpf_prog_jit_attempt_done(fp); #ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { *err = -ENOTSUPP; return fp; -#endif - } else { - bpf_prog_free_unused_jited_linfo(fp); } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 250503482cda..eaf85bf51c5a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1694,7 +1694,8 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) { bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); - bpf_prog_free_linfo(prog); + kvfree(prog->aux->jited_linfo); + kvfree(prog->aux->linfo); if 
(prog->aux->attach_btf) btf_put(prog->aux->attach_btf); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 85f9f842d15c..b7df3f06a279 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11741,7 +11741,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt; - bpf_prog_free_unused_jited_linfo(prog); + bpf_prog_jit_attempt_done(prog); return 0; out_free: for (i = 0; i < env->subprog_cnt; i++) { @@ -11764,7 +11764,7 @@ out_undo_insn: insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; } - bpf_prog_free_jited_linfo(prog); + bpf_prog_jit_attempt_done(prog); return err; } -- cgit v1.2.3 From 34747c4120418143097d4343312a0ca96c986d86 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:51:36 -0700 Subject: bpf: Refactor btf_check_func_arg_match This patch moved the subprog specific logic from btf_check_func_arg_match() to the new btf_check_subprog_arg_match(). The core logic is left in btf_check_func_arg_match() which will be reused later to check the kernel function call. The "if (!btf_type_is_ptr(t))" is checked first to improve the indentation which will be useful for a later patch. Some of the "btf_kind_str[]" usages are replaced with the shortcut "btf_type_str(t)".
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015136.1544504-1-kafai@fb.com --- include/linux/bpf.h | 4 +- include/linux/btf.h | 5 ++ kernel/bpf/btf.c | 159 +++++++++++++++++++++++++++----------------------- kernel/bpf/verifier.c | 4 +- 4 files changed, 95 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5a0801b420ca..eaae618a90b5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1545,8 +1545,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf_func_model *m); struct bpf_reg_state; -int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs); +int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/btf.h b/include/linux/btf.h index 9c1b52738bbe..8a05687a4ee2 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -141,6 +141,11 @@ static inline bool btf_type_is_enum(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; } +static inline bool btf_type_is_scalar(const struct btf_type *t) +{ + return btf_type_is_int(t) || btf_type_is_enum(t); +} + static inline bool btf_type_is_typedef(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 369faeddf1df..3c489adacf3b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4377,7 +4377,7 @@ static u8 bpf_ctx_convert_map[] = { #undef BPF_LINK_TYPE static const struct btf_member * -btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf, +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum 
bpf_prog_type prog_type, int arg) { @@ -5362,122 +5362,135 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } -/* Compare BTF of a function with given bpf_reg_state. - * Returns: - * EFAULT - there is a verifier bug. Abort verification. - * EINVAL - there is a type mismatch or BTF is not available. - * 0 - BTF matches with what bpf_reg_state expects. - * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. - */ -int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs) +static int btf_check_func_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs, + bool ptr_to_mem_ok) { struct bpf_verifier_log *log = &env->log; - struct bpf_prog *prog = env->prog; - struct btf *btf = prog->aux->btf; - const struct btf_param *args; + const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; - u32 i, nargs, btf_id, type_size; - const char *tname; - bool is_global; - - if (!prog->aux->func_info) - return -EINVAL; - - btf_id = prog->aux->func_info[subprog].type_id; - if (!btf_id) - return -EFAULT; - - if (prog->aux->func_info_aux[subprog].unreliable) - return -EINVAL; + const struct btf_param *args; + u32 i, nargs; - t = btf_type_by_id(btf, btf_id); + t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { /* These checks were already done by the verifier while loading * struct bpf_func_info */ - bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n", - subprog); + bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", + func_id); return -EFAULT; } - tname = btf_name_by_offset(btf, t->name_off); + func_name = btf_name_by_offset(btf, t->name_off); t = btf_type_by_id(btf, t->type); if (!t || !btf_type_is_func_proto(t)) { - bpf_log(log, "Invalid BTF of func %s\n", tname); + bpf_log(log, "Invalid BTF of func %s\n", func_name); return -EFAULT; } args = (const 
struct btf_param *)(t + 1); nargs = btf_type_vlen(t); if (nargs > MAX_BPF_FUNC_REG_ARGS) { - bpf_log(log, "Function %s has %d > %d args\n", tname, nargs, + bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs, MAX_BPF_FUNC_REG_ARGS); - goto out; + return -EINVAL; } - is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; /* check that BTF function arguments match actual types that the * verifier sees. */ for (i = 0; i < nargs; i++) { - struct bpf_reg_state *reg = &regs[i + 1]; + u32 regno = i + 1; + struct bpf_reg_state *reg = &regs[regno]; - t = btf_type_by_id(btf, args[i].type); - while (btf_type_is_modifier(t)) - t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) { + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + if (btf_type_is_scalar(t)) { if (reg->type == SCALAR_VALUE) continue; - bpf_log(log, "R%d is not a scalar\n", i + 1); - goto out; + bpf_log(log, "R%d is not a scalar\n", regno); + return -EINVAL; } - if (btf_type_is_ptr(t)) { + + if (!btf_type_is_ptr(t)) { + bpf_log(log, "Unrecognized arg#%d type %s\n", + i, btf_type_str(t)); + return -EINVAL; + } + + ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX.
*/ - if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { - if (reg->type != PTR_TO_CTX) { - bpf_log(log, - "arg#%d expected pointer to ctx, but got %s\n", - i, btf_kind_str[BTF_INFO_KIND(t->info)]); - goto out; - } - if (check_ctx_reg(env, reg, i + 1)) - goto out; - continue; + if (reg->type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_type_str(t)); + return -EINVAL; } + if (check_ctx_reg(env, reg, regno)) + return -EINVAL; + } else if (ptr_to_mem_ok) { + const struct btf_type *resolve_ret; + u32 type_size; - if (!is_global) - goto out; - - t = btf_type_skip_modifiers(btf, t->type, NULL); - - ref_t = btf_resolve_size(btf, t, &type_size); - if (IS_ERR(ref_t)) { + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); + if (IS_ERR(resolve_ret)) { bpf_log(log, - "arg#%d reference type('%s %s') size cannot be determined: %ld\n", - i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), - PTR_ERR(ref_t)); - goto out; + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(ref_t), ref_tname, + PTR_ERR(resolve_ret)); + return -EINVAL; } - if (check_mem_reg(env, reg, i + 1, type_size)) - goto out; - - continue; + if (check_mem_reg(env, reg, regno, type_size)) + return -EINVAL; + } else { + return -EINVAL; } - bpf_log(log, "Unrecognized arg#%d type %s\n", - i, btf_kind_str[BTF_INFO_KIND(t->info)]); - goto out; } + return 0; -out: +} + +/* Compare BTF of a function with given bpf_reg_state. + * Returns: + * EFAULT - there is a verifier bug. Abort verification. + * EINVAL - there is a type mismatch or BTF is not available. + * 0 - BTF matches with what bpf_reg_state expects. + * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. 
+ */ +int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs) +{ + struct bpf_prog *prog = env->prog; + struct btf *btf = prog->aux->btf; + bool is_global; + u32 btf_id; + int err; + + if (!prog->aux->func_info) + return -EINVAL; + + btf_id = prog->aux->func_info[subprog].type_id; + if (!btf_id) + return -EFAULT; + + if (prog->aux->func_info_aux[subprog].unreliable) + return -EINVAL; + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global); + /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. * In such cases mark the function as unreliable from BTF point of view. */ - prog->aux->func_info_aux[subprog].unreliable = true; - return -EINVAL; + if (err) + prog->aux->func_info_aux[subprog].unreliable = true; + return err; } /* Convert BTF of a function into bpf_reg_state if possible diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b7df3f06a279..b31e62daafbd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5365,7 +5365,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn func_info_aux = env->prog->aux->func_info_aux; if (func_info_aux) is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, subprog, caller->regs); + err = btf_check_subprog_arg_match(env, subprog, caller->regs); if (err == -EFAULT) return err; if (is_global) { @@ -12288,7 +12288,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); - ret = btf_check_func_arg_match(env, subprog, regs); + ret = btf_check_subprog_arg_match(env, subprog, regs); if (ret == -EFAULT) /* unlikely verifier bug. abort. 
* ret == 0 and ret < 0 are sadly acceptable for -- cgit v1.2.3 From e6ac2450d6dee3121cd8bbf2907b78a68a8a353d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:51:42 -0700 Subject: bpf: Support bpf program calling kernel function This patch adds support to the BPF verifier to allow a bpf program to call a kernel function directly. The use case included in this set is to allow bpf-tcp-cc to directly call some tcp-cc helper functions (e.g. "tcp_cong_avoid_ai()"). Those functions have already been used by some kernel tcp-cc implementations. This set will also allow the bpf-tcp-cc program to directly call the kernel tcp-cc implementation. For example, a bpf_dctcp may only want to implement its own dctcp_cwnd_event() and reuse other dctcp_*() directly from the kernel tcp_dctcp.c instead of reimplementing (or copy-and-pasting) them. The tcp-cc kernel functions mentioned above will be white listed for the struct_ops bpf-tcp-cc programs to use in a later patch. The white listed functions are not bound to a fixed ABI contract. Those functions have already been used by the existing kernel tcp-cc. If any of them has changed, both in-tree and out-of-tree kernel tcp-cc implementations have to be changed. The same goes for the struct_ops bpf-tcp-cc programs which have to be adjusted accordingly. This patch is to make the required changes in the bpf verifier. The first change is in btf.c: it adds a case in "btf_check_func_arg_match()". When the passed in "btf->kernel_btf == true", it means matching the verifier regs' states with a kernel function. This will handle the PTR_TO_BTF_ID reg. It also maps PTR_TO_SOCK_COMMON, PTR_TO_SOCKET, and PTR_TO_TCP_SOCK to its kernel's btf_id.
In the later libbpf patch, the insn calling a kernel function will look like: insn->code == (BPF_JMP | BPF_CALL) insn->src_reg == BPF_PSEUDO_KFUNC_CALL /* <- new in this patch */ insn->imm == func_btf_id /* btf_id of the running kernel */ [ For the future calling function-in-kernel-module support, an array of module btf_fds can be passed at the load time and insn->off can be used to index into this array. ] At the early stage of the verifier, the verifier will collect all kernel function calls into "struct bpf_kfunc_desc". Those descriptors are stored in "prog->aux->kfunc_tab" and will be available to the JIT. Since this "add" operation is similar to the current "add_subprog()" and looking for the same insn->code, they are done together in the new "add_subprog_and_kfunc()". In the "do_check()" stage, the new "check_kfunc_call()" is added to verify the kernel function call instruction: 1. Ensure the kernel function can be used by a particular BPF_PROG_TYPE. A new bpf_verifier_ops "check_kfunc_call" is added to do that. The bpf-tcp-cc struct_ops program will implement this function in a later patch. 2. Call "btf_check_kfunc_arg_match()" to ensure the regs can be used as the args of a kernel function. 3. Mark the regs' type, subreg_def, and zext_dst. At the later do_misc_fixups() stage, the new fixup_kfunc_call() will replace the insn->imm with the function address (relative to __bpf_call_base). If needed, the jit can find the btf_func_model by calling the new bpf_jit_find_kfunc_model(prog, insn). With the imm set to the function address, "bpftool prog dump xlated" will be able to display the kernel function calls the same way as it displays other bpf helper calls. A gpl_compatible program is required to call a kernel function. This feature currently requires JIT. The verifier selftests are adjusted because of the changes in the verbose log in add_subprog_and_kfunc().
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015142.1544736-1-kafai@fb.com --- arch/x86/net/bpf_jit_comp.c | 5 + include/linux/bpf.h | 24 ++ include/linux/btf.h | 1 + include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 4 + kernel/bpf/btf.c | 65 +++- kernel/bpf/core.c | 18 +- kernel/bpf/disasm.c | 13 +- kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 368 +++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 4 + tools/testing/selftests/bpf/verifier/calls.c | 12 +- tools/testing/selftests/bpf/verifier/dead_code.c | 10 +- 13 files changed, 480 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b35fc8023884..9eead60f0301 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2346,3 +2346,8 @@ out: tmp : orig_prog); return prog; } + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eaae618a90b5..b5b7967e3ff3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -427,6 +427,7 @@ enum bpf_reg_type { PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ PTR_TO_MAP_KEY, /* reg points to a map element key */ + __BPF_REG_TYPE_MAX, }; /* The information passed from prog-specific *_is_valid_access @@ -480,6 +481,7 @@ struct bpf_verifier_ops { const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); + bool (*check_kfunc_call)(u32 kfunc_btf_id); }; struct bpf_prog_offload_ops { @@ -796,6 +798,8 @@ struct btf_mod_pair { struct module *module; }; +struct bpf_kfunc_desc_tab; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; @@ -832,6 +836,7 @@ struct bpf_prog_aux { struct bpf_prog **func; void *jit_data; /* JIT specific data. 
arch dependent */ struct bpf_jit_poke_descriptor *poke_tab; + struct bpf_kfunc_desc_tab *kfunc_tab; u32 size_poke_tab; struct bpf_ksym ksym; const struct bpf_prog_ops *ops; @@ -1547,6 +1552,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); +int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, @@ -1557,6 +1565,10 @@ struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); +bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); +const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1737,6 +1749,18 @@ bpf_base_func_proto(enum bpf_func_id func_id) static inline void bpf_task_storage_free(struct task_struct *task) { } + +static inline bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) +{ + return false; +} + +static inline const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn) +{ + return NULL; +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/include/linux/btf.h b/include/linux/btf.h index 8a05687a4ee2..3bac66e0183a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -110,6 +110,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, u32 *type_size); +const char 
*btf_type_str(const struct btf_type *t); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 0d9c710eb050..eecfd82db648 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -918,6 +918,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); +bool bpf_jit_supports_kfunc_call(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 008edc1dc8c1..598716742593 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1117,6 +1117,10 @@ enum bpf_link_type { * offset to another bpf function */ #define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 /* flags for BPF_MAP_UPDATE_ELEM command */ enum { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3c489adacf3b..ec8afc4bc560 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -283,7 +283,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_FLOAT] = "FLOAT", }; -static const char *btf_type_str(const struct btf_type *t) +const char *btf_type_str(const struct btf_type *t) { return btf_kind_str[BTF_INFO_KIND(t->info)]; } @@ -5362,6 +5362,14 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif +}; + static int 
btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, @@ -5371,12 +5379,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - u32 i, nargs; + u32 i, nargs, ref_id; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { /* These checks were already done by the verifier while loading - * struct bpf_func_info + * struct bpf_func_info or in add_kfunc_call(). */ bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n", func_id); @@ -5418,9 +5426,49 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - if (btf_get_prog_ctx_type(log, btf, t, env->prog->type, i)) { + if (btf_is_kernel(btf)) { + const struct btf_type *reg_ref_t; + const struct btf *reg_btf; + const char *reg_ref_tname; + u32 reg_ref_id; + + if (!btf_type_is_struct(ref_t)) { + bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", + func_name, i, btf_type_str(ref_t), + ref_tname); + return -EINVAL; + } + + if (reg->type == PTR_TO_BTF_ID) { + reg_btf = reg->btf; + reg_ref_id = reg->btf_id; + } else if (reg2btf_ids[reg->type]) { + reg_btf = btf_vmlinux; + reg_ref_id = *reg2btf_ids[reg->type]; + } else { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", + func_name, i, + btf_type_str(ref_t), ref_tname, regno); + return -EINVAL; + } + + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, + &reg_ref_id); + reg_ref_tname = btf_name_by_offset(reg_btf, + reg_ref_t->name_off); + if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, + reg->off, btf, ref_id)) { + bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has
a pointer to %s %s\n", + func_name, i, + btf_type_str(ref_t), ref_tname, + regno, btf_type_str(reg_ref_t), + reg_ref_tname); + return -EINVAL; + } + } else if (btf_get_prog_ctx_type(log, btf, t, + env->prog->type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -5493,6 +5541,13 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return err; } +int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, + const struct btf *btf, u32 func_id, + struct bpf_reg_state *regs) +{ + return btf_check_func_arg_match(env, btf, func_id, regs, false); +} + /* Convert BTF of a function into bpf_reg_state if possible * Returns: * EFAULT - there is a verifier bug. Abort verification. diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a35eb3d7b126..f5423251c118 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -159,6 +159,9 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog) kvfree(prog->aux->jited_linfo); prog->aux->jited_linfo = NULL; } + + kfree(prog->aux->kfunc_tab); + prog->aux->kfunc_tab = NULL; } /* The jit engine is responsible to provide an array @@ -1840,9 +1843,15 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. 
*/ + bool jit_needed = false; + if (fp->bpf_func) goto finalize; + if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || + bpf_prog_has_kfunc_call(fp)) + jit_needed = true; + bpf_prog_select_func(fp); /* eBPF JITs can rewrite the program in case constant @@ -1858,12 +1867,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) fp = bpf_int_jit_compile(fp); bpf_prog_jit_attempt_done(fp); -#ifdef CONFIG_BPF_JIT_ALWAYS_ON - if (!fp->jited) { + if (!fp->jited && jit_needed) { *err = -ENOTSUPP; return fp; } -#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) @@ -2343,6 +2350,11 @@ bool __weak bpf_jit_needs_zext(void) return false; } +bool __weak bpf_jit_supports_kfunc_call(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 3acc7e0b6916..dad821c8ecd0 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs, { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - if (insn->src_reg != BPF_PSEUDO_CALL && + if (!insn->src_reg && insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && func_id_str[insn->imm]) return func_id_str[insn->imm]; - if (cbs && cbs->cb_call) - return cbs->cb_call(cbs->private_data, insn); + if (cbs && cbs->cb_call) { + const char *res; + + res = cbs->cb_call(cbs->private_data, insn); + if (res) + return res; + } if (insn->src_reg == BPF_PSEUDO_CALL) snprintf(buff, len, "%+d", insn->imm); + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + snprintf(buff, len, "kernel-function"); return buff; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eaf85bf51c5a..9603de81811a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1696,6 +1696,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) btf_put(prog->aux->btf); 
kvfree(prog->aux->jited_linfo); kvfree(prog->aux->linfo); + kfree(prog->aux->kfunc_tab); if (prog->aux->attach_btf) btf_put(prog->aux->attach_btf); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b31e62daafbd..852541a435ef 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -234,6 +234,12 @@ static bool bpf_pseudo_call(const struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_CALL; } +static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_KFUNC_CALL; +} + static bool bpf_pseudo_func(const struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && @@ -1554,47 +1560,205 @@ static int add_subprog(struct bpf_verifier_env *env, int off) verbose(env, "too many subprograms\n"); return -E2BIG; } + /* determine subprog starts. The end is one before the next starts */ env->subprog_info[env->subprog_cnt++].start = off; sort(env->subprog_info, env->subprog_cnt, sizeof(env->subprog_info[0]), cmp_subprogs, NULL); return env->subprog_cnt - 1; } -static int check_subprogs(struct bpf_verifier_env *env) +struct bpf_kfunc_desc { + struct btf_func_model func_model; + u32 func_id; + s32 imm; +}; + +#define MAX_KFUNC_DESCS 256 +struct bpf_kfunc_desc_tab { + struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS]; + u32 nr_descs; +}; + +static int kfunc_desc_cmp_by_id(const void *a, const void *b) +{ + const struct bpf_kfunc_desc *d0 = a; + const struct bpf_kfunc_desc *d1 = b; + + /* func_id is not greater than BTF_MAX_TYPE */ + return d0->func_id - d1->func_id; +} + +static const struct bpf_kfunc_desc * +find_kfunc_desc(const struct bpf_prog *prog, u32 func_id) +{ + struct bpf_kfunc_desc desc = { + .func_id = func_id, + }; + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + return bsearch(&desc, tab->descs, tab->nr_descs, + sizeof(tab->descs[0]), kfunc_desc_cmp_by_id); +} + +static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id) +{ + 
const struct btf_type *func, *func_proto; + struct bpf_kfunc_desc_tab *tab; + struct bpf_prog_aux *prog_aux; + struct bpf_kfunc_desc *desc; + const char *func_name; + unsigned long addr; + int err; + + prog_aux = env->prog->aux; + tab = prog_aux->kfunc_tab; + if (!tab) { + if (!btf_vmlinux) { + verbose(env, "calling kernel function is not supported without CONFIG_DEBUG_INFO_BTF\n"); + return -ENOTSUPP; + } + + if (!env->prog->jit_requested) { + verbose(env, "JIT is required for calling kernel function\n"); + return -ENOTSUPP; + } + + if (!bpf_jit_supports_kfunc_call()) { + verbose(env, "JIT does not support calling kernel function\n"); + return -ENOTSUPP; + } + + if (!env->prog->gpl_compatible) { + verbose(env, "cannot call kernel function from non-GPL compatible program\n"); + return -EINVAL; + } + + tab = kzalloc(sizeof(*tab), GFP_KERNEL); + if (!tab) + return -ENOMEM; + prog_aux->kfunc_tab = tab; + } + + if (find_kfunc_desc(env->prog, func_id)) + return 0; + + if (tab->nr_descs == MAX_KFUNC_DESCS) { + verbose(env, "too many different kernel function calls\n"); + return -E2BIG; + } + + func = btf_type_by_id(btf_vmlinux, func_id); + if (!func || !btf_type_is_func(func)) { + verbose(env, "kernel btf_id %u is not a function\n", + func_id); + return -EINVAL; + } + func_proto = btf_type_by_id(btf_vmlinux, func->type); + if (!func_proto || !btf_type_is_func_proto(func_proto)) { + verbose(env, "kernel function btf_id %u does not have a valid func_proto\n", + func_id); + return -EINVAL; + } + + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); + addr = kallsyms_lookup_name(func_name); + if (!addr) { + verbose(env, "cannot find address for kernel function %s\n", + func_name); + return -EINVAL; + } + + desc = &tab->descs[tab->nr_descs++]; + desc->func_id = func_id; + desc->imm = BPF_CAST_CALL(addr) - __bpf_call_base; + err = btf_distill_func_proto(&env->log, btf_vmlinux, + func_proto, func_name, + &desc->func_model); + if (!err) + sort(tab->descs, tab->nr_descs, 
sizeof(tab->descs[0]), + kfunc_desc_cmp_by_id, NULL); + return err; +} + +static int kfunc_desc_cmp_by_imm(const void *a, const void *b) +{ + const struct bpf_kfunc_desc *d0 = a; + const struct bpf_kfunc_desc *d1 = b; + + if (d0->imm > d1->imm) + return 1; + else if (d0->imm < d1->imm) + return -1; + return 0; +} + +static void sort_kfunc_descs_by_imm(struct bpf_prog *prog) +{ + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + if (!tab) + return; + + sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), + kfunc_desc_cmp_by_imm, NULL); +} + +bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog) +{ + return !!prog->aux->kfunc_tab; +} + +const struct btf_func_model * +bpf_jit_find_kfunc_model(const struct bpf_prog *prog, + const struct bpf_insn *insn) +{ + const struct bpf_kfunc_desc desc = { + .imm = insn->imm, + }; + const struct bpf_kfunc_desc *res; + struct bpf_kfunc_desc_tab *tab; + + tab = prog->aux->kfunc_tab; + res = bsearch(&desc, tab->descs, tab->nr_descs, + sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm); + + return res ? &res->func_model : NULL; +} + +static int add_subprog_and_kfunc(struct bpf_verifier_env *env) { - int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; + int i, ret, insn_cnt = env->prog->len; /* Add entry function. */ ret = add_subprog(env, 0); - if (ret < 0) + if (ret) return ret; - /* determine subprog starts. 
The end is one before the next starts */ - for (i = 0; i < insn_cnt; i++) { - if (bpf_pseudo_func(insn + i)) { - if (!env->bpf_capable) { - verbose(env, - "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); - return -EPERM; - } - ret = add_subprog(env, i + insn[i].imm + 1); - if (ret < 0) - return ret; - /* remember subprog */ - insn[i + 1].imm = ret; - continue; - } - if (!bpf_pseudo_call(insn + i)) + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn) && + !bpf_pseudo_kfunc_call(insn)) continue; + if (!env->bpf_capable) { - verbose(env, - "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); + verbose(env, "loading/calling other bpf or kernel functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } - ret = add_subprog(env, i + insn[i].imm + 1); + + if (bpf_pseudo_func(insn)) { + ret = add_subprog(env, i + insn->imm + 1); + if (ret >= 0) + /* remember subprog */ + insn[1].imm = ret; + } else if (bpf_pseudo_call(insn)) { + ret = add_subprog(env, i + insn->imm + 1); + } else { + ret = add_kfunc_call(env, insn->imm); + } + if (ret < 0) return ret; } @@ -1608,6 +1772,16 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_subprog_info *subprog = env->subprog_info; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + /* now check that all jumps are within the same subprog */ subprog_start = subprog[cur_subprog].start; subprog_end = subprog[cur_subprog + 1].start; @@ -1916,6 +2090,17 @@ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, return i; } +static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) +{ + const struct btf_type *func; + + if (insn->src_reg 
!= BPF_PSEUDO_KFUNC_CALL) + return NULL; + + func = btf_type_by_id(btf_vmlinux, insn->imm); + return btf_name_by_offset(btf_vmlinux, func->name_off); +} + /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -1924,6 +2109,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, u32 *reg_mask, u64 *stack_mask) { const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, .cb_print = verbose, .private_data = env, }; @@ -5960,6 +6146,98 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return 0; } +/* mark_btf_func_reg_size() is used when the reg size is determined by + * the BTF func_proto's return value size and argument. + */ +static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, + size_t reg_size) +{ + struct bpf_reg_state *reg = &cur_regs(env)[regno]; + + if (regno == BPF_REG_0) { + /* Function return value */ + reg->live |= REG_LIVE_WRITTEN; + reg->subreg_def = reg_size == sizeof(u64) ? 
+ DEF_NOT_SUBREG : env->insn_idx + 1; + } else { + /* Function argument */ + if (reg_size == sizeof(u64)) { + mark_insn_zext(env, reg); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); + } else { + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); + } + } +} + +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct btf_type *t, *func, *func_proto, *ptr_type; + struct bpf_reg_state *regs = cur_regs(env); + const char *func_name, *ptr_type_name; + u32 i, nargs, func_id, ptr_type_id; + const struct btf_param *args; + int err; + + func_id = insn->imm; + func = btf_type_by_id(btf_vmlinux, func_id); + func_name = btf_name_by_offset(btf_vmlinux, func->name_off); + func_proto = btf_type_by_id(btf_vmlinux, func->type); + + if (!env->ops->check_kfunc_call || + !env->ops->check_kfunc_call(func_id)) { + verbose(env, "calling kernel function %s is not allowed\n", + func_name); + return -EACCES; + } + + /* Check the arguments */ + err = btf_check_kfunc_arg_match(env, btf_vmlinux, func_id, regs); + if (err) + return err; + + for (i = 0; i < CALLER_SAVED_REGS; i++) + mark_reg_not_init(env, regs, caller_saved[i]); + + /* Check return type */ + t = btf_type_skip_modifiers(btf_vmlinux, func_proto->type, NULL); + if (btf_type_is_scalar(t)) { + mark_reg_unknown(env, regs, BPF_REG_0); + mark_btf_func_reg_size(env, BPF_REG_0, t->size); + } else if (btf_type_is_ptr(t)) { + ptr_type = btf_type_skip_modifiers(btf_vmlinux, t->type, + &ptr_type_id); + if (!btf_type_is_struct(ptr_type)) { + ptr_type_name = btf_name_by_offset(btf_vmlinux, + ptr_type->name_off); + verbose(env, "kernel function %s returns pointer type %s %s is not supported\n", + func_name, btf_type_str(ptr_type), + ptr_type_name); + return -EINVAL; + } + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].btf = btf_vmlinux; + regs[BPF_REG_0].type = PTR_TO_BTF_ID; + regs[BPF_REG_0].btf_id = ptr_type_id; + mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + } 
/* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ + + nargs = btf_type_vlen(func_proto); + args = (const struct btf_param *)(func_proto + 1); + for (i = 0; i < nargs; i++) { + u32 regno = i + 1; + + t = btf_type_skip_modifiers(btf_vmlinux, args[i].type, NULL); + if (btf_type_is_ptr(t)) + mark_btf_func_reg_size(env, regno, sizeof(void *)); + else + /* scalar. ensured by btf_check_kfunc_arg_match() */ + mark_btf_func_reg_size(env, regno, t->size); + } + + return 0; +} + static bool signed_add_overflows(s64 a, s64 b) { /* Do the add in u64, where overflow is well-defined */ @@ -10162,6 +10440,7 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, .cb_print = verbose, .private_data = env, }; @@ -10309,7 +10588,8 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL) || + insn->src_reg != BPF_PSEUDO_CALL && + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { verbose(env, "BPF_CALL uses reserved fields\n"); @@ -10324,6 +10604,8 @@ static int do_check(struct bpf_verifier_env *env) } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); + else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) + err = check_kfunc_call(env, insn); else err = check_helper_call(env, insn, &env->insn_idx); if (err) @@ -11634,6 +11916,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->linfo = prog->aux->linfo; func[i]->aux->nr_linfo = prog->aux->nr_linfo; func[i]->aux->jited_linfo = prog->aux->jited_linfo; @@ -11773,6 +12056,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) #ifndef 
CONFIG_BPF_JIT_ALWAYS_ON struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; + bool has_kfunc_call = bpf_prog_has_kfunc_call(prog); int i, depth; #endif int err = 0; @@ -11786,6 +12070,10 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON + if (has_kfunc_call) { + verbose(env, "calling kernel functions are not allowed in non-JITed programs\n"); + return -EINVAL; + } if (env->subprog_cnt > 1 && env->prog->aux->tail_call_reachable) { /* When JIT fails the progs with bpf2bpf calls and tail_calls * have to be rejected, since interpreter doesn't support them yet. @@ -11814,6 +12102,26 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } +static int fixup_kfunc_call(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + const struct bpf_kfunc_desc *desc; + + /* insn->imm has the btf func_id. Replace it with + * an address (relative to __bpf_call_base). + */ + desc = find_kfunc_desc(env->prog, insn->imm); + if (!desc) { + verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n", + insn->imm); + return -EFAULT; + } + + insn->imm = desc->imm; + + return 0; +} + /* Do various post-verification rewrites in a single program pass. * These rewrites simplify JIT and interpreter implementations. 
*/ @@ -11949,6 +12257,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) continue; if (insn->src_reg == BPF_PSEUDO_CALL) continue; + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + ret = fixup_kfunc_call(env, insn); + if (ret) + return ret; + continue; + } if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; @@ -12178,6 +12492,8 @@ patch_call_imm: } } + sort_kfunc_descs_by_imm(env->prog); + return 0; } @@ -12883,6 +13199,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (!env->explored_states) goto skip_full_check; + ret = add_subprog_and_kfunc(env); + if (ret < 0) + goto skip_full_check; + ret = check_subprogs(env); if (ret < 0) goto skip_full_check; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2d3036e292a9..ab9f2233607c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1117,6 +1117,10 @@ enum bpf_link_type { * offset to another bpf function */ #define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 /* flags for BPF_MAP_UPDATE_ELEM command */ enum { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index eb888c8479c3..336a749673d1 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -136,7 +136,7 @@ { "calls: wrong src reg", .insns = { - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0), BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), 
BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ -1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 5cf361d8eb1c..17fe33a75034 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | 
BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, -- cgit v1.2.3 From 7bd1590d4eba1583f6ee85e8cfe556505f761e19 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:52 -0700 Subject: bpf: selftests: Add kfunc_call test This patch adds a few kernel function bpf_kfunc_call_test*() for the selftest's test_run purpose. They will be allowed for tc_cls prog. The selftest calling the kernel function bpf_kfunc_call_test*() is also added in this patch. 
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015252.1551395-1-kafai@fb.com --- include/linux/bpf.h | 6 +++ net/bpf/test_run.c | 28 ++++++++++ net/core/filter.c | 1 + .../testing/selftests/bpf/prog_tests/kfunc_call.c | 59 ++++++++++++++++++++++ .../testing/selftests/bpf/progs/kfunc_call_test.c | 47 +++++++++++++++++ .../selftests/bpf/progs/kfunc_call_test_subprog.c | 42 +++++++++++++++ 6 files changed, 183 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_call.c create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test.c create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b5b7967e3ff3..9fdd839b418c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1532,6 +1532,7 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +bool bpf_prog_test_check_kfunc_call(u32 kfunc_id); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1731,6 +1732,11 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, return -ENOTSUPP; } +static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) +{ + return false; +} + static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 4aabf71cd95d..a5d72c48fb66 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -2,6 +2,7 @@ /* Copyright (c) 2017 Facebook */ #include +#include #include #include #include @@ -213,10 +214,37 @@ int noinline bpf_modify_return_test(int a, int *b) *b += 1; return a + *b; } + +u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d) +{ + return a + b + c + d; +} + +int noinline bpf_kfunc_call_test2(struct 
sock *sk, u32 a, u32 b) +{ + return a + b; +} + +struct sock * noinline bpf_kfunc_call_test3(struct sock *sk) +{ + return sk; +} + __diag_pop(); ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); +BTF_SET_START(test_sk_kfunc_ids) +BTF_ID(func, bpf_kfunc_call_test1) +BTF_ID(func, bpf_kfunc_call_test2) +BTF_ID(func, bpf_kfunc_call_test3) +BTF_SET_END(test_sk_kfunc_ids) + +bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) +{ + return btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id); +} + static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { diff --git a/net/core/filter.c b/net/core/filter.c index 17dc159ec40c..cae56d08a670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9813,6 +9813,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .check_kfunc_call = bpf_prog_test_check_kfunc_call, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c new file mode 100644 index 000000000000..7fc0951ee75f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include +#include "kfunc_call_test.skel.h" +#include "kfunc_call_test_subprog.skel.h" + +static void test_main(void) +{ + struct kfunc_call_test *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 12, "test1-retval"); + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2); + err = bpf_prog_test_run(prog_fd, 
1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test2)"); + ASSERT_EQ(retval, 3, "test2-retval"); + + kfunc_call_test__destroy(skel); +} + +static void test_subprog(void) +{ + struct kfunc_call_test_subprog *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test_subprog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 10, "test1-retval"); + ASSERT_NEQ(skel->data->active_res, -1, "active_res"); + ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state"); + + kfunc_call_test_subprog__destroy(skel); +} + +void test_kfunc_call(void) +{ + if (test__start_subtest("main")) + test_main(); + + if (test__start_subtest("subprog")) + test_subprog(); +} diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c new file mode 100644 index 000000000000..470f8723e463 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include +#include "bpf_tcp_helpers.h" + +extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; + +SEC("classifier") +int kfunc_call_test2(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + return bpf_kfunc_call_test2((struct sock *)sk, 1, 2); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + __u64 a = 1ULL << 32; + __u32 ret; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + a = 
bpf_kfunc_call_test1((struct sock *)sk, 1, a | 2, 3, a | 4); + ret = a >> 32; /* ret should be 2 */ + ret += (__u32)a; /* ret should be 12 */ + + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c new file mode 100644 index 000000000000..b2dcb7d9cb03 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include +#include +#include "bpf_tcp_helpers.h" + +extern const int bpf_prog_active __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; +extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; +int active_res = -1; +int sk_state = -1; + +int __noinline f1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + int *active; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, + bpf_get_smp_processor_id()); + if (active) + active_res = *active; + + sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state; + + return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + return f1(skb); +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From de1d1ee3e3e9f028623e7beb4c090a2b68572f10 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 26 Mar 2021 14:20:22 +0100 Subject: nexthop: Rename artifacts related to legacy multipath nexthop groups After resilient next-hop groups have been added recently, there are two types of multipath next-hop groups: the legacy "mpath", and the new "resilient". 
Calling the legacy next-hop group type "mpath" is unfortunate, because that describes the fact that a packet could be forwarded in one of several paths, which is also true for the resilient next-hop groups. Therefore, to make the naming clearer, rename various artifacts to reflect the assumptions made. Therefore as of this patch: - The flag for multipath groups is nh_grp_entry::is_multipath. This includes the legacy and resilient groups, as well as any future group types that behave as multipath groups. Functions that assume this have "mpath" in the name. - The flag for legacy multipath groups is nh_grp_entry::hash_threshold. Functions that assume this have "hthr" in the name. - The flag for resilient groups is nh_grp_entry::resilient. Functions that assume this have "res" in the name. Besides the above, struct nh_grp_entry::mpath was renamed to ::hthr as well. UAPI artifacts were obviously left intact. Suggested-by: David Ahern Signed-off-by: Petr Machata Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/nexthop.h | 4 ++-- net/ipv4/nexthop.c | 56 +++++++++++++++++++++++++-------------------------- 2 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 28145f714801..10e1777877e6 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -102,7 +102,7 @@ struct nh_grp_entry { union { struct { atomic_t upper_bound; - } mpath; + } hthr; struct { /* Member on uw_nh_entries. 
*/ struct list_head uw_nh_entry; @@ -120,7 +120,7 @@ struct nh_group { struct nh_group *spare; /* spare group for removals */ u16 num_nh; bool is_multipath; - bool mpath; + bool hash_threshold; bool resilient; bool fdb_nh; bool has_v4; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index f09fe3a5608f..5a2fc8798d20 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -116,8 +116,8 @@ static void nh_notifier_single_info_fini(struct nh_notifier_info *info) kfree(info->nh); } -static int nh_notifier_mp_info_init(struct nh_notifier_info *info, - struct nh_group *nhg) +static int nh_notifier_mpath_info_init(struct nh_notifier_info *info, + struct nh_group *nhg) { u16 num_nh = nhg->num_nh; int i; @@ -181,8 +181,8 @@ static int nh_notifier_grp_info_init(struct nh_notifier_info *info, { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - if (nhg->mpath) - return nh_notifier_mp_info_init(info, nhg); + if (nhg->hash_threshold) + return nh_notifier_mpath_info_init(info, nhg); else if (nhg->resilient) return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; @@ -193,7 +193,7 @@ static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); - if (nhg->mpath) + if (nhg->hash_threshold) kfree(info->nh_grp); else if (nhg->resilient) vfree(info->nh_res_table); @@ -406,7 +406,7 @@ static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, * could potentially veto it in case of unsupported configuration. 
*/ nhg = rtnl_dereference(nh->nh_grp); - err = nh_notifier_mp_info_init(&info, nhg); + err = nh_notifier_mpath_info_init(&info, nhg); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; @@ -661,7 +661,7 @@ static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) u16 group_type = 0; int i; - if (nhg->mpath) + if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) group_type = NEXTHOP_GRP_TYPE_RES; @@ -992,9 +992,9 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, struct nh_group *nhg = rtnl_dereference(nh->nh_grp); /* Nesting groups within groups is not supported. */ - if (nhg->mpath) { + if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, - "Multipath group can not be a nexthop within a group"); + "Hash-threshold group can not be a nexthop within a group"); return false; } if (nhg->resilient) { @@ -1151,7 +1151,7 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) +static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nexthop *rc = NULL; int i; @@ -1160,7 +1160,7 @@ static struct nexthop *nexthop_select_path_mp(struct nh_group *nhg, int hash) struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi; - if (hash > atomic_read(&nhge->mpath.upper_bound)) + if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nhi = rcu_dereference(nhge->nh->nh_info); @@ -1212,8 +1212,8 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) return nh; nhg = rcu_dereference(nh->nh_grp); - if (nhg->mpath) - return nexthop_select_path_mp(nhg, hash); + if (nhg->hash_threshold) + return nexthop_select_path_hthr(nhg, hash); else if (nhg->resilient) return nexthop_select_path_res(nhg, hash); @@ -1710,7 +1710,7 @@ static void replace_nexthop_grp_res(struct nh_group *oldg, nh_res_table_upkeep(old_res_table, true, 
false); } -static void nh_mp_group_rebalance(struct nh_group *nhg) +static void nh_hthr_group_rebalance(struct nh_group *nhg) { int total = 0; int w = 0; @@ -1725,7 +1725,7 @@ static void nh_mp_group_rebalance(struct nh_group *nhg) w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; - atomic_set(&nhge->mpath.upper_bound, upper_bound); + atomic_set(&nhge->hthr.upper_bound, upper_bound); } } @@ -1752,7 +1752,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; - newg->mpath = nhg->mpath; + newg->hash_threshold = nhg->hash_threshold; newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; @@ -1781,8 +1781,8 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, j++; } - if (newg->mpath) - nh_mp_group_rebalance(newg); + if (newg->hash_threshold) + nh_hthr_group_rebalance(newg); else if (newg->resilient) replace_nexthop_grp_res(nhg, newg); @@ -1794,7 +1794,7 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, /* Removal of a NH from a resilient group is notified through * bucket notifications. 
*/ - if (newg->mpath) { + if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); if (err) @@ -1928,12 +1928,12 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, oldg = rtnl_dereference(old->nh_grp); newg = rtnl_dereference(new->nh_grp); - if (newg->mpath != oldg->mpath) { + if (newg->hash_threshold != oldg->hash_threshold) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL; } - if (newg->mpath) { + if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) @@ -2063,7 +2063,7 @@ static int replace_nexthop_single_notify(struct net *net, struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); struct nh_res_table *res_table; - if (nhg->mpath) { + if (nhg->hash_threshold) { return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, group_nh, extack); } else if (nhg->resilient) { @@ -2328,8 +2328,8 @@ static int insert_nexthop(struct net *net, struct nexthop *new_nh, rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); - /* The initial insertion is a full notification for mpath as well - * as resilient groups. + /* The initial insertion is a full notification for hash-threshold as + * well as resilient groups. 
*/ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) @@ -2438,7 +2438,7 @@ static struct nexthop *nexthop_create_group(struct net *net, } if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { - nhg->mpath = 1; + nhg->hash_threshold = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { struct nh_res_table *res_table; @@ -2455,10 +2455,10 @@ static struct nexthop *nexthop_create_group(struct net *net, nhg->is_multipath = true; } - WARN_ON_ONCE(nhg->mpath + nhg->resilient != 1); + WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); - if (nhg->mpath) - nh_mp_group_rebalance(nhg); + if (nhg->hash_threshold) + nh_hthr_group_rebalance(nhg); if (cfg->nh_fdb) nhg->fdb_nh = 1; -- cgit v1.2.3 From f318482a1c57315d0efccd2861f153f55c2117c6 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 19 Mar 2021 15:21:32 +0100 Subject: can: dev: can_free_echo_skb(): extend to return can frame length In order to implement byte queue limits (bql) in CAN drivers, the length of the CAN frame needs to be passed into the networking stack even if the transmission failed for some reason. To avoid to calculate this length twice, extend can_free_echo_skb() to return that value. Convert all users of this function, too. 
This patch is the natural extension of commit: | 9420e1d495e2 ("can: dev: can_get_echo_skb(): extend to return can | frame length") Link: https://lore.kernel.org/r/20210319142700.305648-3-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/skb.c | 11 +++++++++-- drivers/net/can/grcan.c | 2 +- drivers/net/can/m_can/m_can.c | 2 +- drivers/net/can/rcar/rcar_can.c | 2 +- drivers/net/can/rcar/rcar_canfd.c | 2 +- drivers/net/can/sja1000/sja1000.c | 2 +- drivers/net/can/spi/hi311x.c | 2 +- drivers/net/can/spi/mcp251x.c | 2 +- drivers/net/can/usb/ems_usb.c | 2 +- drivers/net/can/usb/esd_usb2.c | 4 ++-- drivers/net/can/usb/gs_usb.c | 2 +- drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c | 2 +- drivers/net/can/usb/mcba_usb.c | 2 +- drivers/net/can/usb/peak_usb/pcan_usb_core.c | 2 +- drivers/net/can/usb/ucan.c | 6 +++--- drivers/net/can/usb/usb_8dev.c | 2 +- include/linux/can/skb.h | 3 ++- 17 files changed, 29 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c index 2256391ddbb3..387c0bc0fb9c 100644 --- a/drivers/net/can/dev/skb.c +++ b/drivers/net/can/dev/skb.c @@ -153,7 +153,8 @@ EXPORT_SYMBOL_GPL(can_get_echo_skb); * * The function is typically called when TX failed. 
*/ -void can_free_echo_skb(struct net_device *dev, unsigned int idx) +void can_free_echo_skb(struct net_device *dev, unsigned int idx, + unsigned int *frame_len_ptr) { struct can_priv *priv = netdev_priv(dev); @@ -164,7 +165,13 @@ void can_free_echo_skb(struct net_device *dev, unsigned int idx) } if (priv->echo_skb[idx]) { - dev_kfree_skb_any(priv->echo_skb[idx]); + struct sk_buff *skb = priv->echo_skb[idx]; + struct can_skb_priv *can_skb_priv = can_skb_prv(skb); + + if (frame_len_ptr) + *frame_len_ptr = can_skb_priv->frame_len; + + dev_kfree_skb_any(skb); priv->echo_skb[idx] = NULL; } } diff --git a/drivers/net/can/grcan.c b/drivers/net/can/grcan.c index 4a8453290530..78e27940b2af 100644 --- a/drivers/net/can/grcan.c +++ b/drivers/net/can/grcan.c @@ -520,7 +520,7 @@ static int catch_up_echo_skb(struct net_device *dev, int budget, bool echo) can_get_echo_skb(dev, i, NULL); } else { /* For cleanup of untransmitted messages */ - can_free_echo_skb(dev, i); + can_free_echo_skb(dev, i, NULL); } priv->eskbp = grcan_ring_add(priv->eskbp, GRCAN_MSG_SIZE, diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 0c8d36bc668c..2ae3da16cbfe 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -425,7 +425,7 @@ static void m_can_clean(struct net_device *net) putidx = ((m_can_read(cdev, M_CAN_TXFQS) & TXFQS_TFQPI_MASK) >> TXFQS_TFQPI_SHIFT); - can_free_echo_skb(cdev->net, putidx); + can_free_echo_skb(cdev->net, putidx, NULL); cdev->tx_skb = NULL; } } diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c index 4870c4ea190a..00e4533c8bdd 100644 --- a/drivers/net/can/rcar/rcar_can.c +++ b/drivers/net/can/rcar/rcar_can.c @@ -217,7 +217,7 @@ static void tx_failure_cleanup(struct net_device *ndev) int i; for (i = 0; i < RCAR_CAN_FIFO_DEPTH; i++) - can_free_echo_skb(ndev, i); + can_free_echo_skb(ndev, i, NULL); } static void rcar_can_error(struct net_device *ndev) diff --git a/drivers/net/can/rcar/rcar_canfd.c 
b/drivers/net/can/rcar/rcar_canfd.c index d8d233e62990..311e6ca3bdc4 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -617,7 +617,7 @@ static void rcar_canfd_tx_failure_cleanup(struct net_device *ndev) u32 i; for (i = 0; i < RCANFD_FIFO_DEPTH; i++) - can_free_echo_skb(ndev, i); + can_free_echo_skb(ndev, i, NULL); } static int rcar_canfd_reset_controller(struct rcar_canfd_global *gpriv) diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index 9e86488ba55f..3fad54646746 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -525,7 +525,7 @@ irqreturn_t sja1000_interrupt(int irq, void *dev_id) if (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT && !(status & SR_TCS)) { stats->tx_errors++; - can_free_echo_skb(dev, 0); + can_free_echo_skb(dev, 0, NULL); } else { /* transmission complete */ stats->tx_bytes += diff --git a/drivers/net/can/spi/hi311x.c b/drivers/net/can/spi/hi311x.c index c3e020c90111..6f5d6d04a8b9 100644 --- a/drivers/net/can/spi/hi311x.c +++ b/drivers/net/can/spi/hi311x.c @@ -179,7 +179,7 @@ static void hi3110_clean(struct net_device *net) net->stats.tx_errors++; dev_kfree_skb(priv->tx_skb); if (priv->tx_len) - can_free_echo_skb(priv->net, 0); + can_free_echo_skb(priv->net, 0, NULL); priv->tx_skb = NULL; priv->tx_len = 0; } diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index f69fb4238a65..80ab1593ca31 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -276,7 +276,7 @@ static void mcp251x_clean(struct net_device *net) net->stats.tx_errors++; dev_kfree_skb(priv->tx_skb); if (priv->tx_len) - can_free_echo_skb(priv->net, 0); + can_free_echo_skb(priv->net, 0, NULL); priv->tx_skb = NULL; priv->tx_len = 0; } diff --git a/drivers/net/can/usb/ems_usb.c b/drivers/net/can/usb/ems_usb.c index 18f40eb20360..5af69787d9d5 100644 --- a/drivers/net/can/usb/ems_usb.c +++ b/drivers/net/can/usb/ems_usb.c @@ -807,7 
+807,7 @@ static netdev_tx_t ems_usb_start_xmit(struct sk_buff *skb, struct net_device *ne err = usb_submit_urb(urb, GFP_ATOMIC); if (unlikely(err)) { - can_free_echo_skb(netdev, context->echo_index); + can_free_echo_skb(netdev, context->echo_index, NULL); usb_unanchor_urb(urb); usb_free_coherent(dev->udev, size, buf, urb->transfer_dma); diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c index 562acbf454fd..65b58f8fc328 100644 --- a/drivers/net/can/usb/esd_usb2.c +++ b/drivers/net/can/usb/esd_usb2.c @@ -360,7 +360,7 @@ static void esd_usb2_tx_done_msg(struct esd_usb2_net_priv *priv, can_get_echo_skb(netdev, context->echo_index, NULL); } else { stats->tx_errors++; - can_free_echo_skb(netdev, context->echo_index); + can_free_echo_skb(netdev, context->echo_index, NULL); } /* Release context */ @@ -793,7 +793,7 @@ static netdev_tx_t esd_usb2_start_xmit(struct sk_buff *skb, err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { - can_free_echo_skb(netdev, context->echo_index); + can_free_echo_skb(netdev, context->echo_index, NULL); atomic_dec(&priv->active_tx_jobs); usb_unanchor_urb(urb); diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index a00dc1904415..5e892bef46b0 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -533,7 +533,7 @@ static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb, if (unlikely(rc)) { /* usb send failed */ atomic_dec(&dev->active_tx_urbs); - can_free_echo_skb(netdev, idx); + can_free_echo_skb(netdev, idx, NULL); gs_free_tx_context(txc); usb_unanchor_urb(urb); diff --git a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c index 4e97da8434ab..90ebcae13409 100644 --- a/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c +++ b/drivers/net/can/usb/kvaser_usb/kvaser_usb_core.c @@ -593,7 +593,7 @@ static netdev_tx_t kvaser_usb_start_xmit(struct sk_buff *skb, if (unlikely(err)) { spin_lock_irqsave(&priv->tx_contexts_lock, flags); - 
can_free_echo_skb(netdev, context->echo_index); + can_free_echo_skb(netdev, context->echo_index, NULL); context->echo_index = dev->max_tx_urbs; --priv->active_tx_contexts; netif_wake_queue(netdev); diff --git a/drivers/net/can/usb/mcba_usb.c b/drivers/net/can/usb/mcba_usb.c index 1f649d178010..029e77dfa773 100644 --- a/drivers/net/can/usb/mcba_usb.c +++ b/drivers/net/can/usb/mcba_usb.c @@ -364,7 +364,7 @@ static netdev_tx_t mcba_usb_start_xmit(struct sk_buff *skb, return NETDEV_TX_OK; xmit_failed: - can_free_echo_skb(priv->netdev, ctx->ndx); + can_free_echo_skb(priv->netdev, ctx->ndx, NULL); mcba_usb_free_ctx(ctx); dev_kfree_skb(skb); stats->tx_dropped++; diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index 573b11559d73..29227b5851fe 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -371,7 +371,7 @@ static netdev_tx_t peak_usb_ndo_start_xmit(struct sk_buff *skb, err = usb_submit_urb(urb, GFP_ATOMIC); if (err) { - can_free_echo_skb(netdev, context->echo_index); + can_free_echo_skb(netdev, context->echo_index, NULL); usb_unanchor_urb(urb); diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c index fa403c080871..11fddedc36d4 100644 --- a/drivers/net/can/usb/ucan.c +++ b/drivers/net/can/usb/ucan.c @@ -675,7 +675,7 @@ static void ucan_tx_complete_msg(struct ucan_priv *up, can_get_echo_skb(up->netdev, echo_index, NULL); } else { up->netdev->stats.tx_dropped++; - can_free_echo_skb(up->netdev, echo_index); + can_free_echo_skb(up->netdev, echo_index, NULL); } spin_unlock_irqrestore(&up->echo_skb_lock, flags); } @@ -843,7 +843,7 @@ static void ucan_write_bulk_callback(struct urb *urb) /* update counters an cleanup */ spin_lock_irqsave(&up->echo_skb_lock, flags); - can_free_echo_skb(up->netdev, context - up->context_array); + can_free_echo_skb(up->netdev, context - up->context_array, NULL); spin_unlock_irqrestore(&up->echo_skb_lock, flags); 
up->netdev->stats.tx_dropped++; @@ -1157,7 +1157,7 @@ static netdev_tx_t ucan_start_xmit(struct sk_buff *skb, * frees the skb */ spin_lock_irqsave(&up->echo_skb_lock, flags); - can_free_echo_skb(up->netdev, echo_index); + can_free_echo_skb(up->netdev, echo_index, NULL); spin_unlock_irqrestore(&up->echo_skb_lock, flags); if (ret == -ENODEV) { diff --git a/drivers/net/can/usb/usb_8dev.c b/drivers/net/can/usb/usb_8dev.c index e8c42430a4fc..b6e7ef0d5bc6 100644 --- a/drivers/net/can/usb/usb_8dev.c +++ b/drivers/net/can/usb/usb_8dev.c @@ -691,7 +691,7 @@ nofreecontext: return NETDEV_TX_BUSY; failed: - can_free_echo_skb(netdev, context->echo_index); + can_free_echo_skb(netdev, context->echo_index, NULL); usb_unanchor_urb(urb); usb_free_coherent(priv->udev, size, buf, urb->transfer_dma); diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index d438eb058069..d311bc369a39 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -23,7 +23,8 @@ struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr, unsigned int *frame_len_ptr); unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); -void can_free_echo_skb(struct net_device *dev, unsigned int idx); +void can_free_echo_skb(struct net_device *dev, unsigned int idx, + unsigned int *frame_len_ptr); struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd); -- cgit v1.2.3 From 289ea9e4ae595545e736a63ccaadba65f880e9a4 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 24 Feb 2021 09:20:04 +0900 Subject: can: add new CAN FD bittiming parameters: Transmitter Delay Compensation (TDC) At high bit rates, the propagation delay from the TX pin to the RX pin of the transceiver causes measurement errors: the sample point on the RX pin might occur on the previous bit. 
This issue is addressed in ISO 11898-1 section 11.3.3 "Transmitter delay compensation" (TDC). This patch adds two new structures: can_tdc and can_tdc_const in order to implement this TDC. The structures are then added to can_priv. A controller supports TDC if and only if can_priv::tdc_const is not NULL. TDC is active if and only if: - fd flag is on - can_priv::tdc.tdco is not zero. It is the driver's responsibility to check those two conditions are met. No new controller modes are introduced (i.e. no CAN_CTRL_MODE_TDC) in order not to be redundant with above logic. The names of the parameters are chosen to match existing CAN controllers specification. References: - Bosch C_CAN FD8: https://www.bosch-semiconductors.com/media/ip_modules/pdf_2/c_can_fd8/users_manual_c_can_fd8_r210_1.pdf - Microchip CAN FD Controller Module: http://ww1.microchip.com/downloads/en/DeviceDoc/MCP251XXFD-CAN-FD-Controller-Module-Family-Reference-Manual-20005678B.pdf - SAM E701/S70/V70/V71 Family: https://www.mouser.com/datasheet/2/268/60001527A-1284321.pdf Link: https://lore.kernel.org/r/20210224002008.4158-2-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 65 +++++++++++++++++++++++++++++++++++++++++++ include/linux/can/dev.h | 3 ++ 2 files changed, 68 insertions(+) (limited to 'include') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 707575c668f4..b31a49f19b47 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2020 Pengutronix, Marc Kleine-Budde + * Copyright (c) 2021 Vincent Mailhol */ #ifndef _CAN_BITTIMING_H @@ -10,6 +11,70 @@ #define CAN_SYNC_SEG 1 +/* + * struct can_tdc - CAN FD Transmission Delay Compensation parameters + * + * At high bit rates, the propagation delay from the TX pin to the RX + * pin of the transceiver causes measurement errors: the sample point + * on
the RX pin might occur on the previous bit. + * + * To solve this issue, ISO 11898-1 introduces in section 11.3.3 + * "Transmitter delay compensation" a SSP (Secondary Sample Point) + * equal to the distance, in time quanta, from the start of the bit + * time on the TX pin to the actual measurement on the RX pin. + * + * This structure contains the parameters to calculate that SSP. + * + * @tdcv: Transmitter Delay Compensation Value. Distance, in time + * quanta, from when the bit is sent on the TX pin to when it is + * received on the RX pin of the transmitter. Possible options: + * + * 0: automatic mode. The controller dynamically measures @tdcv + * for each transmitted CAN FD frame. + * + * Other values: manual mode. Use the fixed provided value. + * + * @tdco: Transmitter Delay Compensation Offset. Offset value, in time + * quanta, defining the distance between the start of the bit + * reception on the RX pin of the transceiver and the SSP + * position such that SSP = @tdcv + @tdco. + * + * If @tdco is zero, then TDC is disabled and both @tdcv and + * @tdcf should be ignored. + * + * @tdcf: Transmitter Delay Compensation Filter window. Defines the + * minimum value for the SSP position in time quanta. If SSP is + * less than @tdcf, then no delay compensations occur and the + * normal sampling point is used instead. The feature is enabled + * if and only if @tdcv is set to zero (automatic mode) and @tdcf + * is configured to a value greater than @tdco. + */ +struct can_tdc { + u32 tdcv; + u32 tdco; + u32 tdcf; +}; + +/* + * struct can_tdc_const - CAN hardware-dependent constant for + * Transmission Delay Compensation + * + * @tdcv_max: Transmitter Delay Compensation Value maximum value. + * Should be set to zero if the controller does not support + * manual mode for tdcv. + * @tdco_max: Transmitter Delay Compensation Offset maximum value. + * Should not be zero. If the controller does not support TDC, + * then the pointer to this structure should be NULL.
+ * @tdcf_max: Transmitter Delay Compensation Filter window maximum + * value. Should be set to zero if the controller does not + * support this feature. + */ +struct can_tdc_const { + u32 tdcv_max; + u32 tdco_max; + u32 tdcf_max; +}; + #ifdef CONFIG_CAN_CALC_BITTIMING int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc); diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index ac4d83a1ab81..4795da0eb949 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -42,6 +42,9 @@ struct can_priv { struct can_bittiming bittiming, data_bittiming; const struct can_bittiming_const *bittiming_const, *data_bittiming_const; + struct can_tdc tdc; + const struct can_tdc_const *tdc_const; + const u16 *termination_const; unsigned int termination_const_cnt; u16 termination; -- cgit v1.2.3 From 4c9258dd26fdb3bacb35e767fa55c9a03a78a08e Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 24 Feb 2021 09:20:05 +0900 Subject: can: dev: reorder struct can_priv members for better packing Save eight bytes of holes on x86-64 architectures by reordering struct can_priv members. 
Before: $ pahole -C can_priv drivers/net/can/dev/dev.o struct can_priv { struct net_device * dev; /* 0 8 */ struct can_device_stats can_stats; /* 8 24 */ struct can_bittiming bittiming; /* 32 32 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct can_bittiming data_bittiming; /* 64 32 */ const struct can_bittiming_const * bittiming_const; /* 96 8 */ const struct can_bittiming_const * data_bittiming_const; /* 104 8 */ struct can_tdc tdc; /* 112 12 */ /* XXX 4 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ const struct can_tdc_const * tdc_const; /* 128 8 */ const u16 * termination_const; /* 136 8 */ unsigned int termination_const_cnt; /* 144 4 */ u16 termination; /* 148 2 */ /* XXX 2 bytes hole, try to pack */ const u32 * bitrate_const; /* 152 8 */ unsigned int bitrate_const_cnt; /* 160 4 */ /* XXX 4 bytes hole, try to pack */ const u32 * data_bitrate_const; /* 168 8 */ unsigned int data_bitrate_const_cnt; /* 176 4 */ u32 bitrate_max; /* 180 4 */ struct can_clock clock; /* 184 4 */ enum can_state state; /* 188 4 */ /* --- cacheline 3 boundary (192 bytes) --- */ u32 ctrlmode; /* 192 4 */ u32 ctrlmode_supported; /* 196 4 */ u32 ctrlmode_static; /* 200 4 */ int restart_ms; /* 204 4 */ struct delayed_work restart_work; /* 208 168 */ /* XXX last struct has 4 bytes of padding */ /* --- cacheline 5 boundary (320 bytes) was 56 bytes ago --- */ int (*do_set_bittiming)(struct net_device *); /* 376 8 */ /* --- cacheline 6 boundary (384 bytes) --- */ int (*do_set_data_bittiming)(struct net_device *); /* 384 8 */ int (*do_set_mode)(struct net_device *, enum can_mode); /* 392 8 */ int (*do_set_termination)(struct net_device *, u16); /* 400 8 */ int (*do_get_state)(const struct net_device *, enum can_state *); /* 408 8 */ int (*do_get_berr_counter)(const struct net_device *, struct can_berr_counter *); /* 416 8 */ unsigned int echo_skb_max; /* 424 4 */ /* XXX 4 bytes hole, try to pack */ struct sk_buff * * echo_skb; /* 432 8 */ /* size: 440, 
cachelines: 7, members: 31 */ /* sum members: 426, holes: 4, sum holes: 14 */ /* paddings: 1, sum paddings: 4 */ /* last cacheline: 56 bytes */ }; After: $ pahole -C can_priv drivers/net/can/dev/dev.o struct can_priv { struct net_device * dev; /* 0 8 */ struct can_device_stats can_stats; /* 8 24 */ const struct can_bittiming_const * bittiming_const; /* 32 8 */ const struct can_bittiming_const * data_bittiming_const; /* 40 8 */ struct can_bittiming bittiming; /* 48 32 */ /* --- cacheline 1 boundary (64 bytes) was 16 bytes ago --- */ struct can_bittiming data_bittiming; /* 80 32 */ const struct can_tdc_const * tdc_const; /* 112 8 */ struct can_tdc tdc; /* 120 12 */ /* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */ unsigned int bitrate_const_cnt; /* 132 4 */ const u32 * bitrate_const; /* 136 8 */ const u32 * data_bitrate_const; /* 144 8 */ unsigned int data_bitrate_const_cnt; /* 152 4 */ u32 bitrate_max; /* 156 4 */ struct can_clock clock; /* 160 4 */ unsigned int termination_const_cnt; /* 164 4 */ const u16 * termination_const; /* 168 8 */ u16 termination; /* 176 2 */ /* XXX 2 bytes hole, try to pack */ enum can_state state; /* 180 4 */ u32 ctrlmode; /* 184 4 */ u32 ctrlmode_supported; /* 188 4 */ /* --- cacheline 3 boundary (192 bytes) --- */ u32 ctrlmode_static; /* 192 4 */ int restart_ms; /* 196 4 */ struct delayed_work restart_work; /* 200 168 */ /* XXX last struct has 4 bytes of padding */ /* --- cacheline 5 boundary (320 bytes) was 48 bytes ago --- */ int (*do_set_bittiming)(struct net_device *); /* 368 8 */ int (*do_set_data_bittiming)(struct net_device *); /* 376 8 */ /* --- cacheline 6 boundary (384 bytes) --- */ int (*do_set_mode)(struct net_device *, enum can_mode); /* 384 8 */ int (*do_set_termination)(struct net_device *, u16); /* 392 8 */ int (*do_get_state)(const struct net_device *, enum can_state *); /* 400 8 */ int (*do_get_berr_counter)(const struct net_device *, struct can_berr_counter *); /* 408 8 */ unsigned int echo_skb_max; /* 416 
4 */ /* XXX 4 bytes hole, try to pack */ struct sk_buff * * echo_skb; /* 424 8 */ /* size: 432, cachelines: 7, members: 31 */ /* sum members: 426, holes: 2, sum holes: 6 */ /* paddings: 1, sum paddings: 4 */ /* last cacheline: 48 bytes */ }; Link: https://lore.kernel.org/r/20210224002008.4158-3-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 4795da0eb949..27b275e463da 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -39,22 +39,23 @@ struct can_priv { struct net_device *dev; struct can_device_stats can_stats; - struct can_bittiming bittiming, data_bittiming; const struct can_bittiming_const *bittiming_const, *data_bittiming_const; - struct can_tdc tdc; + struct can_bittiming bittiming, data_bittiming; const struct can_tdc_const *tdc_const; + struct can_tdc tdc; - const u16 *termination_const; - unsigned int termination_const_cnt; - u16 termination; - const u32 *bitrate_const; unsigned int bitrate_const_cnt; + const u32 *bitrate_const; const u32 *data_bitrate_const; unsigned int data_bitrate_const_cnt; u32 bitrate_max; struct can_clock clock; + unsigned int termination_const_cnt; + const u16 *termination_const; + u16 termination; + enum can_state state; /* CAN controller features - see include/uapi/linux/can/netlink.h */ -- cgit v1.2.3 From c25cc7993243fdc00ab7e608e3764819538015ab Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 24 Feb 2021 09:20:08 +0900 Subject: can: bittiming: add calculation for CAN FD Transmitter Delay Compensation (TDC) The logic for the tdco calculation is to just reuse the normal sample point: tdco = sp. Because the sample point is expressed in tenth of percent and the tdco is expressed in time quanta, a conversion is needed. At the end, ssp = tdcv + tdco = tdcv + sp. 
Another popular method is to set tdco to the middle of the bit: tdc->tdco = can_bit_time(dbt) / 2 During benchmark tests, we could not find a clear advantage for one of the two methods. The tdco calculation is triggered each time the data_bittiming is changed so that users relying on automated calculation can use the netlink interface the exact same way without need of new parameters. For example, a command such as: ip link set canX type can bitrate 500000 dbitrate 4000000 fd on would trigger the calculation. The user using CONFIG_CAN_CALC_BITTIMING who does not want automated calculation needs to manually set tdco to zero. For example with: ip link set canX type can tdco 0 bitrate 500000 dbitrate 4000000 fd on (if the tdco parameter is provided in a previous command, it will be overwritten). If tdcv is set to zero (default), it is automatically calculated by the transceiver for each frame. As such, there is no code in the kernel to calculate it. tdcf has no automated calculation functions because we could not figure out a formula for this parameter.
Link: https://lore.kernel.org/r/20210224002008.4158-6-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/bittiming.c | 24 ++++++++++++++++++++++++ drivers/net/can/dev/netlink.c | 2 ++ include/linux/can/bittiming.h | 6 ++++++ 3 files changed, 32 insertions(+) (limited to 'include') diff --git a/drivers/net/can/dev/bittiming.c b/drivers/net/can/dev/bittiming.c index f7fe226bb395..2907e60c9a57 100644 --- a/drivers/net/can/dev/bittiming.c +++ b/drivers/net/can/dev/bittiming.c @@ -174,6 +174,30 @@ int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, return 0; } + +void can_calc_tdco(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + const struct can_bittiming *dbt = &priv->data_bittiming; + struct can_tdc *tdc = &priv->tdc; + const struct can_tdc_const *tdc_const = priv->tdc_const; + + if (!tdc_const) + return; + + /* As specified in ISO 11898-1 section 11.3.3 "Transmitter + * delay compensation" (TDC) is only applicable if data BRP is + * one or two. 
+ */ + if (dbt->brp == 1 || dbt->brp == 2) { + /* Reuse "normal" sample point and convert it to time quanta */ + u32 sample_point_in_tq = can_bit_time(dbt) * dbt->sample_point / 1000; + + tdc->tdco = min(sample_point_in_tq, tdc_const->tdco_max); + } else { + tdc->tdco = 0; + } +} #endif /* CONFIG_CAN_CALC_BITTIMING */ /* Checks the validity of the specified bit-timing parameters prop_seg, diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index 8443480a703d..e38c2566aff4 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -186,6 +186,8 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], memcpy(&priv->data_bittiming, &dbt, sizeof(dbt)); + can_calc_tdco(dev); + if (priv->do_set_data_bittiming) { /* Finally, set the bit-timing registers */ err = priv->do_set_data_bittiming(dev); diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index b31a49f19b47..3c4cad7b52c0 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -78,6 +78,8 @@ struct can_tdc_const { #ifdef CONFIG_CAN_CALC_BITTIMING int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc); + +void can_calc_tdco(struct net_device *dev); #else /* !CONFIG_CAN_CALC_BITTIMING */ static inline int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, @@ -86,6 +88,10 @@ can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, netdev_err(dev, "bit-timing calculation not available\n"); return -EINVAL; } + +static inline void can_calc_tdco(struct net_device *dev) +{ +} #endif /* CONFIG_CAN_CALC_BITTIMING */ int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, -- cgit v1.2.3 From 1d7750760b70ba8b0e641146eee1b3a343d1b292 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 6 Mar 2021 14:40:40 +0900 Subject: can: bittiming: add CAN_KBPS, CAN_MBPS and CAN_MHZ macros Add three macro to simplify the 
readability of big bit timing numbers: - CAN_KBPS: kilobits per second (one thousand) - CAN_MBPS: megabits per second (one million) - CAN_MHZ: megahertz (one million) Example: u32 bitrate_max = 8 * CAN_MBPS; struct can_clock clock = {.freq = 80 * CAN_MHZ}; instead of: u32 bitrate_max = 8000000; struct can_clock clock = {.freq = 80000000}; Apply the new macros to drivers/net/can/dev/bittiming.c. Link: https://lore.kernel.org/r/20210306054040.76483-1-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/bittiming.c | 4 ++-- include/linux/can/bittiming.h | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/can/dev/bittiming.c b/drivers/net/can/dev/bittiming.c index 2907e60c9a57..f49170eadd54 100644 --- a/drivers/net/can/dev/bittiming.c +++ b/drivers/net/can/dev/bittiming.c @@ -81,9 +81,9 @@ int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, if (bt->sample_point) { sample_point_nominal = bt->sample_point; } else { - if (bt->bitrate > 800000) + if (bt->bitrate > 800 * CAN_KBPS) sample_point_nominal = 750; - else if (bt->bitrate > 500000) + else if (bt->bitrate > 500 * CAN_KBPS) sample_point_nominal = 800; else sample_point_nominal = 875; diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 3c4cad7b52c0..ae7a3411167c 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -11,6 +11,14 @@ #define CAN_SYNC_SEG 1 + +/* Kilobits and Megabits per second */ +#define CAN_KBPS 1000UL +#define CAN_MBPS 1000000UL + +/* Megahertz */ +#define CAN_MHZ 1000000UL + /* * struct can_tdc - CAN FD Transmission Delay Compensation parameters * -- cgit v1.2.3 From 2b246b2569cd2ac6ff700d0dce56b8bae29b1842 Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Mon, 29 Mar 2021 18:45:15 -0700 Subject: icmp: add support for RFC 8335 PROBE Add definitions for PROBE ICMP types and codes.
Add AFI definitions for IP and IPV6 as specified by IANA Add a struct to represent the additional header when probing by IP address (ctype == 3) for use in parsing incoming PROBE messages Add a struct to represent the entire Interface Identification Object (IIO) section of an incoming PROBE packet Signed-off-by: Andreas Roeseler Signed-off-by: David S. Miller --- include/uapi/linux/icmp.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h index fb169a50895e..222325d1d80e 100644 --- a/include/uapi/linux/icmp.h +++ b/include/uapi/linux/icmp.h @@ -20,6 +20,9 @@ #include #include +#include +#include +#include #define ICMP_ECHOREPLY 0 /* Echo Reply */ #define ICMP_DEST_UNREACH 3 /* Destination Unreachable */ @@ -66,6 +69,23 @@ #define ICMP_EXC_TTL 0 /* TTL count exceeded */ #define ICMP_EXC_FRAGTIME 1 /* Fragment Reass time exceeded */ +/* Codes for EXT_ECHO (PROBE) */ +#define ICMP_EXT_ECHO 42 +#define ICMP_EXT_ECHOREPLY 43 +#define ICMP_EXT_MAL_QUERY 1 /* Malformed Query */ +#define ICMP_EXT_NO_IF 2 /* No such Interface */ +#define ICMP_EXT_NO_TABLE_ENT 3 /* No such Table Entry */ +#define ICMP_EXT_MULT_IFS 4 /* Multiple Interfaces Satisfy Query */ + +/* Constants for EXT_ECHO (PROBE) */ +#define EXT_ECHOREPLY_ACTIVE (1 << 2)/* active bit in reply message */ +#define EXT_ECHOREPLY_IPV4 (1 << 1)/* ipv4 bit in reply message */ +#define EXT_ECHOREPLY_IPV6 1 /* ipv6 bit in reply message */ +#define EXT_ECHO_CTYPE_NAME 1 +#define EXT_ECHO_CTYPE_INDEX 2 +#define EXT_ECHO_CTYPE_ADDR 3 +#define ICMP_AFI_IP 1 /* Address Family Identifier for ipv4 */ +#define ICMP_AFI_IP6 2 /* Address Family Identifier for ipv6 */ struct icmphdr { __u8 type; @@ -118,4 +138,26 @@ struct icmp_extobj_hdr { __u8 class_type; }; +/* RFC 8335: 2.1 Header for c-type 3 payload */ +struct icmp_ext_echo_ctype3_hdr { + __be16 afi; + __u8 addrlen; + __u8 reserved; +}; + +/* RFC 8335: 2.1 
Interface Identification Object */ +struct icmp_ext_echo_iio { + struct icmp_extobj_hdr extobj_hdr; + union { + char name[IFNAMSIZ]; + __be32 ifindex; + struct { + struct icmp_ext_echo_ctype3_hdr ctype3_hdr; + union { + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + } ip_addr; + } addr; + } ident; +}; #endif /* _UAPI_LINUX_ICMP_H */ -- cgit v1.2.3 From 750f4fc2a12f6632b5aa04526bf57fa06bfe8467 Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Mon, 29 Mar 2021 18:45:21 -0700 Subject: ICMPV6: add support for RFC 8335 PROBE Add definitions for the ICMPV6 type of Extended Echo Request and Extended Echo Reply, as defined by sections 2 and 3 of RFC 8335. Signed-off-by: Andreas Roeseler Signed-off-by: David S. Miller --- include/uapi/linux/icmpv6.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index 0564fd7ccde4..ecaece3af38d 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -140,6 +140,9 @@ struct icmp6hdr { #define ICMPV6_UNK_OPTION 2 #define ICMPV6_HDR_INCOMP 3 +/* Codes for EXT_ECHO (PROBE) */ +#define ICMPV6_EXT_ECHO_REQUEST 160 +#define ICMPV6_EXT_ECHO_REPLY 161 /* * constants for (set|get)sockopt */ -- cgit v1.2.3 From f1b8fa9fa5865c58c093cde6d782104c22df9088 Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Mon, 29 Mar 2021 18:45:29 -0700 Subject: net: add sysctl for enabling RFC 8335 PROBE messages Section 8 of RFC 8335 specifies potential security concerns of responding to PROBE requests, and states that nodes that support PROBE functionality MUST be able to enable/disable responses and that responses MUST be disabled by default Signed-off-by: Andreas Roeseler Signed-off-by: David S. 
Miller --- Documentation/networking/ip-sysctl.rst | 6 ++++++ include/net/netns/ipv4.h | 1 + net/ipv4/sysctl_net_ipv4.c | 9 +++++++++ 3 files changed, 16 insertions(+) (limited to 'include') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index c7952ac5bd2f..4130bce40765 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1143,6 +1143,12 @@ icmp_echo_ignore_all - BOOLEAN Default: 0 +icmp_echo_enable_probe - BOOLEAN + If set to one, then the kernel will respond to RFC 8335 PROBE + requests sent to it. + + Default: 0 + icmp_echo_ignore_broadcasts - BOOLEAN If set non-zero, then the kernel will ignore all ICMP ECHO and TIMESTAMP requests sent to it via broadcast/multicast. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d377266d133f..9c8dd424d79b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -84,6 +84,7 @@ struct netns_ipv4 { #endif u8 sysctl_icmp_echo_ignore_all; + u8 sysctl_icmp_echo_enable_probe; u8 sysctl_icmp_echo_ignore_broadcasts; u8 sysctl_icmp_ignore_bogus_error_responses; u8 sysctl_icmp_errors_use_inbound_ifaddr; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3a7e5cf5d6cc..e3cb2d96b55e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -598,6 +598,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, + { + .procname = "icmp_echo_enable_probe", + .data = &init_net.ipv4.sysctl_icmp_echo_enable_probe, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, { .procname = "icmp_echo_ignore_broadcasts", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, -- cgit v1.2.3 From 504a40113cc4f329dd75bbf6e4b060603224d814 Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Mon, 29 Mar 2021 18:45:43 -0700 Subject: ipv6: add ipv6_dev_find to stubs 
Add ipv6_dev_find to ipv6_stub to allow lookup of net_devices by IPV6 address in net/ipv4/icmp.c. Signed-off-by: Andreas Roeseler Signed-off-by: David S. Miller --- include/net/ipv6_stubs.h | 2 ++ net/ipv6/addrconf_core.c | 7 +++++++ net/ipv6/af_inet6.c | 1 + 3 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 8fce558b5fea..afbce90c4480 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -66,6 +66,8 @@ struct ipv6_stub { int (*ipv6_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); + struct net_device *(*ipv6_dev_find)(struct net *net, const struct in6_addr *addr, + struct net_device *dev); }; extern const struct ipv6_stub *ipv6_stub __read_mostly; diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index a36626afbc02..1d4054bb345b 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -198,6 +198,12 @@ static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct s return -EAFNOSUPPORT; } +static struct net_device *eafnosupport_ipv6_dev_find(struct net *net, const struct in6_addr *addr, + struct net_device *dev) +{ + return ERR_PTR(-EAFNOSUPPORT); +} + const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow, .ipv6_route_input = eafnosupport_ipv6_route_input, @@ -209,6 +215,7 @@ const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .fib6_nh_init = eafnosupport_fib6_nh_init, .ip6_del_rt = eafnosupport_ip6_del_rt, .ipv6_fragment = eafnosupport_ipv6_fragment, + .ipv6_dev_find = eafnosupport_ipv6_dev_find, }; EXPORT_SYMBOL_GPL(ipv6_stub); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3c9bacffc9c3..4f7ca5807046 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -1032,6 +1032,7 @@ static const struct ipv6_stub ipv6_stub_impl = { #endif 
.nd_tbl = &nd_tbl, .ipv6_fragment = ip6_fragment, + .ipv6_dev_find = ipv6_dev_find, }; static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { -- cgit v1.2.3 From 913d55037616659c04763e756f948fcbaef0bbee Mon Sep 17 00:00:00 2001 From: He Fengqing Date: Tue, 30 Mar 2021 02:48:43 +0000 Subject: bpf: Remove unused bpf_load_pointer Remove unused bpf_load_pointer function in filter.h. The last user of it has been removed with 24dea04767e6 ("bpf, x32: remove ld_abs/ld_ind"). Signed-off-by: He Fengqing Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210330024843.3479844-1-hefengqing@huawei.com --- include/linux/filter.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index eecfd82db648..9a09547bc7ba 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1246,15 +1246,6 @@ static inline u16 bpf_anc_helper(const struct sock_filter *ftest) void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size); -static inline void *bpf_load_pointer(const struct sk_buff *skb, int k, - unsigned int size, void *buffer) -{ - if (k >= 0) - return skb_header_pointer(skb, k, size, buffer); - - return bpf_internal_load_pointer_neg_helper(skb, k, size); -} - static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; -- cgit v1.2.3 From 000ac44da7d0adfc5e62e6c019246a4afeeffd04 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Mar 2021 12:28:49 +0200 Subject: udp: fixup csum for GSO receive slow path When UDP packets generated locally by a socket with UDP_SEGMENT traverse the following path: UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx) -> UDP socket (no UDP_GRO) ip_summed will be set to CHECKSUM_PARTIAL at creation time and such checksum mode will be preserved in the above path up to the UDP tunnel receive code where we have: __iptunnel_pull_header() -> skb_pull_rcsum() -> 
skb_postpull_rcsum() -> __skb_postpull_rcsum() The latter will convert the skb to CHECKSUM_NONE. The UDP GSO packet will be later segmented as part of the rx socket receive operation, and will present a CHECKSUM_NONE after segmentation. Additionally the segmented packets UDP CB still refers to the original GSO packet len. Overall that causes unexpected/wrong csum validation errors later in the UDP receive path. We could possibly address the issue with some additional checks and csum mangling in the UDP tunnel code. Since the issue affects only this UDP receive slow path, let's set a suitable csum status there. Note that SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets lacking an UDP encapsulation present a valid checksum when landing to udp_queue_rcv_skb(), as the UDP checksum has been validated by the GRO engine. v2 -> v3: - even more verbose commit message and comments v1 -> v2: - restrict the csum update to the packets strictly needing them - hopefully clarify the commit message and code comments Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/udp.h | 23 +++++++++++++++++++++++ net/ipv4/udp.c | 2 ++ net/ipv6/udp.c | 1 + 3 files changed, 26 insertions(+) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index d4d064c59232..adf2ff8ac87c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -515,6 +515,29 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; } +static inline void udp_post_segment_fix_csum(struct sk_buff *skb) +{ + /* UDP-lite can't land here - no GRO */ + WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov); + + /* UDP packets generated with UDP_SEGMENT and traversing: + * + * UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx) + * + * can reach an UDP socket with CHECKSUM_NONE, because + * __iptunnel_pull_header() converts CHECKSUM_PARTIAL into NONE. 
+ * SKB_GSO_UDP_L4 or SKB_GSO_FRAGLIST packets with no UDP tunnel will + * have a valid checksum, as the GRO engine validates the UDP csum + * before the aggregation and nobody strips such info in between. + * Instead of adding another check in the tunnel fastpath, we can force + * a valid csum after the segmentation. + * Additionally fixup the UDP CB. + */ + UDP_SKB_CB(skb)->cscov = skb->len; + if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid) + skb->csum_valid = 1; +} + #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4a0478b17243..fe85dcf8c008 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2178,6 +2178,8 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) segs = udp_rcv_segment(sk, skb, true); skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); + + udp_post_segment_fix_csum(skb); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d25e5a9252fd..fa2f54738392 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -749,6 +749,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); + udp_post_segment_fix_csum(skb); ret = udpv6_queue_rcv_one_skb(sk, skb); if (ret > 0) ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret, -- cgit v1.2.3 From 78352f73dc5047f3f744764cc45912498c52f3c9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Mar 2021 12:28:52 +0200 Subject: udp: never accept GSO_FRAGLIST packets Currently the UDP protocol delivers GSO_FRAGLIST packets to the sockets without the expected segmentation. This change addresses the issue introducing and maintaining a couple of new fields to explicitly accept SKB_GSO_UDP_L4 or GSO_FRAGLIST packets. 
Additionally update udp_unexpected_gso() accordingly. UDP sockets enabling UDP_GRO still keep accept_udp_fraglist zeroed. v1 -> v2: - use 2 bits instead of a whole GSO bitmask (Willem) Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 16 +++++++++++++--- net/ipv4/udp.c | 3 +++ 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index aa84597bdc33..ae58ff3b6b5b 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -51,7 +51,9 @@ struct udp_sock { * different encapsulation layer set * this */ - gro_enabled:1; /* Can accept GRO packets */ + gro_enabled:1, /* Request GRO aggregation */ + accept_udp_l4:1, + accept_udp_fraglist:1; /* * Following member retains the information to create a UDP header * when the socket is uncorked. @@ -131,8 +133,16 @@ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) { - return !udp_sk(sk)->gro_enabled && skb_is_gso(skb) && - skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4; + if (!skb_is_gso(skb)) + return false; + + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4) + return true; + + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist) + return true; + + return false; } #define udp_portaddr_for_each_entry(__sk, list) \ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fe85dcf8c008..c0695ce42dc5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2666,9 +2666,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, case UDP_GRO: lock_sock(sk); + + /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk->sk_socket); up->gro_enabled = valbool; + up->accept_udp_l4 = valbool; release_sock(sk); break; -- cgit v1.2.3 From
d18931a92a0b5feddd8a39d097b90ae2867db02f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Mar 2021 12:28:53 +0200 Subject: vxlan: allow L4 GRO passthrough When passing up a UDP GSO packet with L4 aggregation, there is no need to segment it at the vxlan level. We can propagate the packet untouched and let it be segmented later, if needed. Introduce a helper to let the UDP socket accept any L4 aggregation and use it in the vxlan driver. v1 -> v2: - updated to use the newly introduced UDP socket 'accept*' fields Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 1 + include/linux/udp.h | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 7665817f3cb6..39ee1300cdd9 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -3484,6 +3484,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, if (err < 0) return ERR_PTR(err); + udp_allow_gso(sock->sk); return sock; } diff --git a/include/linux/udp.h b/include/linux/udp.h index ae58ff3b6b5b..ae66dadd8543 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -145,6 +145,12 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) return false; } +static inline void udp_allow_gso(struct sock *sk) +{ + udp_sk(sk)->accept_udp_l4 = 1; + udp_sk(sk)->accept_udp_fraglist = 1; +} + #define udp_portaddr_for_each_entry(__sk, list) \ hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node) -- cgit v1.2.3 From 77ccee96a67422ac05fc47327cac4e4287fd0d8a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Mar 2021 18:25:09 +0100 Subject: netfilter: nf_log_bridge: merge with nf_log_syslog Provide bridge log support from nf_log_syslog. After the merge there is no need to load the "real packet loggers"; all of them now reside in the same module.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 1 - net/bridge/netfilter/Kconfig | 4 -- net/bridge/netfilter/Makefile | 3 -- net/bridge/netfilter/nf_log_bridge.c | 79 ------------------------------------ net/netfilter/nf_log.c | 7 ---- net/netfilter/nf_log_syslog.c | 22 ++++++++++ 6 files changed, 22 insertions(+), 94 deletions(-) delete mode 100644 net/bridge/netfilter/nf_log_bridge.c (limited to 'include') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 716db4a0fed8..a6b85068c294 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -68,7 +68,6 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf); int nf_logger_find_get(int pf, enum nf_log_type type); void nf_logger_put(int pf, enum nf_log_type type); -void nf_logger_request_module(int pf, enum nf_log_type type); #define MODULE_ALIAS_NF_LOGGER(family, type) \ MODULE_ALIAS("nf-logger-" __stringify(family) "-" __stringify(type)) diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index ac5372121e60..7f304a19ac1b 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -23,10 +23,6 @@ config NFT_BRIDGE_REJECT help Add support to reject packets. 
-config NF_LOG_BRIDGE - tristate "Bridge packet logging" - select NF_LOG_COMMON - endif # NF_TABLES_BRIDGE config NF_CONNTRACK_BRIDGE diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 8e2c5759d964..1c9ce49ab651 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -9,9 +9,6 @@ obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o # connection tracking obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o -# packet logging -obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o - obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o # tables diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c deleted file mode 100644 index 1ad61d1017b6..000000000000 --- a/net/bridge/netfilter/nf_log_bridge.c +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * (C) 2014 by Pablo Neira Ayuso - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -static void nf_log_bridge_packet(struct net *net, u_int8_t pf, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - nf_log_l2packet(net, pf, eth_hdr(skb)->h_proto, hooknum, skb, - in, out, loginfo, prefix); -} - -static struct nf_logger nf_bridge_logger __read_mostly = { - .name = "nf_log_bridge", - .type = NF_LOG_TYPE_LOG, - .logfn = nf_log_bridge_packet, - .me = THIS_MODULE, -}; - -static int __net_init nf_log_bridge_net_init(struct net *net) -{ - return nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); -} - -static void __net_exit nf_log_bridge_net_exit(struct net *net) -{ - nf_log_unset(net, &nf_bridge_logger); -} - -static struct pernet_operations nf_log_bridge_net_ops = { - .init = nf_log_bridge_net_init, - .exit = nf_log_bridge_net_exit, -}; - -static int __init nf_log_bridge_init(void) -{ - int ret; - - /* Request to load the real packet loggers. 
*/ - nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG); - nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG); - nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG); - - ret = register_pernet_subsys(&nf_log_bridge_net_ops); - if (ret < 0) - return ret; - - nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger); - return 0; -} - -static void __exit nf_log_bridge_exit(void) -{ - unregister_pernet_subsys(&nf_log_bridge_net_ops); - nf_log_unregister(&nf_bridge_logger); -} - -module_init(nf_log_bridge_init); -module_exit(nf_log_bridge_exit); - -MODULE_AUTHOR("Pablo Neira Ayuso "); -MODULE_DESCRIPTION("Netfilter bridge packet logging"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 6cb9f9474b05..eaa8181f5ef7 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -151,13 +151,6 @@ void nf_log_unbind_pf(struct net *net, u_int8_t pf) } EXPORT_SYMBOL(nf_log_unbind_pf); -void nf_logger_request_module(int pf, enum nf_log_type type) -{ - if (loggers[pf][type] == NULL) - request_module("nf-logger-%u-%u", pf, type); -} -EXPORT_SYMBOL_GPL(nf_logger_request_module); - int nf_logger_find_get(int pf, enum nf_log_type type) { struct nf_logger *logger; diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 617e0071c0c4..6b56251de22a 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -787,6 +787,13 @@ static struct nf_logger nf_netdev_logger __read_mostly = { .me = THIS_MODULE, }; +static struct nf_logger nf_bridge_logger __read_mostly = { + .name = "nf_log_bridge", + .type = NF_LOG_TYPE_LOG, + .logfn = nf_log_netdev_packet, + .me = THIS_MODULE, +}; + static int __net_init nf_log_syslog_net_init(struct net *net) { int ret = nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger); @@ -805,7 +812,13 @@ static int __net_init nf_log_syslog_net_init(struct net *net) ret = nf_log_set(net, NFPROTO_NETDEV, &nf_netdev_logger); if (ret) goto err3; 
+ + ret = nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger); + if (ret) + goto err4; return 0; +err4: + nf_log_unset(net, &nf_netdev_logger); err3: nf_log_unset(net, &nf_ip6_logger); err2: @@ -852,7 +865,13 @@ static int __init nf_log_syslog_init(void) if (ret < 0) goto err4; + ret = nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger); + if (ret < 0) + goto err5; + return 0; +err5: + nf_log_unregister(&nf_netdev_logger); err4: nf_log_unregister(&nf_ip6_logger); err3: @@ -872,6 +891,7 @@ static void __exit nf_log_syslog_exit(void) nf_log_unregister(&nf_arp_logger); nf_log_unregister(&nf_ip6_logger); nf_log_unregister(&nf_netdev_logger); + nf_log_unregister(&nf_bridge_logger); } module_init(nf_log_syslog_init); @@ -881,9 +901,11 @@ MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("Netfilter syslog packet logging"); MODULE_LICENSE("GPL"); MODULE_ALIAS("nf_log_arp"); +MODULE_ALIAS("nf_log_bridge"); MODULE_ALIAS("nf_log_ipv4"); MODULE_ALIAS("nf_log_ipv6"); MODULE_ALIAS("nf_log_netdev"); +MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0); MODULE_ALIAS_NF_LOGGER(AF_INET, 0); MODULE_ALIAS_NF_LOGGER(3, 0); MODULE_ALIAS_NF_LOGGER(5, 0); /* NFPROTO_NETDEV */ -- cgit v1.2.3 From e465cccd0b9de113a81280bd52ee717bf5e3d1a2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Mar 2021 18:25:10 +0100 Subject: netfilter: nf_log_common: merge with nf_log_syslog Remove nf_log_common. Now that all per-af modules have been merged there is no longer a need to provide a helper module. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 24 ----- net/netfilter/Kconfig | 8 +- net/netfilter/Makefile | 2 - net/netfilter/nf_log_common.c | 224 ----------------------------------------- net/netfilter/nf_log_syslog.c | 181 ++++++++++++++++++++++++++++++++- 5 files changed, 181 insertions(+), 258 deletions(-) delete mode 100644 net/netfilter/nf_log_common.c (limited to 'include') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index a6b85068c294..e55eedc84ed7 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -98,28 +98,4 @@ struct nf_log_buf; struct nf_log_buf *nf_log_buf_open(void); __printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...); void nf_log_buf_close(struct nf_log_buf *m); - -/* common logging functions */ -int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset); -int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset, - unsigned int logflags); -void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, - struct sock *sk); -void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb); -void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix); -void nf_log_l2packet(struct net *net, u_int8_t pf, - __be16 protocol, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, const char *prefix); - #endif /* _NF_LOG_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 6aef981a8446..fcd8682704c4 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -71,16 +71,13 @@ config 
NF_CONNTRACK To compile it as a module, choose M here. If unsure, say N. -config NF_LOG_COMMON - tristate - config NF_LOG_SYSLOG tristate "Syslog packet logging" default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON help This option enable support for packet logging via syslog. - It supports IPv4 and common transport protocols such as TCP and UDP. + It supports IPv4, IPV6, ARP and common transport protocols such + as TCP and UDP. This is a simpler but less flexible logging method compared to CONFIG_NETFILTER_NETLINK_LOG. If both are enabled the backend to use can be configured at run-time @@ -930,7 +927,6 @@ config NETFILTER_XT_TARGET_LED config NETFILTER_XT_TARGET_LOG tristate "LOG target support" - select NF_LOG_COMMON select NF_LOG_SYSLOG select NF_LOG_IPV6 if IP6_NF_IPTABLES default m if NETFILTER_ADVANCED=n diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 429be36fe4c7..e80e010354b1 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -48,8 +48,6 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o nf_nat-y := nf_nat_core.o nf_nat_proto.o nf_nat_helper.o -# generic transport layer logging -obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o obj-$(CONFIG_NF_NAT) += nf_nat.o diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c deleted file mode 100644 index fd7c5f0f5c25..000000000000 --- a/net/netfilter/nf_log_common.c +++ /dev/null @@ -1,224 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset) -{ - struct udphdr _udph; - const struct udphdr *uh; - - if (proto == IPPROTO_UDP) - /* Max length: 10 "PROTO=UDP " */ - 
nf_log_buf_add(m, "PROTO=UDP "); - else /* Max length: 14 "PROTO=UDPLITE " */ - nf_log_buf_add(m, "PROTO=UDPLITE "); - - if (fragment) - goto out; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); - if (uh == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", - ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); - -out: - return 0; -} -EXPORT_SYMBOL_GPL(nf_log_dump_udp_header); - -int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, - u8 proto, int fragment, unsigned int offset, - unsigned int logflags) -{ - struct tcphdr _tcph; - const struct tcphdr *th; - - /* Max length: 10 "PROTO=TCP " */ - nf_log_buf_add(m, "PROTO=TCP "); - - if (fragment) - return 0; - - /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); - if (th == NULL) { - nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); - return 1; - } - - /* Max length: 20 "SPT=65535 DPT=65535 " */ - nf_log_buf_add(m, "SPT=%u DPT=%u ", - ntohs(th->source), ntohs(th->dest)); - /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (logflags & NF_LOG_TCPSEQ) { - nf_log_buf_add(m, "SEQ=%u ACK=%u ", - ntohl(th->seq), ntohl(th->ack_seq)); - } - - /* Max length: 13 "WINDOW=65535 " */ - nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); - /* Max length: 9 "RES=0x3C " */ - nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & - TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ - if (th->cwr) - nf_log_buf_add(m, "CWR "); - if (th->ece) - nf_log_buf_add(m, "ECE "); - if (th->urg) - nf_log_buf_add(m, "URG "); - if (th->ack) - nf_log_buf_add(m, "ACK "); - if (th->psh) - nf_log_buf_add(m, "PSH "); - if (th->rst) - nf_log_buf_add(m, "RST "); - if (th->syn) - nf_log_buf_add(m, "SYN "); 
- if (th->fin) - nf_log_buf_add(m, "FIN "); - /* Max length: 11 "URGP=65535 " */ - nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); - - if ((logflags & NF_LOG_TCPOPT) && th->doff*4 > sizeof(struct tcphdr)) { - u_int8_t _opt[60 - sizeof(struct tcphdr)]; - const u_int8_t *op; - unsigned int i; - unsigned int optsize = th->doff*4 - sizeof(struct tcphdr); - - op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), - optsize, _opt); - if (op == NULL) { - nf_log_buf_add(m, "OPT (TRUNCATED)"); - return 1; - } - - /* Max length: 127 "OPT (" 15*4*2chars ") " */ - nf_log_buf_add(m, "OPT ("); - for (i = 0; i < optsize; i++) - nf_log_buf_add(m, "%02X", op[i]); - - nf_log_buf_add(m, ") "); - } - - return 0; -} -EXPORT_SYMBOL_GPL(nf_log_dump_tcp_header); - -void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, - struct sock *sk) -{ - if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) - return; - - read_lock_bh(&sk->sk_callback_lock); - if (sk->sk_socket && sk->sk_socket->file) { - const struct cred *cred = sk->sk_socket->file->f_cred; - nf_log_buf_add(m, "UID=%u GID=%u ", - from_kuid_munged(&init_user_ns, cred->fsuid), - from_kgid_munged(&init_user_ns, cred->fsgid)); - } - read_unlock_bh(&sk->sk_callback_lock); -} -EXPORT_SYMBOL_GPL(nf_log_dump_sk_uid_gid); - -void -nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, - unsigned int hooknum, const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, const char *prefix) -{ - const struct net_device *physoutdev __maybe_unused; - const struct net_device *physindev __maybe_unused; - - nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", - '0' + loginfo->u.log.level, prefix, - in ? in->name : "", - out ? 
out->name : ""); -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - physindev = nf_bridge_get_physindev(skb); - if (physindev && in != physindev) - nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); - physoutdev = nf_bridge_get_physoutdev(skb); - if (physoutdev && out != physoutdev) - nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); -#endif -} -EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); - -void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) -{ - u16 vid; - - if (!skb_vlan_tag_present(skb)) - return; - - vid = skb_vlan_tag_get(skb); - nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); -} -EXPORT_SYMBOL_GPL(nf_log_dump_vlan); - -/* bridge and netdev logging families share this code. */ -void nf_log_l2packet(struct net *net, u_int8_t pf, - __be16 protocol, - unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct nf_loginfo *loginfo, - const char *prefix) -{ - switch (protocol) { - case htons(ETH_P_IP): - nf_log_packet(net, NFPROTO_IPV4, hooknum, skb, in, out, - loginfo, "%s", prefix); - break; - case htons(ETH_P_IPV6): - nf_log_packet(net, NFPROTO_IPV6, hooknum, skb, in, out, - loginfo, "%s", prefix); - break; - case htons(ETH_P_ARP): - case htons(ETH_P_RARP): - nf_log_packet(net, NFPROTO_ARP, hooknum, skb, in, out, - loginfo, "%s", prefix); - break; - } -} -EXPORT_SYMBOL_GPL(nf_log_l2packet); - -static int __init nf_log_common_init(void) -{ - return 0; -} - -static void __exit nf_log_common_exit(void) {} - -module_init(nf_log_common_init); -module_exit(nf_log_common_exit); - -MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 6b56251de22a..2518818ed479 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -39,6 +40,16 @@ struct arppayload { unsigned char ip_dst[4]; }; +static void nf_log_dump_vlan(struct 
nf_log_buf *m, const struct sk_buff *skb) +{ + u16 vid; + + if (!skb_vlan_tag_present(skb)) + return; + + vid = skb_vlan_tag_get(skb); + nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); +} static void noinline_for_stack dump_arp_packet(struct nf_log_buf *m, const struct nf_loginfo *info, @@ -89,6 +100,30 @@ dump_arp_packet(struct nf_log_buf *m, ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); } +static void +nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf, + unsigned int hooknum, const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, const char *prefix) +{ + const struct net_device *physoutdev __maybe_unused; + const struct net_device *physindev __maybe_unused; + + nf_log_buf_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", + '0' + loginfo->u.log.level, prefix, + in ? in->name : "", + out ? out->name : ""); +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + physindev = nf_bridge_get_physindev(skb); + if (physindev && in != physindev) + nf_log_buf_add(m, "PHYSIN=%s ", physindev->name); + physoutdev = nf_bridge_get_physoutdev(skb); + if (physoutdev && out != physoutdev) + nf_log_buf_add(m, "PHYSOUT=%s ", physoutdev->name); +#endif +} + static void nf_log_arp_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -121,6 +156,138 @@ static struct nf_logger nf_arp_logger __read_mostly = { .me = THIS_MODULE, }; +static void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, + struct sock *sk) +{ + if (!sk || !sk_fullsock(sk) || !net_eq(net, sock_net(sk))) + return; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + const struct cred *cred = sk->sk_socket->file->f_cred; + + nf_log_buf_add(m, "UID=%u GID=%u ", + from_kuid_munged(&init_user_ns, cred->fsuid), + from_kgid_munged(&init_user_ns, cred->fsgid)); + } + read_unlock_bh(&sk->sk_callback_lock); +} + +static noinline_for_stack 
int +nf_log_dump_tcp_header(struct nf_log_buf *m, + const struct sk_buff *skb, + u8 proto, int fragment, + unsigned int offset, + unsigned int logflags) +{ + struct tcphdr _tcph; + const struct tcphdr *th; + + /* Max length: 10 "PROTO=TCP " */ + nf_log_buf_add(m, "PROTO=TCP "); + + if (fragment) + return 0; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (!th) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (logflags & NF_LOG_TCPSEQ) { + nf_log_buf_add(m, "SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + } + + /* Max length: 13 "WINDOW=65535 " */ + nf_log_buf_add(m, "WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & + TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + nf_log_buf_add(m, "CWR "); + if (th->ece) + nf_log_buf_add(m, "ECE "); + if (th->urg) + nf_log_buf_add(m, "URG "); + if (th->ack) + nf_log_buf_add(m, "ACK "); + if (th->psh) + nf_log_buf_add(m, "PSH "); + if (th->rst) + nf_log_buf_add(m, "RST "); + if (th->syn) + nf_log_buf_add(m, "SYN "); + if (th->fin) + nf_log_buf_add(m, "FIN "); + /* Max length: 11 "URGP=65535 " */ + nf_log_buf_add(m, "URGP=%u ", ntohs(th->urg_ptr)); + + if ((logflags & NF_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { + unsigned int optsize = th->doff * 4 - sizeof(struct tcphdr); + u8 _opt[60 - sizeof(struct tcphdr)]; + unsigned int i; + const u8 *op; + + op = skb_header_pointer(skb, offset + sizeof(struct tcphdr), + optsize, _opt); + if (!op) { + nf_log_buf_add(m, "OPT (TRUNCATED)"); + return 1; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + nf_log_buf_add(m, "OPT ("); + for 
(i = 0; i < optsize; i++) + nf_log_buf_add(m, "%02X", op[i]); + + nf_log_buf_add(m, ") "); + } + + return 0; +} + +static noinline_for_stack int +nf_log_dump_udp_header(struct nf_log_buf *m, + const struct sk_buff *skb, + u8 proto, int fragment, + unsigned int offset) +{ + struct udphdr _udph; + const struct udphdr *uh; + + if (proto == IPPROTO_UDP) + /* Max length: 10 "PROTO=UDP " */ + nf_log_buf_add(m, "PROTO=UDP "); + else /* Max length: 14 "PROTO=UDPLITE " */ + nf_log_buf_add(m, "PROTO=UDPLITE "); + + if (fragment) + goto out; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (!uh) { + nf_log_buf_add(m, "INCOMPLETE [%u bytes] ", skb->len - offset); + + return 1; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + nf_log_buf_add(m, "SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); + +out: + return 0; +} + /* One level of recursion won't kill us */ static noinline_for_stack void dump_ipv4_packet(struct net *net, struct nf_log_buf *m, @@ -776,8 +943,18 @@ static void nf_log_netdev_packet(struct net *net, u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { - nf_log_l2packet(net, pf, skb->protocol, hooknum, skb, in, out, - loginfo, prefix); + switch (skb->protocol) { + case htons(ETH_P_IP): + nf_log_ip_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + break; + case htons(ETH_P_IPV6): + nf_log_ip6_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + nf_log_arp_packet(net, pf, hooknum, skb, in, out, loginfo, prefix); + break; + } } static struct nf_logger nf_netdev_logger __read_mostly = { -- cgit v1.2.3 From cefa31a9d46112c0706c218ea549bccf298a0068 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 25 Mar 2021 18:25:12 +0100 Subject: netfilter: nft_log: perform module load from nf_tables modprobe calls from the nf_logger_find_get() API causes deadlock in very special 
cases because they occur with the nf_tables transaction mutex held. In the specific case of nf_log, deadlock is via: A nf_tables -> transaction mutex -> nft_log -> modprobe -> nf_log_syslog \ -> pernet_ops rwsem -> wait for C B netlink event -> rtnl_mutex -> nf_tables transaction mutex -> wait for A C close() -> ip6mr_sk_done -> rtnl_mutex -> wait for B An earlier patch added NFLOG/xt_LOG module softdeps to avoid the need to load the backend module during a transaction. For nft_log we would have to add a softdep for both nfnetlink_log and nf_log_syslog, since we do not know in advance which of the two backends is going to be configured. This defers the modprobe op until after the transaction mutex is released. Tested-by: Phil Sutter Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 +++++ net/netfilter/nf_log.c | 3 --- net/netfilter/nf_tables_api.c | 5 +++-- net/netfilter/nft_log.c | 20 +++++++++++++++++++- 4 files changed, 27 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0cef5ad9768a..8fefa112ae89 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1562,4 +1562,9 @@ void nf_tables_trans_destroy_flush_work(void); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); +#ifdef CONFIG_MODULES +__printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); +#else +static inline int nft_request_module(struct net *net, const char *fmt, ...)
{ return -ENOENT; } +#endif #endif /* _NET_NF_TABLES_H */ diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index eaa8181f5ef7..edee7fa944c1 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -170,9 +170,6 @@ int nf_logger_find_get(int pf, enum nf_log_type type) return 0; } - if (rcu_access_pointer(loggers[pf][type]) == NULL) - request_module("nf-logger-%u-%u", pf, type); - rcu_read_lock(); logger = rcu_dereference(loggers[pf][type]); if (logger == NULL) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fc2526b8bd55..c09b67f2f64c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -586,8 +586,8 @@ struct nft_module_request { }; #ifdef CONFIG_MODULES -static __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, - ...) +__printf(2, 3) int nft_request_module(struct net *net, const char *fmt, + ...) { char module_name[MODULE_NAME_LEN]; struct nft_module_request *req; @@ -620,6 +620,7 @@ static __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, return -EAGAIN; } +EXPORT_SYMBOL_GPL(nft_request_module); #endif static void lockdep_nfnl_nft_mutex_not_held(void) diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index a06a46b039c5..54f6c2035e84 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -128,6 +128,20 @@ static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = { [NFTA_LOG_FLAGS] = { .type = NLA_U32 }, }; +static int nft_log_modprobe(struct net *net, enum nf_log_type t) +{ + switch (t) { + case NF_LOG_TYPE_LOG: + return nft_request_module(net, "%s", "nf_log_syslog"); + case NF_LOG_TYPE_ULOG: + return nft_request_module(net, "%s", "nfnetlink_log"); + case NF_LOG_TYPE_MAX: + break; + } + + return -ENOENT; +} + static int nft_log_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -197,8 +211,12 @@ static int nft_log_init(const struct nft_ctx *ctx, return 0; 
err = nf_logger_find_get(ctx->family, li->type); - if (err < 0) + if (err < 0) { + if (nft_log_modprobe(ctx->net, li->type) == -EAGAIN) + err = -EAGAIN; + goto err1; + } return 0; -- cgit v1.2.3 From 5c701e71961af0ec8227ea615f1646dbe98aea1a Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Sat, 27 Mar 2021 10:54:47 +0800 Subject: netfilter: ipset: Remove duplicate declaration struct ip_set is declared twice. One is declared at 79th line, so remove the duplicate. Signed-off-by: Wan Jiabing Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 46d9a0c26c67..10279c4830ac 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -124,8 +124,6 @@ struct ip_set_ext { bool target; }; -struct ip_set; - #define ext_timeout(e, s) \ ((unsigned long *)(((void *)(e)) + (s)->offset[IPSET_EXT_ID_TIMEOUT])) #define ext_counter(e, s) \ -- cgit v1.2.3 From 8b9229d15877ec77775633f058d14145f6eb98fa Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 28 Mar 2021 23:08:55 +0200 Subject: netfilter: flowtable: dst_check() from garbage collector path Move dst_check() to the garbage collector path. Stale routes trigger the flow entry teardown state which makes affected flows go back to the classic forwarding path to re-evaluate flow offloading. IPv6 requires the dst cookie to work, store it in the flow_tuple, otherwise dst_check() always fails. 
Fixes: e5075c0badaa ("netfilter: flowtable: call dst_check() to fall back to classic forwarding") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 5 ++++- net/netfilter/nf_flow_table_core.c | 37 ++++++++++++++++++++++++++++++++++- net/netfilter/nf_flow_table_ip.c | 22 ++++----------------- 3 files changed, 44 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 4d991c1e93ef..583b327d8fc0 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -129,7 +129,10 @@ struct flow_offload_tuple { in_vlan_ingress:2; u16 mtu; union { - struct dst_entry *dst_cache; + struct { + struct dst_entry *dst_cache; + u32 dst_cookie; + }; struct { u32 ifidx; u32 hw_ifidx; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 1bce1d2805c4..76573bae6664 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -74,6 +74,18 @@ err_ct_refcnt: } EXPORT_SYMBOL_GPL(flow_offload_alloc); +static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple) +{ + const struct rt6_info *rt; + + if (flow_tuple->l3proto == NFPROTO_IPV6) { + rt = (const struct rt6_info *)flow_tuple->dst_cache; + return rt6_get_cookie(rt); + } + + return 0; +} + static int flow_offload_fill_route(struct flow_offload *flow, const struct nf_flow_route *route, enum flow_offload_tuple_dir dir) @@ -116,6 +128,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, return -1; flow_tuple->dst_cache = dst; + flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; } flow_tuple->xmit_type = route->tuple[dir].xmit_type; @@ -390,11 +403,33 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } +static bool flow_offload_stale_dst(struct flow_offload_tuple *tuple) +{ + struct dst_entry *dst; + + if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || + 
tuple->xmit_type == FLOW_OFFLOAD_XMIT_XFRM) { + dst = tuple->dst_cache; + if (!dst_check(dst, tuple->dst_cookie)) + return true; + } + + return false; +} + +static bool nf_flow_has_stale_dst(struct flow_offload *flow) +{ + return flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple) || + flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple); +} + static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; - if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct)) + if (nf_flow_has_expired(flow) || + nf_ct_is_dying(flow->ct) || + nf_flow_has_stale_dst(flow)) set_bit(NF_FLOW_TEARDOWN, &flow->flags); if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 12cb0cc6958c..889cf88d3dba 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -364,15 +364,6 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) return NF_ACCEPT; - if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || - tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) { - rt = (struct rtable *)tuplehash->tuple.dst_cache; - if (!dst_check(&rt->dst, 0)) { - flow_offload_teardown(flow); - return NF_ACCEPT; - } - } - if (skb_try_make_writable(skb, thoff + hdrsize)) return NF_DROP; @@ -391,6 +382,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { + rt = (struct rtable *)tuplehash->tuple.dst_cache; memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; @@ -399,6 +391,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: + rt = (struct rtable 
*)tuplehash->tuple.dst_cache; outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); @@ -607,15 +600,6 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) return NF_ACCEPT; - if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH || - tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) { - rt = (struct rt6_info *)tuplehash->tuple.dst_cache; - if (!dst_check(&rt->dst, 0)) { - flow_offload_teardown(flow); - return NF_ACCEPT; - } - } - if (skb_try_make_writable(skb, thoff + hdrsize)) return NF_DROP; @@ -633,6 +617,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { + rt = (struct rt6_info *)tuplehash->tuple.dst_cache; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; IP6CB(skb)->flags = IP6SKB_FORWARDED; @@ -641,6 +626,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: + rt = (struct rt6_info *)tuplehash->tuple.dst_cache; outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); -- cgit v1.2.3 From 19c28b1374fb1073a9ec873a6c10bf5f16b10b9d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 30 Mar 2021 16:58:37 +0200 Subject: netfilter: add helper function to set up the nfnetlink header and use it This patch adds a helper function to set up the netlink and nfnetlink headers. Update existing codebase to use it. 
Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 27 ++++++++++ net/netfilter/ipset/ip_set_core.c | 17 ++---- net/netfilter/nf_conntrack_netlink.c | 77 ++++++++------------------ net/netfilter/nf_tables_api.c | 102 ++++++++++------------------------- net/netfilter/nf_tables_trace.c | 9 +--- net/netfilter/nfnetlink_acct.c | 11 ++-- net/netfilter/nfnetlink_cthelper.c | 11 ++-- net/netfilter/nfnetlink_cttimeout.c | 22 +++----- net/netfilter/nfnetlink_log.c | 11 ++-- net/netfilter/nfnetlink_queue.c | 12 ++--- net/netfilter/nft_compat.c | 11 ++-- 11 files changed, 102 insertions(+), 208 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index f6267e2883f2..791d516e1e88 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -57,6 +57,33 @@ static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) return subsys << 8 | msg_type; } +static inline void nfnl_fill_hdr(struct nlmsghdr *nlh, u8 family, u8 version, + __be16 res_id) +{ + struct nfgenmsg *nfmsg; + + nfmsg = nlmsg_data(nlh); + nfmsg->nfgen_family = family; + nfmsg->version = version; + nfmsg->res_id = res_id; +} + +static inline struct nlmsghdr *nfnl_msg_put(struct sk_buff *skb, u32 portid, + u32 seq, int type, int flags, + u8 family, u8 version, + __be16 res_id) +{ + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); + if (!nlh) + return NULL; + + nfnl_fill_hdr(nlh, family, version, res_id); + + return nlh; +} + void nfnl_lock(__u8 subsys_id); void nfnl_unlock(__u8 subsys_id); #ifdef CONFIG_PROVE_LOCKING diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 89009c82a6b2..359ff8ec236a 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -963,20 +963,9 @@ static struct nlmsghdr * start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, enum ipset_cmd 
cmd) { - struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; - - nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), - sizeof(*nfmsg), flags); - if (!nlh) - return NULL; - - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = NFPROTO_IPV4; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - - return nlh; + return nfnl_msg_put(skb, portid, seq, + nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), flags, + NFPROTO_IPV4, NFNETLINK_V0, 0); } /* Create a set */ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1d519b0e51a5..c67a6ec22a74 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -555,22 +555,17 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct nlattr *nest_parms; unsigned int event; if (portid) flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, nf_ct_l3num(ct), + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = nf_ct_l3num(ct); - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); @@ -713,7 +708,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) const struct nf_conntrack_zone *zone; struct net *net; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct nlattr *nest_parms; struct nf_conn *ct = item->ct; struct sk_buff *skb; @@ -743,15 +737,11 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type); - nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, 
nf_ct_l3num(ct), + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = nf_ct_l3num(ct); - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - zone = nf_ct_zone(ct); nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG); @@ -2490,20 +2480,15 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, __u16 cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS_CPU); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, htons(cpu)); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_FOUND, htonl(st->found)) || nla_put_be32(skb, CTA_STATS_INVALID, htonl(st->invalid)) || nla_put_be32(skb, CTA_STATS_INSERT, htonl(st->insert)) || @@ -2575,20 +2560,15 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct net *net) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0, event; unsigned int nr_conntracks = atomic_read(&net->ct.count); event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; @@ -3085,19 +3065,14 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct nf_conntrack_expect *exp) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, + exp->tuple.src.l3num, NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = exp->tuple.src.l3num; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; @@ -3117,7 +3092,6 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) struct nf_conntrack_expect *exp = item->exp; struct net *net = nf_ct_exp_net(exp); struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct sk_buff *skb; unsigned int type, group; int flags = 0; @@ -3140,15 +3114,11 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) goto errout; type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type); - nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, item->portid, 0, type, flags, + exp->tuple.src.l3num, NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = 
exp->tuple.src.l3num; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (ctnetlink_exp_dump_expect(skb, exp) < 0) goto nla_put_failure; @@ -3716,20 +3686,15 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, const struct ip_conntrack_stat *st) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_EXP_GET_STATS_CPU); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, htons(cpu)); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(cpu); - if (nla_put_be32(skb, CTA_STATS_EXP_NEW, htonl(st->expect_new)) || nla_put_be32(skb, CTA_STATS_EXP_CREATE, htonl(st->expect_create)) || nla_put_be32(skb, CTA_STATS_EXP_DELETE, htonl(st->expect_delete))) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e894d70b5d5f..005f1c620fc0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -707,18 +707,13 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, int family, const struct nft_table *table) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = nft_base_seq(net); - if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) || nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) || nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) || @@ -1468,18 +1463,13 @@ static int 
nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, const struct nft_chain *chain) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = nft_base_seq(net); - if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), @@ -2825,20 +2815,15 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, const struct nft_rule *prule) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; const struct nft_expr *expr, *next; struct nlattr *list; u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, + nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = nft_base_seq(net); - if (nla_put_string(skb, NFTA_RULE_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) @@ -3809,7 +3794,6 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; u32 portid = ctx->portid; struct nlattr *nest; @@ -3817,16 +3801,11 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, int i; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct 
nfgenmsg), - flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, + NFNETLINK_V0, nft_base_seq(ctx->net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = nft_base_seq(ctx->net); - if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) @@ -4795,7 +4774,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) struct nft_set *set; struct nft_set_dump_args args; bool set_found = false; - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; u32 portid, seq; @@ -4828,16 +4806,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - NLM_F_MULTI); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, + table->family, NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = table->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = nft_base_seq(net); - if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name)) @@ -4894,22 +4867,16 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, const struct nft_set *set, const struct nft_set_elem *elem) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct nlattr *nest; int err; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), - flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, + NFNETLINK_V0, nft_base_seq(ctx->net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = ctx->family; - 
nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = nft_base_seq(ctx->net); - if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name)) goto nla_put_failure; if (nla_put_string(skb, NFTA_SET_NAME, set->name)) @@ -6227,19 +6194,14 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, int family, const struct nft_table *table, struct nft_object *obj, bool reset) { - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = nft_base_seq(net); - if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) || nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) || nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) || @@ -7139,20 +7101,15 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, struct list_head *hook_list) { struct nlattr *nest, *nest_devs; - struct nfgenmsg *nfmsg; struct nft_hook *hook; struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = nft_base_seq(net); - if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) || nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) || nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) || @@ -7385,19 +7342,14 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { struct nlmsghdr 
*nlh; - struct nfgenmsg *nfmsg; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = nft_base_seq(net); - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 87b36da5cd98..0cf3278007ba 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -183,7 +183,6 @@ static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) void nft_trace_notify(struct nft_traceinfo *info) { const struct nft_pktinfo *pkt = info->pkt; - struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; @@ -219,15 +218,11 @@ void nft_trace_notify(struct nft_traceinfo *info) return; event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE); - nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(skb, 0, 0, event, 0, info->basechain->type->family, + NFNETLINK_V0, 0); if (!nlh) goto nla_put_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = info->basechain->type->family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(nft_pf(pkt)))) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 0fa1653b5f19..bb930f3b06c7 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -145,21 +145,16 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, 
struct nf_acct *acct) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; u64 pkts, bytes; u32 old_flags; event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFACCT_NAME, acct->name)) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 0f94fce1d3ed..22f6f7fcc724 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -526,20 +526,15 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct nf_conntrack_helper *helper) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; int status; event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFCTH_NAME, helper->name)) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 89a381f7f945..de831a257512 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -160,22 +160,17 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int event, struct ctnl_timeout *timeout) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; const struct nf_conntrack_l4proto *l4proto = timeout->timeout.l4proto; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, CTA_TIMEOUT_NAME, timeout->name) || nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->timeout.l3num)) || @@ -382,21 +377,16 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, const unsigned int *timeouts) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0; struct nlattr *nest_parms; int ret; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = AF_UNSPEC; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, l4proto->l4proto)) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 26776b88a539..d5f458d0ff3d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -456,20 +456,15 @@ __build_packet_message(struct nfnl_log_net *log, { struct nfulnl_msg_packet_hdr pmsg; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; sk_buff_data_t old_tail = inst->skb->tail; struct sock *sk; const unsigned char *hwhdrp; - nlh = nlmsg_put(inst->skb, 0, 0, - nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), - sizeof(struct nfgenmsg), 0); + nlh = 
nfnl_msg_put(inst->skb, 0, 0, + nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), + 0, pf, NFNETLINK_V0, htons(inst->group_num)); if (!nlh) return -1; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(inst->group_num); memset(&pmsg, 0, sizeof(pmsg)); pmsg.hw_protocol = skb->protocol; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 48a07914fd94..37e81d895e61 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -383,7 +383,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; @@ -471,18 +470,15 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, goto nlmsg_failure; } - nlh = nlmsg_put(skb, 0, 0, - nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), - sizeof(struct nfgenmsg), 0); + nlh = nfnl_msg_put(skb, 0, 0, + nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), + 0, entry->state.pf, NFNETLINK_V0, + htons(queue->queue_num)); if (!nlh) { skb_tx_error(entskb); kfree_skb(skb); goto nlmsg_failure; } - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = entry->state.pf; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = htons(queue->queue_num); nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); pmsg = nla_data(nla); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 8e56f353ff35..b8dbd20a6a4c 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -591,19 +591,14 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, int rev, int target) { struct nlmsghdr *nlh; - struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event); - nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); - if (nlh == NULL) + nlh = nfnl_msg_put(skb, portid, seq, event, flags, family, + NFNETLINK_V0, 0); + if (!nlh) goto nlmsg_failure; - nfmsg = nlmsg_data(nlh); - nfmsg->nfgen_family = family; - nfmsg->version = NFNETLINK_V0; - nfmsg->res_id = 0; - if (nla_put_string(skb, NFTA_COMPAT_NAME, name) || nla_put_be32(skb, NFTA_COMPAT_REV, htonl(rev)) || nla_put_be32(skb, NFTA_COMPAT_TYPE, htonl(target))) -- cgit v1.2.3 From 1e5d1f69d9fb8ea0679f9e85915e8e7fdacfbe7a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Mar 2021 20:59:52 -0700 Subject: ethtool: support FEC settings over netlink Add FEC API to netlink. This is not a 1-to-1 conversion. FEC settings already depend on link modes to tell user which modes are supported. Take this further and use link modes for manual configuration. Old struct ethtool_fecparam is still used to talk to the drivers, so we need to translate back and forth. We can revisit the internal API if number of FEC encodings starts to grow. Enforce only one active FEC bit (by using a bit position rather than another mask). Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- Documentation/networking/ethtool-netlink.rst | 62 ++++++- include/uapi/linux/ethtool_netlink.h | 17 ++ net/ethtool/Makefile | 2 +- net/ethtool/fec.c | 238 +++++++++++++++++++++++++++ net/ethtool/netlink.c | 19 +++ net/ethtool/netlink.h | 4 + 6 files changed, 339 insertions(+), 3 deletions(-) create mode 100644 net/ethtool/fec.c (limited to 'include') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 05073482db05..4bdb4298f178 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -208,6 +208,8 @@ Userspace to kernel: ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT`` action start raw TDR cable test ``ETHTOOL_MSG_TUNNEL_INFO_GET`` get tunnel offload info + ``ETHTOOL_MSG_FEC_GET`` get FEC settings + ``ETHTOOL_MSG_FEC_SET`` set FEC settings ===================================== ================================ Kernel to userspace: @@ -242,6 +244,8 @@ Kernel to userspace: ``ETHTOOL_MSG_CABLE_TEST_NTF`` Cable test results ``ETHTOOL_MSG_CABLE_TEST_TDR_NTF`` Cable test TDR results ``ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY`` tunnel offload info + ``ETHTOOL_MSG_FEC_GET_REPLY`` FEC settings + ``ETHTOOL_MSG_FEC_NTF`` FEC settings ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1280,6 +1284,60 @@ Kernel response contents: For UDP tunnel table empty ``ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES`` indicates that the table contains static entries, hard-coded by the NIC. +FEC_GET +======= + +Gets FEC configuration and state like ``ETHTOOL_GFECPARAM`` ioctl request. 
+ +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_FEC_HEADER`` nested request header + ===================================== ====== ========================== + +Kernel response contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_FEC_HEADER`` nested request header + ``ETHTOOL_A_FEC_MODES`` bitset configured modes + ``ETHTOOL_A_FEC_AUTO`` bool FEC mode auto selection + ``ETHTOOL_A_FEC_ACTIVE`` u32 index of active FEC mode + ===================================== ====== ========================== + +``ETHTOOL_A_FEC_ACTIVE`` is the bit index of the FEC link mode currently +active on the interface. This attribute may not be present if device does +not support FEC. + +``ETHTOOL_A_FEC_MODES`` and ``ETHTOOL_A_FEC_AUTO`` are only meaningful when +autonegotiation is disabled. If ``ETHTOOL_A_FEC_AUTO`` is non-zero driver will +select the FEC mode automatically based on the parameters of the SFP module. +This is equivalent to the ``ETHTOOL_FEC_AUTO`` bit of the ioctl interface. +``ETHTOOL_A_FEC_MODES`` carries the current FEC configuration using link mode +bits (rather than old ``ETHTOOL_FEC_*`` bits). + +FEC_SET +======= + +Sets FEC parameters like ``ETHTOOL_SFECPARAM`` ioctl request. + +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_FEC_HEADER`` nested request header + ``ETHTOOL_A_FEC_MODES`` bitset configured modes + ``ETHTOOL_A_FEC_AUTO`` bool FEC mode auto selection + ===================================== ====== ========================== + +``FEC_SET`` is only meaningful when autonegotiation is disabled. Otherwise +FEC mode is selected as part of autonegotiation. + +``ETHTOOL_A_FEC_MODES`` selects which FEC mode should be used. It's recommended +to set only one bit, if multiple bits are set driver may choose between them +in an implementation specific way. 
+ +``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP +module parameters. This does not mean autonegotiation. + Request translation =================== @@ -1373,8 +1431,8 @@ are netlink only. ``ETHTOOL_MSG_LINKMODES_SET`` ``ETHTOOL_PHY_GTUNABLE`` n/a ``ETHTOOL_PHY_STUNABLE`` n/a - ``ETHTOOL_GFECPARAM`` n/a - ``ETHTOOL_SFECPARAM`` n/a + ``ETHTOOL_GFECPARAM`` ``ETHTOOL_MSG_FEC_GET`` + ``ETHTOOL_SFECPARAM`` ``ETHTOOL_MSG_FEC_SET`` n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' n/a ''ETHTOOL_MSG_CABLE_TEST_TDR_ACT'' n/a ``ETHTOOL_MSG_TUNNEL_INFO_GET`` diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index a286635ac9b8..7f1bdb5b31ba 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -42,6 +42,8 @@ enum { ETHTOOL_MSG_CABLE_TEST_ACT, ETHTOOL_MSG_CABLE_TEST_TDR_ACT, ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_MSG_FEC_GET, + ETHTOOL_MSG_FEC_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -80,6 +82,8 @@ enum { ETHTOOL_MSG_CABLE_TEST_NTF, ETHTOOL_MSG_CABLE_TEST_TDR_NTF, ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, + ETHTOOL_MSG_FEC_GET_REPLY, + ETHTOOL_MSG_FEC_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -629,6 +633,19 @@ enum { ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; +/* FEC */ + +enum { + ETHTOOL_A_FEC_UNSPEC, + ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_FEC_MODES, /* bitset */ + ETHTOOL_A_FEC_AUTO, /* u8 */ + ETHTOOL_A_FEC_ACTIVE, /* u32 */ + + __ETHTOOL_A_FEC_CNT, + ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 7a849ff22dad..c2dc9033a8f7 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o 
wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o + tunnels.o fec.o diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c new file mode 100644 index 000000000000..31454b9188bd --- /dev/null +++ b/net/ethtool/fec.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct fec_req_info { + struct ethnl_req_info base; +}; + +struct fec_reply_data { + struct ethnl_reply_data base; + __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes); + u32 active_fec; + u8 fec_auto; +}; + +#define FEC_REPDATA(__reply_base) \ + container_of(__reply_base, struct fec_reply_data, base) + +#define ETHTOOL_FEC_MASK ((ETHTOOL_FEC_LLRS << 1) - 1) + +const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = { + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static void +ethtool_fec_to_link_modes(u32 fec, unsigned long *link_modes, u8 *fec_auto) +{ + if (fec_auto) + *fec_auto = !!(fec & ETHTOOL_FEC_AUTO); + + if (fec & ETHTOOL_FEC_OFF) + __set_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes); + if (fec & ETHTOOL_FEC_RS) + __set_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes); + if (fec & ETHTOOL_FEC_BASER) + __set_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes); + if (fec & ETHTOOL_FEC_LLRS) + __set_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes); +} + +static int +ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec, + unsigned long *link_modes, u8 fec_auto) +{ + memset(fec, 0, sizeof(*fec)); + + if (fec_auto) + fec->fec |= ETHTOOL_FEC_AUTO; + + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_OFF; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_RS; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_BASER; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes)) + 
fec->fec |= ETHTOOL_FEC_LLRS; + + if (!bitmap_empty(link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS)) + return -EINVAL; + + return 0; +} + +static int fec_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {}; + struct fec_reply_data *data = FEC_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + struct ethtool_fecparam fec = {}; + int ret; + + if (!dev->ethtool_ops->get_fecparam) + return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = dev->ethtool_ops->get_fecparam(dev, &fec); + ethnl_ops_complete(dev); + if (ret) + return ret; + + WARN_ON_ONCE(fec.reserved); + + ethtool_fec_to_link_modes(fec.fec, data->fec_link_modes, + &data->fec_auto); + + ethtool_fec_to_link_modes(fec.active_fec, active_fec_modes, NULL); + data->active_fec = find_first_bit(active_fec_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS); + /* Don't report attr if no FEC mode set. Note that + * ethtool_fec_to_link_modes() ignores NONE and AUTO. 
+ */ + if (data->active_fec == __ETHTOOL_LINK_MODE_MASK_NBITS) + data->active_fec = 0; + + return 0; +} + +static int fec_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct fec_reply_data *data = FEC_REPDATA(reply_base); + int len = 0; + int ret; + + ret = ethnl_bitset_size(data->fec_link_modes, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + + len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ + nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ + + return len; +} + +static int fec_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct fec_reply_data *data = FEC_REPDATA(reply_base); + int ret; + + ret = ethnl_put_bitset(skb, ETHTOOL_A_FEC_MODES, + data->fec_link_modes, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + + if (nla_put_u8(skb, ETHTOOL_A_FEC_AUTO, data->fec_auto) || + (data->active_fec && + nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec))) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_fec_request_ops = { + .request_cmd = ETHTOOL_MSG_FEC_GET, + .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY, + .hdr_attr = ETHTOOL_A_FEC_HEADER, + .req_info_size = sizeof(struct fec_req_info), + .reply_data_size = sizeof(struct fec_reply_data), + + .prepare_data = fec_prepare_data, + .reply_size = fec_reply_size, + .fill_reply = fec_fill_reply, +}; + +/* FEC_SET */ + +const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = { + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_FEC_MODES] = { .type = NLA_NESTED }, + [ETHTOOL_A_FEC_AUTO] = NLA_POLICY_MAX(NLA_U8, 1), +}; + +int ethnl_set_fec(struct sk_buff *skb, struct 
genl_info *info) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes) = {}; + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + struct ethtool_fecparam fec = {}; + const struct ethtool_ops *ops; + struct net_device *dev; + bool mod = false; + u8 fec_auto; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEC_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; + if (!ops->get_fecparam || !ops->set_fecparam) + goto out_dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ret = ops->get_fecparam(dev, &fec); + if (ret < 0) + goto out_ops; + + ethtool_fec_to_link_modes(fec.fec, fec_link_modes, &fec_auto); + + ret = ethnl_update_bitset(fec_link_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS, + tb[ETHTOOL_A_FEC_MODES], + link_mode_names, info->extack, &mod); + if (ret < 0) + goto out_ops; + ethnl_update_u8(&fec_auto, tb[ETHTOOL_A_FEC_AUTO], &mod); + + ret = 0; + if (!mod) + goto out_ops; + + ret = ethtool_link_modes_to_fecparam(&fec, fec_link_modes, fec_auto); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], + "invalid FEC modes requested"); + goto out_ops; + } + if (!fec.fec) { + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], + "no FEC modes set"); + goto out_ops; + } + + ret = dev->ethtool_ops->set_fecparam(dev, &fec); + if (ret < 0) + goto out_ops; + ethtool_notify(dev, ETHTOOL_MSG_FEC_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 50d3c8896f91..705a4b201564 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -244,6 +244,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_COALESCE_GET] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = &ethnl_pause_request_ops, 
[ETHTOOL_MSG_EEE_GET] = &ethnl_eee_request_ops, + [ETHTOOL_MSG_FEC_GET] = &ethnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops, }; @@ -551,6 +552,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_COALESCE_NTF] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_NTF] = &ethnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = &ethnl_eee_request_ops, + [ETHTOOL_MSG_FEC_NTF] = &ethnl_fec_request_ops, }; /* default notification handler */ @@ -643,6 +645,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -912,6 +915,22 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_tunnel_info_get_policy, .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_FEC_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_fec_get_policy, + .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_FEC_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_fec, + .policy = ethnl_fec_set_policy, + .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 6eabd58d81bf..785f7ee45930 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -344,6 +344,7 @@ extern const struct ethnl_request_ops ethnl_coalesce_request_ops; extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; +extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct 
nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -375,6 +376,8 @@ extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; +extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; +extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -392,5 +395,6 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ -- cgit v1.2.3 From 1caf8d39c58f3f63193d02928c8dce3fa07cee52 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:05 -0700 Subject: inet: shrink inet_timewait_death_row by 48 bytes struct inet_timewait_death_row uses two cache lines, because we want tw_count to use a full cache line to avoid false sharing. Rework its definition and placement in netns_ipv4 so that: 1) We add 60 bytes of padding after tw_count to avoid false sharing, knowing that tcp_death_row will have ____cacheline_aligned_in_smp attribute. 2) We do not risk padding before tcp_death_row, because we move it at the beginning of netns_ipv4, even if new fields are added later. 3) We do not waste 48 bytes of padding after it. Note that I have not changed dccp. 
pahole result for struct netns_ipv4 before/after the patch : /* size: 832, cachelines: 13, members: 139 */ /* sum members: 721, holes: 12, sum holes: 95 */ /* padding: 16 */ /* paddings: 2, sum paddings: 55 */ -> /* size: 768, cachelines: 12, members: 139 */ /* sum members: 673, holes: 11, sum holes: 39 */ /* padding: 56 */ /* paddings: 2, sum paddings: 7 */ /* forced alignments: 1 */ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9c8dd424d79b..1085ed4e0788 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -32,14 +32,18 @@ struct inet_hashinfo; struct inet_timewait_death_row { atomic_t tw_count; + char tw_pad[L1_CACHE_BYTES - sizeof(atomic_t)]; - struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; + struct inet_hashinfo *hashinfo; int sysctl_max_tw_buckets; }; struct tcp_fastopen_context; struct netns_ipv4 { + /* Please keep tcp_death_row at first field in netns_ipv4 */ + struct inet_timewait_death_row tcp_death_row ____cacheline_aligned_in_smp; + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -175,7 +179,6 @@ struct netns_ipv4 { int sysctl_tcp_comp_sack_nr; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; - struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; const struct tcp_congestion_ops __rcu *tcp_congestion_control; -- cgit v1.2.3 From 490f33c4e70431d0a4d01666a6525fdd43299cde Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:06 -0700 Subject: inet: shrink netns_ipv4 by another cache line By shuffling around some fields to remove 8 bytes of hole, we can save one cache line. 
pahole result before/after the patch : /* size: 768, cachelines: 12, members: 139 */ /* sum members: 673, holes: 11, sum holes: 39 */ /* padding: 56 */ /* paddings: 2, sum paddings: 7 */ /* forced alignments: 1 */ -> /* size: 704, cachelines: 11, members: 139 */ /* sum members: 673, holes: 10, sum holes: 31 */ /* paddings: 2, sum paddings: 7 */ /* forced alignments: 1 */ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1085ed4e0788..538ed69919dc 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -57,17 +57,17 @@ struct netns_ipv4 { struct mutex ra_mutex; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; - bool fib_has_custom_rules; - unsigned int fib_rules_require_fldissect; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; + unsigned int fib_rules_require_fldissect; + bool fib_has_custom_rules; #endif bool fib_has_custom_local_routes; + bool fib_offload_disabled; #ifdef CONFIG_IP_ROUTE_CLASSID int fib_num_tclassid_users; #endif struct hlist_head *fib_table_hash; - bool fib_offload_disabled; struct sock *fibnl; struct sock * __percpu *icmp_sk; -- cgit v1.2.3 From b2908fac5b7b23c03fa1d3e1055ad95ba305c871 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:07 -0700 Subject: ipv4: convert fib_notify_on_flag_change sysctl to u8 Reduce footprint of sysctls. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 538ed69919dc..b187ac597b8c 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -191,7 +191,7 @@ struct netns_ipv4 { int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; - int sysctl_fib_notify_on_flag_change; + u8 sysctl_fib_notify_on_flag_change; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9199f507a005..a2352d8d88cc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1364,9 +1364,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, -- cgit v1.2.3 From cd04bd022258f4aa6e8392c8133dbbf31da0f12f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:08 -0700 Subject: ipv4: convert udp_l3mdev_accept sysctl to u8 Reduce footprint of sysctls. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b187ac597b8c..d309b1b89715 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -194,7 +194,7 @@ struct netns_ipv4 { u8 sysctl_fib_notify_on_flag_change; #ifdef CONFIG_NET_L3_MASTER_DEV - int sysctl_udp_l3mdev_accept; + u8 sysctl_udp_l3mdev_accept; #endif int sysctl_igmp_max_memberships; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a2352d8d88cc..1b6ce649a433 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1065,9 +1065,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "udp_l3mdev_accept", .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -- cgit v1.2.3 From be205fe6ec4ffd6875f69e61205163fb686a5c74 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:09 -0700 Subject: ipv4: convert fib_multipath_{use_neigh|hash_policy} sysctls to u8 Make room for better packing of netns_ipv4 Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 4 ++-- net/ipv4/sysctl_net_ipv4.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d309b1b89715..eb6ca07d3b0f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -220,8 +220,8 @@ struct netns_ipv4 { #endif #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH - int sysctl_fib_multipath_use_neigh; - int sysctl_fib_multipath_hash_policy; + u8 sysctl_fib_multipath_use_neigh; + u8 sysctl_fib_multipath_hash_policy; #endif struct fib_notifier_ops *notifier_ops; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1b6ce649a433..ad75d6bb2df7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -456,7 +456,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, ipv4.sysctl_fib_multipath_hash_policy); int ret; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); @@ -1038,16 +1038,16 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "fib_multipath_use_neigh", .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, -- cgit v1.2.3 From 7d4b37ebb934aa32a54666fe9153d127c33ff89a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:10 -0700 Subject: ipv4: convert igmp_link_local_mcast_reports sysctl to u8 This sysctl is a bool, can use less storage. 
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index eb6ca07d3b0f..fafcedf64383 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -197,9 +197,9 @@ struct netns_ipv4 { u8 sysctl_udp_l3mdev_accept; #endif + u8 sysctl_igmp_llm_reports; int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; - int sysctl_igmp_llm_reports; int sysctl_igmp_qrv; struct ping_group_range ping_group_range; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ad75d6bb2df7..fd2b35065bb2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -848,9 +848,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "igmp_link_local_mcast_reports", .data = &init_net.ipv4.sysctl_igmp_llm_reports, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "igmp_max_memberships", -- cgit v1.2.3 From 1c3289c931740f235b29be5182e5f2dfb004593d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:11 -0700 Subject: tcp: convert tcp_comp_sack_nr sysctl to u8 tcp_comp_sack_nr max value was already 255. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index fafcedf64383..87e1612497ea 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -171,12 +171,12 @@ struct netns_ipv4 { u8 sysctl_tcp_min_tso_segs; u8 sysctl_tcp_autocorking; u8 sysctl_tcp_reflect_tos; + u8 sysctl_tcp_comp_sack_nr; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; - int sysctl_tcp_comp_sack_nr; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; int sysctl_max_syn_backlog; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index fd2b35065bb2..a09e466ce11d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -46,7 +46,6 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; -static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; @@ -1330,11 +1329,10 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &comp_sack_nr_max, }, { .procname = "tcp_reflect_tos", -- cgit v1.2.3 From a6175633a2af0eae07127311563d2a75096c111a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:12 -0700 Subject: ipv6: convert eligible sysctls to u8 Convert most sysctls that can fit in a byte. Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/net/netns/ipv6.h | 24 ++++++++++++------------ net/ipv6/icmp.c | 12 ++++++------ net/ipv6/sysctl_net_ipv6.c | 38 ++++++++++++++++++-------------------- 3 files changed, 36 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 21c0debbd39e..84f4a8bec397 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -20,7 +20,6 @@ struct netns_sysctl_ipv6 { struct ctl_table_header *frags_hdr; struct ctl_table_header *xfrm6_hdr; #endif - int bindv6only; int flush_delay; int ip6_rt_max_size; int ip6_rt_gc_min_interval; @@ -29,21 +28,22 @@ struct netns_sysctl_ipv6 { int ip6_rt_gc_elasticity; int ip6_rt_mtu_expires; int ip6_rt_min_advmss; - int multipath_hash_policy; - int flowlabel_consistency; - int auto_flowlabels; + u8 bindv6only; + u8 multipath_hash_policy; + u8 flowlabel_consistency; + u8 auto_flowlabels; int icmpv6_time; - int icmpv6_echo_ignore_all; - int icmpv6_echo_ignore_multicast; - int icmpv6_echo_ignore_anycast; + u8 icmpv6_echo_ignore_all; + u8 icmpv6_echo_ignore_multicast; + u8 icmpv6_echo_ignore_anycast; DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1); unsigned long *icmpv6_ratemask_ptr; - int anycast_src_echo_reply; - int ip_nonlocal_bind; - int fwmark_reflect; + u8 anycast_src_echo_reply; + u8 ip_nonlocal_bind; + u8 fwmark_reflect; + u8 flowlabel_state_ranges; int idgen_retries; int idgen_delay; - int flowlabel_state_ranges; int flowlabel_reflect; int max_dst_opts_cnt; int max_hbh_opts_cnt; @@ -51,7 +51,7 @@ struct netns_sysctl_ipv6 { int max_hbh_opts_len; int seg6_flowlabel; bool skip_notify_on_dev_down; - int fib_notify_on_flag_change; + u8 fib_notify_on_flag_change; }; struct netns_ipv6 { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 29d38d6b55fb..1bca2b09d77e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1169,23 +1169,23 @@ static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "echo_ignore_all", .data = 
&init_net.ipv6.sysctl.icmpv6_echo_ignore_all, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_multicast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_anycast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ratemask", diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 263ab43ed06b..27102c3d6e1d 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -23,7 +23,6 @@ static int two = 2; static int flowlabel_reflect_max = 0x7; -static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, @@ -34,7 +33,7 @@ static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, net = container_of(table->data, struct net, ipv6.sysctl.multipath_hash_policy); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); @@ -45,39 +44,38 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", .data = &init_net.ipv6.sysctl.bindv6only, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "anycast_src_echo_reply", .data = &init_net.ipv6.sysctl.anycast_src_echo_reply, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, 
{ .procname = "flowlabel_consistency", .data = &init_net.ipv6.sysctl.flowlabel_consistency, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "auto_flowlabels", .data = &init_net.ipv6.sysctl.auto_flowlabels, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &auto_flowlabels_min, + .proc_handler = proc_dou8vec_minmax, .extra2 = &auto_flowlabels_max }, { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "idgen_retries", @@ -96,16 +94,16 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "flowlabel_state_ranges", .data = &init_net.ipv6.sysctl.flowlabel_state_ranges, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv6.sysctl.ip_nonlocal_bind, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_reflect", @@ -147,7 +145,7 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv6.sysctl.multipath_hash_policy, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, @@ -163,9 +161,9 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, -- cgit v1.2.3 From 
0dd39d952f75a678b2ebcac8bd60f449f303c755 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:13 -0700 Subject: ipv6: move ip6_dst_ops first in netns_ipv6 ip6_dst_ops has cache line alignment. Moving it at beginning of netns_ipv6 removes a 48 byte hole, and shrinks netns_ipv6 from 12 to 11 cache lines. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 84f4a8bec397..808f0f79ea9c 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -55,6 +55,9 @@ struct netns_sysctl_ipv6 { }; struct netns_ipv6 { + /* Keep ip6_dst_ops at the beginning of netns_sysctl_ipv6 */ + struct dst_ops ip6_dst_ops; + struct netns_sysctl_ipv6 sysctl; struct ipv6_devconf *devconf_all; struct ipv6_devconf *devconf_dflt; @@ -76,7 +79,6 @@ struct netns_ipv6 { struct hlist_head *fib_table_hash; struct fib6_table *fib6_main_tbl; struct list_head fib6_walkers; - struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; unsigned int ip6_rt_gc_expire; -- cgit v1.2.3 From b01fd6e802b6d0a635176f943315670b679d8d7b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:23 -0700 Subject: skmsg: Introduce a spinlock to protect ingress_msg Currently we rely on lock_sock to protect ingress_msg, it is too big for this, we can actually just use a spinlock to protect this list like protecting other skb queues. __tcp_bpf_recvmsg() is still special because of peeking, it still has to use lock_sock.
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-3-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/skmsg.c | 3 +++ net/ipv4/tcp_bpf.c | 18 ++++++------------ 3 files changed, 55 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 6c09d94be2e9..f2d45a73b2b2 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -89,6 +89,7 @@ struct sk_psock { #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; + spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; @@ -284,7 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + spin_lock_bh(&psock->ingress_lock); list_add_tail(&msg->list, &psock->ingress_msg); + spin_unlock_bh(&psock->ingress_lock); +} + +static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + if (msg) + list_del(&msg->list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, + struct sk_msg *msg) +{ + struct sk_msg *ret; + + spin_lock_bh(&psock->ingress_lock); + if (list_is_last(&msg->list, &psock->ingress_msg)) + ret = NULL; + else + ret = list_next_entry(msg, list); + spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline bool 
sk_psock_queue_empty(const struct sk_psock *psock) @@ -292,6 +331,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock) return psock ? list_empty(&psock->ingress_msg) : true; } +static inline void kfree_sk_msg(struct sk_msg *msg) +{ + if (msg->skb) + consume_skb(msg->skb); + kfree(msg); +} + static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index bebf84ed4e30..305dddc51857 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -592,6 +592,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) INIT_WORK(&psock->work, sk_psock_backlog); INIT_LIST_HEAD(&psock->ingress_msg); + spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); @@ -638,7 +639,9 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); kfree_skb(skb); } + spin_lock_bh(&psock->ingress_lock); __sk_psock_purge_ingress_msg(psock); + spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_link_destroy(struct sk_psock *psock) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 17c322b875fd..ae980716d896 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -18,9 +18,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg_rx; int i, copied = 0; - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - + msg_rx = sk_psock_peek_msg(psock); while (copied != len) { struct scatterlist *sge; @@ -68,22 +66,18 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } while (i != msg_rx->sg.end); if (unlikely(peek)) { - if (msg_rx == list_last_entry(&psock->ingress_msg, - struct sk_msg, list)) + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) break; - msg_rx = list_next_entry(msg_rx, list); continue; } msg_rx->sg.start = i; if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - 
list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); } - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); + msg_rx = sk_psock_peek_msg(psock); } return copied; -- cgit v1.2.3 From 0739cd28f2645e814586c7536ba5da9825cb8029 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:24 -0700 Subject: net: Introduce skb_send_sock() for sock_map We only have skb_send_sock_locked() which requires callers to use lock_sock(). Introduce a variant skb_send_sock() which locks on its own, callers do not need to lock it any more. This will save us from adding a ->sendmsg_locked for each protocol. To reuse the code, pass function pointers to __skb_send_sock() and build skb_send_sock() and skb_send_sock_locked() on top. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210331023237.41094-4-xiyou.wangcong@gmail.com --- include/linux/skbuff.h | 1 + net/core/skbuff.c | 55 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c8def85fcc22..dbf820a50a39 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3626,6 +3626,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, unsigned int flags); int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len); +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e8320b5d651a..3ad9e8425ab2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2500,9 +2500,32 @@ int 
skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, } EXPORT_SYMBOL_GPL(skb_splice_bits); -/* Send skb data on a socket. Socket must be locked. */ -int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, - int len) +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendmsg(sock, msg, vec, num, size); +} + +static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendpage(sock, page, offset, size, flags); +} + +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size); +typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, + size_t size, int flags); +static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, + int len, sendmsg_func sendmsg, sendpage_func sendpage) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -2522,7 +2545,8 @@ do_frag_list: memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; - ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); + ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, + sendmsg_unlocked, sk, &msg, &kv, 1, slen); if (ret <= 0) goto error; @@ -2553,9 +2577,11 @@ do_frag_list: slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { - ret = kernel_sendpage_locked(sk, skb_frag_page(frag), - skb_frag_off(frag) + offset, - slen, MSG_DONTWAIT); + ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, + sendpage_unlocked, sk, + skb_frag_page(frag), + skb_frag_off(frag) + offset, + slen, MSG_DONTWAIT); if (ret <= 0) goto error; @@ -2587,8 +2613,23 @@ out: error: return orig_len == len ? ret : orig_len - len; } + +/* Send skb data on a socket. Socket must be locked. 
*/ +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, + int len) +{ + return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, + kernel_sendpage_locked); +} EXPORT_SYMBOL_GPL(skb_send_sock_locked); +/* Send skb data on a socket. Socket must be unlocked. */ +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, + sendpage_unlocked); +} + /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer -- cgit v1.2.3 From 799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:25 -0700 Subject: skmsg: Avoid lock_sock() in sk_psock_backlog() We do not have to lock the sock to avoid losing sk_socket, instead we can purge all the ingress queues when we close the socket. Sending or receiving packets after orphaning socket makes no sense. We do purge these queues when psock refcnt reaches zero but here we want to purge them explicitly in sock_map_close(). There are also some nasty race conditions on testing bit SK_PSOCK_TX_ENABLED and queuing/canceling the psock work, we can expand psock->ingress_lock a bit to protect them too. As noticed by John, we still have to lock the psock->work, because the same work item could be running concurrently on different CPUs.
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-5-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ net/core/skmsg.c | 50 ++++++++++++++++++++++++++++++++++---------------- net/core/sock_map.c | 1 + 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index f2d45a73b2b2..7382c4b518d7 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,6 +99,7 @@ struct sk_psock { void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); struct proto *sk_proto; + struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; union { @@ -347,6 +348,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) } struct sk_psock *sk_psock_init(struct sock *sk, int node); +void sk_psock_stop(struct sk_psock *psock, bool wait); #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 305dddc51857..9c25020086a9 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -497,7 +497,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; - return skb_send_sock_locked(psock->sk, skb, off, len); + return skb_send_sock(psock->sk, skb, off, len); } return sk_psock_skb_ingress(psock, skb); } @@ -511,8 +511,7 @@ static void sk_psock_backlog(struct work_struct *work) u32 len, off; int ret; - /* Lock sock to avoid losing sk_socket during loop. 
*/ - lock_sock(psock->sk); + mutex_lock(&psock->work_mutex); if (state->skb) { skb = state->skb; len = state->len; @@ -529,7 +528,7 @@ start: skb_bpf_redirect_clear(skb); do { ret = -EIO; - if (likely(psock->sk->sk_socket)) + if (!sock_flag(psock->sk, SOCK_DEAD)) ret = sk_psock_handle_skb(psock, skb, off, len, ingress); if (ret <= 0) { @@ -553,7 +552,7 @@ start: kfree_skb(skb); } end: - release_sock(psock->sk); + mutex_unlock(&psock->work_mutex); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -591,6 +590,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) spin_lock_init(&psock->link_lock); INIT_WORK(&psock->work, sk_psock_backlog); + mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); @@ -631,7 +631,7 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) } } -static void sk_psock_zap_ingress(struct sk_psock *psock) +static void __sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; @@ -639,9 +639,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); kfree_skb(skb); } - spin_lock_bh(&psock->ingress_lock); __sk_psock_purge_ingress_msg(psock); - spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_link_destroy(struct sk_psock *psock) @@ -654,6 +652,18 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +void sk_psock_stop(struct sk_psock *psock, bool wait) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + sk_psock_cork_free(psock); + __sk_psock_zap_ingress(psock); + spin_unlock_bh(&psock->ingress_lock); + + if (wait) + cancel_work_sync(&psock->work); +} + static void sk_psock_done_strp(struct sk_psock *psock); static void sk_psock_destroy_deferred(struct work_struct *gc) @@ -665,12 +675,12 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) sk_psock_done_strp(psock); cancel_work_sync(&psock->work); + 
mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); sk_psock_link_destroy(psock); sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); if (psock->sk_redir) sock_put(psock->sk_redir); @@ -688,8 +698,7 @@ static void sk_psock_destroy(struct rcu_head *rcu) void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); + sk_psock_stop(psock, false); write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); @@ -699,7 +708,6 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); call_rcu(&psock->rcu, sk_psock_destroy); } @@ -770,14 +778,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * error that caused the pipe to break. We can't send a packet on * a socket that is in this state so we drop the skb. */ - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + kfree_skb(skb); + return; + } + spin_lock_bh(&psock_other->ingress_lock); + if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + spin_unlock_bh(&psock_other->ingress_lock); kfree_skb(skb); return; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); + spin_unlock_bh(&psock_other->ingress_lock); } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) @@ -845,8 +859,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, err = sk_psock_skb_ingress_self(psock, skb); } if (err < 0) { - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + } + 
spin_unlock_bh(&psock->ingress_lock); } break; case __SK_REDIRECT: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index dd53a7771d7e..e564fdeaada1 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1540,6 +1540,7 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); + sk_psock_stop(psock, true); release_sock(sk); saved_close(sk, timeout); } -- cgit v1.2.3 From 7786dfc41a74e0567557b5c4a28fc8482f5f5691 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:26 -0700 Subject: skmsg: Use rcu work for destroying psock The RCU callback sk_psock_destroy() only queues work psock->gc, so we can just switch to rcu work to simplify the code. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-6-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 5 +---- net/core/skmsg.c | 17 +++++------------ 2 files changed, 6 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 7382c4b518d7..e7aba150539d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -102,10 +102,7 @@ struct sk_psock { struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; - union { - struct rcu_head rcu; - struct work_struct gc; - }; + struct rcu_work rwork; }; int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9c25020086a9..d43d43905d2c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -666,10 +666,10 @@ void sk_psock_stop(struct sk_psock *psock, bool wait) static void sk_psock_done_strp(struct sk_psock *psock); -static void sk_psock_destroy_deferred(struct work_struct *gc) +static void sk_psock_destroy(struct work_struct *work) { - struct sk_psock *psock = container_of(gc, struct sk_psock, gc); - + struct sk_psock *psock = 
container_of(to_rcu_work(work), + struct sk_psock, rwork); /* No sk_callback_lock since already detached. */ sk_psock_done_strp(psock); @@ -688,14 +688,6 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) kfree(psock); } -static void sk_psock_destroy(struct rcu_head *rcu) -{ - struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); - - INIT_WORK(&psock->gc, sk_psock_destroy_deferred); - schedule_work(&psock->gc); -} - void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { sk_psock_stop(psock, false); @@ -709,7 +701,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - call_rcu(&psock->rcu, sk_psock_destroy); + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); + queue_rcu_work(system_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); -- cgit v1.2.3 From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:30 -0700 Subject: sock_map: Introduce BPF_SK_SKB_VERDICT Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is confusing and more importantly we still want to distinguish them from user-space. So we can just reuse the stream verdict code but introduce a new type of eBPF program, skb_verdict. Users are not allowed to attach stream_verdict and skb_verdict programs to the same map. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 1 + net/core/skmsg.c | 4 +++- net/core/sock_map.c | 28 ++++++++++++++++++++++++++++ tools/bpf/bpftool/common.c | 1 + tools/bpf/bpftool/prog.c | 1 + tools/include/uapi/linux/bpf.h | 1 + 8 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e7aba150539d..c83dbc2d81d9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,7 @@ struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; + struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { @@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); + psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 598716742593..49371eba98ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9603de81811a..6428634da57e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2948,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 
656eceab73bc..a045812d7c78 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.stream_verdict) + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); @@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, } skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 42d797291d34..c2a0411e08a8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -156,6 +156,8 @@ static void sock_map_del_link(struct sock *sk, strp_stop = true; if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; + if (psock->saved_data_ready && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } @@ -232,6 +234,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; + struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; @@ -268,6 +271,15 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { @@ -278,6 +290,9 @@ no_progs: if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser 
&& READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; @@ -309,6 +324,9 @@ no_progs: } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; @@ -317,6 +335,9 @@ out_unlock_drop: out_drop: sk_psock_put(sk, psock); out_progs: + if (skb_verdict) + bpf_prog_put(skb_verdict); +out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: @@ -1442,8 +1463,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #endif case BPF_SK_SKB_STREAM_VERDICT: + if (progs->skb_verdict) + return -EBUSY; pprog = &progs->stream_verdict; break; + case BPF_SK_SKB_VERDICT: + if (progs->stream_verdict) + return -EBUSY; + pprog = &progs->skb_verdict; + break; default: return -EOPNOTSUPP; } diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 65303664417e..1828bba19020 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", [BPF_LIRC_MODE2] = "lirc_mode2", [BPF_FLOW_DISSECTOR] = "flow_dissector", diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f2b915b20546..3f067d2d7584 100644 --- a/tools/bpf/bpftool/prog.c +++ 
b/tools/bpf/bpftool/prog.c @@ -76,6 +76,7 @@ enum dump_mode { static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_SKB_VERDICT] = "skb_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", [BPF_FLOW_DISSECTOR] = "flow_dissector", [__MAX_BPF_ATTACH_TYPE] = NULL, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ab9f2233607c..69902603012c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 8a59f9d1e3d4340659fdfee8879dc09a6f2546e1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:31 -0700 Subject: sock: Introduce sk->sk_prot->psock_update_sk_prot() Currently sockmap calls into each protocol to update the struct proto and replace it. This certainly won't work when the protocol is implemented as a module, for example, AF_UNIX. Introduce a new ops sk->sk_prot->psock_update_sk_prot(), so each protocol can implement its own way to replace the struct proto. This also helps get rid of symbol dependencies on CONFIG_INET. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-11-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 18 +++--------------- include/net/sock.h | 3 +++ include/net/tcp.h | 1 + include/net/udp.h | 1 + net/core/skmsg.c | 5 ----- net/core/sock_map.c | 24 ++++-------------------- net/ipv4/tcp_bpf.c | 24 +++++++++++++++++++++--- net/ipv4/tcp_ipv4.c | 3 +++ net/ipv4/udp.c | 3 +++ net/ipv4/udp_bpf.c | 15 +++++++++++++-- net/ipv6/tcp_ipv6.c | 3 +++ net/ipv6/udp.c | 3 +++ 12 files changed, 58 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c83dbc2d81d9..5e800ddc2dc6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,6 +99,7 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + int (*psock_update_sk_prot)(struct sock *sk, bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; @@ -395,25 +396,12 @@ static inline void sk_psock_cork_free(struct sk_psock *psock) } } -static inline void sk_psock_update_proto(struct sock *sk, - struct sk_psock *psock, - struct proto *ops) -{ - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, ops); -} - static inline void sk_psock_restore_proto(struct sock *sk, struct sk_psock *psock) { sk->sk_prot->unhash = psock->saved_unhash; - if (inet_csk_has_ulp(sk)) { - tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); - } else { - sk->sk_write_space = psock->saved_write_space; - /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, psock->sk_proto); - } + if (psock->psock_update_sk_prot) + psock->psock_update_sk_prot(sk, true); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/sock.h b/include/net/sock.h index 0b6266fd6bf6..8b4155e756c2 100644 --- 
a/include/net/sock.h +++ b/include/net/sock.h @@ -1184,6 +1184,9 @@ struct proto { void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); +#ifdef CONFIG_BPF_SYSCALL + int (*psock_update_sk_prot)(struct sock *sk, bool restore); +#endif /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS diff --git a/include/net/tcp.h b/include/net/tcp.h index 075de26f449d..2efa4e5ea23d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2203,6 +2203,7 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int tcp_bpf_update_proto(struct sock *sk, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/net/udp.h b/include/net/udp.h index d4d064c59232..df7cc1edc200 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -518,6 +518,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); +int udp_bpf_update_proto(struct sock *sk, bool restore); #endif #endif /* _UDP_H */ diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a045812d7c78..9fc83f7cc1a0 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -562,11 +562,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); - if (inet_csk_has_ulp(sk)) { - psock = ERR_PTR(-EINVAL); - goto out; - } - if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index c2a0411e08a8..2915c7c8778b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -185,26 +185,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw) static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { - struct proto *prot; - - switch (sk->sk_type) { - case SOCK_STREAM: - prot = 
tcp_bpf_get_proto(sk, psock); - break; - - case SOCK_DGRAM: - prot = udp_bpf_get_proto(sk, psock); - break; - - default: + if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; - } - - if (IS_ERR(prot)) - return PTR_ERR(prot); - - sk_psock_update_proto(sk, psock, prot); - return 0; + psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; + return sk->sk_prot->psock_update_sk_prot(sk, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) @@ -556,7 +540,7 @@ static bool sock_map_redirect_allowed(const struct sock *sk) static bool sock_map_sk_is_suitable(const struct sock *sk) { - return sk_is_tcp(sk) || sk_is_udp(sk); + return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ae980716d896..ac8cfbaeacd2 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -595,20 +595,38 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int tcp_bpf_update_proto(struct sock *sk, bool restore) { + struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? 
TCP_BPF_TX : TCP_BPF_BASE; + if (restore) { + if (inet_csk_has_ulp(sk)) { + tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); + } else { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + } + return 0; + } + + if (inet_csk_has_ulp(sk)) + return -EINVAL; + if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) - return ERR_PTR(-EINVAL); + return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } - return &tcp_bpf_prots[family][config]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + return 0; } +EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index daad4f99db32..dfc6d1c0e710 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2806,6 +2806,9 @@ struct proto tcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4a0478b17243..38952aaee3a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2849,6 +2849,9 @@ struct proto udp_prot = { .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 7a94791efc1a..6001f93cd3a0 100644 --- a/net/ipv4/udp_bpf.c +++ 
b/net/ipv4/udp_bpf.c @@ -41,12 +41,23 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock) +int udp_bpf_update_proto(struct sock *sk, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; + struct sk_psock *psock = sk_psock(sk); + + if (restore) { + sk->sk_write_space = psock->saved_write_space; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - return &udp_bpf_prots[family]; + /* Pairs with lockless read in sk_clone_lock() */ + WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + return 0; } +EXPORT_SYMBOL_GPL(udp_bpf_update_proto); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d0f007741e8e..bff22d6ef516 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2139,6 +2139,9 @@ struct proto tcpv6_prot = { .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = tcp_bpf_update_proto, +#endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d25e5a9252fd..ef2c75bb4771 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1713,6 +1713,9 @@ struct proto udpv6_prot = { .unhash = udp_lib_unhash, .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = udp_bpf_update_proto, +#endif .memory_allocated = &udp_memory_allocated, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), -- cgit v1.2.3 From d7f571188ecf25c244789b883c878ec7c64b5b08 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:32 -0700 Subject: udp: Implement ->read_sock() for 
sockmap This is similar to tcp_read_sock(), except we do not need to worry about connections, we just need to retrieve skb from UDP receive queue. Note, the return value of ->read_sock() is unused in sk_psock_verdict_data_ready(), and UDP still does not support splice() due to lack of ->splice_read(), so users can not reach udp_read_sock() directly. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-12-xiyou.wangcong@gmail.com --- include/net/udp.h | 2 ++ net/ipv4/af_inet.c | 1 + net/ipv4/udp.c | 29 +++++++++++++++++++++++++++++ net/ipv6/af_inet6.c | 1 + 4 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/net/udp.h b/include/net/udp.h index df7cc1edc200..347b62a753c3 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -329,6 +329,8 @@ struct sock *__udp6_lib_lookup(struct net *net, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 1355e6c0d567..f17870ee558b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1070,6 +1070,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, + .read_sock = udp_read_sock, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 38952aaee3a1..4d02f6839e38 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1782,6 +1782,35 @@ busy_check: } EXPORT_SYMBOL(__skb_recv_udp); +int udp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + int copied = 0; + + while (1) { + struct sk_buff *skb; + 
int err, used; + + skb = skb_recv_udp(sk, 0, 1, &err); + if (!skb) + return err; + used = recv_actor(desc, skb, 0, skb->len); + if (used <= 0) { + if (!copied) + copied = used; + break; + } else if (used <= skb->len) { + copied += used; + } + + if (!desc->count) + break; + } + + return copied; +} +EXPORT_SYMBOL(udp_read_sock); + /* * This should be easy, if there is something there we * return it, otherwise we block. diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 802f5111805a..71de739b4a9e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -714,6 +714,7 @@ const struct proto_ops inet6_dgram_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ + .read_sock = udp_read_sock, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, -- cgit v1.2.3 From 2bc793e3272a13e337416c057cb81c5396ad91d1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:33 -0700 Subject: skmsg: Extract __tcp_bpf_recvmsg() and tcp_bpf_wait_data() Although these two functions are only used by TCP, they are not specific to TCP at all, both operate on skmsg and ingress_msg, so fit in net/core/skmsg.c very well. And we will need them for non-TCP, so rename and move them to skmsg.c and export them to modules. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-13-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 4 ++ include/net/tcp.h | 2 - net/core/skmsg.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_bpf.c | 100 +------------------------------------------------- net/tls/tls_sw.c | 4 +- 5 files changed, 106 insertions(+), 102 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 5e800ddc2dc6..f78e90a04a69 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -125,6 +125,10 @@ int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err); +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags); static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 2efa4e5ea23d..31b1696c62ba 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2209,8 +2209,6 @@ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9fc83f7cc1a0..92a83c02562a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,6 +399,104 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + 
+ if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_wait_data); + +/* Receive sk_msg from psock->ingress_msg to @msg. */ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; + struct sk_msg *msg_rx; + int i, copied = 0; + + msg_rx = sk_psock_peek_msg(psock); + while (copied != len) { + struct scatterlist *sge; + + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) + return copied ? copied : -EFAULT; + + copied += copy; + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + if (!msg_rx->skb) + sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + /* Lets not optimize peek case if copy_page_to_iter + * didn't copy the entire length lets just break. 
+ */ + if (copy != sge->length) + return copied; + sk_msg_iter_var_next(i); + } + + if (copied == len) + break; + } while (i != msg_rx->sg.end); + + if (unlikely(peek)) { + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) + break; + continue; + } + + msg_rx->sg.start = i; + if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); + } + msg_rx = sk_psock_peek_msg(psock); + } + + return copied; +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg); + static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ac8cfbaeacd2..3d622a0d0753 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -10,80 +10,6 @@ #include #include -int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len, int flags) -{ - struct iov_iter *iter = &msg->msg_iter; - int peek = flags & MSG_PEEK; - struct sk_msg *msg_rx; - int i, copied = 0; - - msg_rx = sk_psock_peek_msg(psock); - while (copied != len) { - struct scatterlist *sge; - - if (unlikely(!msg_rx)) - break; - - i = msg_rx->sg.start; - do { - struct page *page; - int copy; - - sge = sk_msg_elem(msg_rx, i); - copy = sge->length; - page = sg_page(sge); - if (copied + copy > len) - copy = len - copied; - copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; - - copied += copy; - if (likely(!peek)) { - sge->offset += copy; - sge->length -= copy; - if (!msg_rx->skb) - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - - if (!sge->length) { - sk_msg_iter_var_next(i); - if (!msg_rx->skb) - put_page(page); - } - } else { - /* Lets not optimize peek case if copy_page_to_iter - * didn't copy the entire length lets just break. 
- */ - if (copy != sge->length) - return copied; - sk_msg_iter_var_next(i); - } - - if (copied == len) - break; - } while (i != msg_rx->sg.end); - - if (unlikely(peek)) { - msg_rx = sk_psock_next_msg(psock, msg_rx); - if (!msg_rx) - break; - continue; - } - - msg_rx->sg.start = i; - if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - msg_rx = sk_psock_dequeue_msg(psock); - kfree_sk_msg(msg_rx); - } - msg_rx = sk_psock_peek_msg(psock); - } - - return copied; -} -EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg); - static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes, int flags) { @@ -237,28 +163,6 @@ static bool tcp_bpf_stream_read(const struct sock *sk) return !empty; } -static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, - int flags, long timeo, int *err) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - int ret = 0; - - if (sk->sk_shutdown & RCV_SHUTDOWN) - return 1; - - if (!timeo) - return ret; - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - ret = sk_wait_event(sk, &timeo, - !list_empty(&psock->ingress_msg) || - !skb_queue_empty(&sk->sk_receive_queue), &wait); - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); - return ret; -} - static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -278,13 +182,13 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } lock_sock(sk); msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { int data, err = 0; long timeo; timeo = sock_rcvtimeo(sk, nonblock); - data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err); + data = sk_msg_wait_data(sk, psock, flags, timeo, &err); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 01d933ae5f16..1dcb34dfd56b 
100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1789,8 +1789,8 @@ int tls_sw_recvmsg(struct sock *sk, skb = tls_wait_data(sk, psock, flags, timeo, &err); if (!skb) { if (psock) { - int ret = __tcp_bpf_recvmsg(sk, psock, - msg, len, flags); + int ret = sk_msg_recvmsg(sk, psock, msg, len, + flags); if (ret > 0) { decrypted += ret; -- cgit v1.2.3 From 9fadafa46f4813b9e3d8f76d3525c83499a26d36 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 1 Apr 2021 15:08:22 +0800 Subject: include: net: Remove repeated struct declaration struct ctl_table_header is declared twice. One is declared at 46th line. The blew one is not needed. Remove the duplicate. Signed-off-by: Wan Jiabing Signed-off-by: David S. Miller --- include/net/net_namespace.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index dcaee24a4d87..47457048ab86 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -407,7 +407,6 @@ int register_pernet_device(struct pernet_operations *); void unregister_pernet_device(struct pernet_operations *); struct ctl_table; -struct ctl_table_header; #ifdef CONFIG_SYSCTL int net_sysctl_init(void); -- cgit v1.2.3 From dc87efdb1a5cd46134a9d490480160e303bc6eef Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 1 Apr 2021 16:19:44 -0700 Subject: mptcp: add mptcp reset option support The MPTCP reset option allows to carry a mptcp-specific error code that provides more information on the nature of a connection reset. Reset option data received gets stored in the subflow context so it can be sent to userspace via the 'subflow closed' netlink event. When a subflow is closed, the desired error code that should be sent to the peer is also placed in the subflow context structure. If a reset is sent before subflow establishment could complete, e.g. on HMAC failure during an MP_JOIN operation, the mptcp skb extension is used to store the reset information. 
Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/mptcp.h | 18 +++++++++++-- include/uapi/linux/mptcp.h | 11 ++++++++ net/ipv4/tcp_ipv4.c | 21 ++++++++++++--- net/ipv6/tcp_ipv6.c | 14 +++++++++- net/mptcp/options.c | 67 ++++++++++++++++++++++++++++++++++++++++++---- net/mptcp/pm_netlink.c | 12 +++++++++ net/mptcp/protocol.c | 12 ++++++--- net/mptcp/protocol.h | 14 +++++++++- net/mptcp/subflow.c | 30 ++++++++++++++++++--- 9 files changed, 180 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index cea69c801595..16fe34d139c3 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -30,8 +30,8 @@ struct mptcp_ext { ack64:1, mpc_map:1, frozen:1, - __unused:1; - /* one byte hole */ + reset_transient:1; + u8 reset_reason:4; }; #define MPTCP_RM_IDS_MAX 8 @@ -58,6 +58,8 @@ struct mptcp_out_options { struct mptcp_rm_list rm_list; u8 join_id; u8 backup; + u8 reset_reason:4; + u8 reset_transient:1; u32 nonce; u64 thmac; u32 token; @@ -156,6 +158,16 @@ void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); + +__be32 mptcp_get_reset_option(const struct sk_buff *skb); + +static inline __be32 mptcp_reset_option(const struct sk_buff *skb) +{ + if (skb_ext_exist(skb, SKB_EXT_MPTCP)) + return mptcp_get_reset_option(skb); + + return htonl(0u); +} #else static inline void mptcp_init(void) @@ -236,6 +248,8 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, { return 0; /* TCP fallback */ } + +static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index e1172c1ffdfd..8eb3c0844bff 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -174,10 +174,21 @@ enum 
mptcp_event_attr { MPTCP_ATTR_FLAGS, /* u16 */ MPTCP_ATTR_TIMEOUT, /* u32 */ MPTCP_ATTR_IF_IDX, /* s32 */ + MPTCP_ATTR_RESET_REASON,/* u32 */ + MPTCP_ATTR_RESET_FLAGS, /* u32 */ __MPTCP_ATTR_AFTER_LAST }; #define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1) +/* MPTCP Reset reason codes, rfc8684 */ +#define MPTCP_RST_EUNSPEC 0 +#define MPTCP_RST_EMPTCP 1 +#define MPTCP_RST_ERESOURCE 2 +#define MPTCP_RST_EPROHIBIT 3 +#define MPTCP_RST_EWQ2BIG 4 +#define MPTCP_RST_EBADPERF 5 +#define MPTCP_RST_EMIDDLEBOX 6 + #endif /* _UAPI_MPTCP_H */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index dfc6d1c0e710..312184cead57 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -655,14 +655,18 @@ EXPORT_SYMBOL(tcp_v4_send_check); * Exception: precedence violation. We do not implement it in any case. */ +#ifdef CONFIG_TCP_MD5SIG +#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED +#else +#define OPTION_BYTES sizeof(__be32) +#endif + static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct { struct tcphdr th; -#ifdef CONFIG_TCP_MD5SIG - __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)]; -#endif + __be32 opt[OPTION_BYTES / sizeof(__be32)]; } rep; struct ip_reply_arg arg; #ifdef CONFIG_TCP_MD5SIG @@ -770,6 +774,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ip_hdr(skb)->daddr, &rep.th); } #endif + /* Can't co-exist with TCPMD5, hence check rep.opt[0] */ + if (rep.opt[0] == 0) { + __be32 mrst = mptcp_reset_option(skb); + + if (mrst) { + rep.opt[0] = mrst; + arg.iov[0].iov_len += sizeof(mrst); + rep.th.doff = arg.iov[0].iov_len / 4; + } + } + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bff22d6ef516..5f47c0b6e3de 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -879,8 +879,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct 
sk_buff *skb, u32 struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); + __be32 mrst = 0, *topt; struct dst_entry *dst; - __be32 *topt; __u32 mark = 0; if (tsecr) @@ -890,6 +890,15 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 tot_len += TCPOLEN_MD5SIG_ALIGNED; #endif +#ifdef CONFIG_MPTCP + if (rst && !key) { + mrst = mptcp_reset_option(skb); + + if (mrst) + tot_len += sizeof(__be32); + } +#endif + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, GFP_ATOMIC); if (!buff) @@ -920,6 +929,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 *topt++ = htonl(tsecr); } + if (mrst) + *topt++ = mrst; + #ifdef CONFIG_TCP_MD5SIG if (key) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 68361d28dc67..4b7119eb2c31 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -305,6 +305,18 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->fastclose = 1; break; + case MPTCPOPT_RST: + if (opsize != TCPOLEN_MPTCP_RST) + break; + + if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) + break; + mp_opt->reset = 1; + flags = *ptr++; + mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; + mp_opt->reset_reason = *ptr; + break; + default: break; } @@ -327,6 +339,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->rm_addr = 0; mp_opt->dss = 0; mp_opt->mp_prio = 0; + mp_opt->reset = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -726,6 +739,22 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, return true; } +static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + const struct mptcp_subflow_context *subflow = 
mptcp_subflow_ctx(sk); + + if (remaining < TCPOLEN_MPTCP_RST) + return; + + *size = TCPOLEN_MPTCP_RST; + opts->suboptions |= OPTION_MPTCP_RST; + opts->reset_transient = subflow->reset_transient; + opts->reset_reason = subflow->reset_reason; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -741,11 +770,10 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(__mptcp_check_fallback(msk))) return false; - /* prevent adding of any MPTCP related options on reset packet - * until we support MP_TCPRST/MP_FASTCLOSE - */ - if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) - return false; + if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { + mptcp_established_options_rst(sk, skb, size, remaining, opts); + return true; + } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) @@ -1062,6 +1090,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mp_opt.mp_prio = 0; } + if (mp_opt.reset) { + subflow->reset_seen = 1; + subflow->reset_reason = mp_opt.reset_reason; + subflow->reset_transient = mp_opt.reset_transient; + } + if (!mp_opt.dss) return; @@ -1289,6 +1323,12 @@ mp_capable_done: ptr += 5; } + if (OPTION_MPTCP_RST & opts->suboptions) + *ptr++ = mptcp_option(MPTCPOPT_RST, + TCPOLEN_MPTCP_RST, + opts->reset_transient, + opts->reset_reason); + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; @@ -1340,3 +1380,20 @@ mp_capable_done: if (tp) mptcp_set_rwin(tp); } + +__be32 mptcp_get_reset_option(const struct sk_buff *skb) +{ + const struct mptcp_ext *ext = mptcp_get_ext(skb); + u8 flags, reason; + + if (ext) { + flags = ext->reset_transient; + reason = ext->reset_reason; + + return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, + flags, reason); + } 
+ + return htonl(0u); +} +EXPORT_SYMBOL_GPL(mptcp_get_reset_option); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index cadafafa1049..51be6c34b339 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1687,9 +1687,21 @@ static int mptcp_event_sub_closed(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { + const struct mptcp_subflow_context *sf; + if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) return -EMSGSIZE; + sf = mptcp_subflow_ctx(ssk); + if (!sf->reset_seen) + return 0; + + if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason)) + return -EMSGSIZE; + + if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient)) + return -EMSGSIZE; + return 0; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 531ee24aa827..e894345d10c1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3090,14 +3090,18 @@ bool mptcp_finish_join(struct sock *ssk) pr_debug("msk=%p, subflow=%p", msk, subflow); /* mptcp socket already closing? */ - if (!mptcp_is_fully_established(parent)) + if (!mptcp_is_fully_established(parent)) { + subflow->reset_reason = MPTCP_RST_EMPTCP; return false; + } if (!msk->pm.server_side) goto out; - if (!mptcp_pm_allow_new_subflow(msk)) + if (!mptcp_pm_allow_new_subflow(msk)) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* active connections are already on conn_list, and we can't acquire * msk lock here. 
@@ -3111,8 +3115,10 @@ bool mptcp_finish_join(struct sock *ssk) sock_hold(ssk); } spin_unlock_bh(&msk->join_list_lock); - if (!ret) + if (!ret) { + subflow->reset_reason = MPTCP_RST_EPROHIBIT; return false; + } /* attach to msk socket only after we are sure he will deal with us * at close time diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index e8c5ff2b8ace..40e9b05856cd 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -26,6 +26,7 @@ #define OPTION_MPTCP_RM_ADDR BIT(8) #define OPTION_MPTCP_FASTCLOSE BIT(9) #define OPTION_MPTCP_PRIO BIT(10) +#define OPTION_MPTCP_RST BIT(11) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -36,6 +37,7 @@ #define MPTCPOPT_MP_PRIO 5 #define MPTCPOPT_MP_FAIL 6 #define MPTCPOPT_MP_FASTCLOSE 7 +#define MPTCPOPT_RST 8 /* MPTCP suboption lengths */ #define TCPOLEN_MPTCP_MPC_SYN 4 @@ -65,6 +67,7 @@ #define TCPOLEN_MPTCP_PRIO 3 #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 +#define TCPOLEN_MPTCP_RST 4 /* MPTCP MP_JOIN flags */ #define MPTCPOPT_BACKUP BIT(0) @@ -94,6 +97,9 @@ /* MPTCP MP_PRIO flags */ #define MPTCP_PRIO_BKUP BIT(0) +/* MPTCP TCPRST flags */ +#define MPTCP_RST_TRANSIENT BIT(0) + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_NOSPACE 1 @@ -123,6 +129,7 @@ struct mptcp_options_received { u16 mp_capable : 1, mp_join : 1, fastclose : 1, + reset : 1, dss : 1, add_addr : 1, rm_addr : 1, @@ -152,6 +159,8 @@ struct mptcp_options_received { }; u64 ahmac; u16 port; + u8 reset_reason:4; + u8 reset_transient:1; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) @@ -422,6 +431,9 @@ struct mptcp_subflow_context { u8 hmac[MPTCPOPT_HMAC_LEN]; u8 local_id; u8 remote_id; + u8 reset_seen:1; + u8 reset_transient:1; + u8 reset_reason:4; long delegated_status; struct list_head delegated_node; /* link into delegated_action, protected by local BH */ @@ -742,7 +754,7 @@ unsigned int mptcp_pm_get_add_addr_accept_max(struct mptcp_sock *msk); unsigned 
int mptcp_pm_get_subflows_max(struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(struct mptcp_sock *msk); -static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) +static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) { return (struct mptcp_ext *)skb_ext_find(skb, SKB_EXT_MPTCP); } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 7a5f50d00d4b..223d6be5fc3b 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -115,6 +115,16 @@ static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct soc return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport; } +static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason) +{ + struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP); + + if (mpext) { + memset(mpext, 0, sizeof(*mpext)); + mpext->reset_reason = reason; + } +} + /* Init mptcp request socket. * * Returns an error code if a JOIN has failed and a TCP reset @@ -190,8 +200,10 @@ again: subflow_req->msk = subflow_token_join_request(req); /* Can't fall back to TCP in this case. 
*/ - if (!subflow_req->msk) + if (!subflow_req->msk) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); return -EPERM; + } if (subflow_use_different_sport(subflow_req->msk, sk_listener)) { pr_debug("syn inet_sport=%d %d", @@ -400,8 +412,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; - if (!mp_opt.mp_join) + if (!mp_opt.mp_join) { + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; + } subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; @@ -410,6 +424,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } @@ -438,6 +453,7 @@ fallback: return; do_reset: + subflow->reset_transient = 0; mptcp_subflow_reset(sk); } @@ -654,8 +670,10 @@ create_child: * to reset the context to non MPTCP status. */ if (!ctx || fallback) { - if (fallback_is_fatal) + if (fallback_is_fatal) { + subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; + } subflow_drop_ctx(child); goto out; @@ -690,8 +708,10 @@ create_child: struct mptcp_sock *owner; owner = subflow_req->msk; - if (!owner) + if (!owner) { + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; + } /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; @@ -1056,6 +1076,8 @@ fatal: smp_wmb(); ssk->sk_error_report(ssk); tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMPTCP; tcp_send_active_reset(ssk, GFP_ATOMIC); subflow->data_avail = 0; return false; -- cgit v1.2.3 From 7f040aa322617acf1978b8de140f1bf9ff916617 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Apr 2021 11:07:46 -0700 Subject: net: reorganize fields in netns_mib Order fields to increase locality for most used protocols. 
udplite and icmp are moved at the end. Same for proc_net_devsnmp6 which is not used in fast path. This potentially saves one cache line miss for typical TCP/UDP over IPv4/IPv6. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/mib.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h index 59b2c3a3db42..7e373664b1e7 100644 --- a/include/net/netns/mib.h +++ b/include/net/netns/mib.h @@ -5,22 +5,19 @@ #include struct netns_mib { - DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); +#if IS_ENABLED(CONFIG_IPV6) + DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); +#endif + + DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); DEFINE_SNMP_STAT(struct linux_mib, net_statistics); - DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); - DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics); - DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); - DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib, icmpmsg_statistics); + DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); #if IS_ENABLED(CONFIG_IPV6) - struct proc_dir_entry *proc_net_devsnmp6; DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6); - DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6); - DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); - DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); - DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib, icmpv6msg_statistics); #endif + #ifdef CONFIG_XFRM_STATISTICS DEFINE_SNMP_STAT(struct linux_xfrm_mib, xfrm_statistics); #endif @@ -30,6 +27,19 @@ struct netns_mib { #ifdef CONFIG_MPTCP DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics); #endif + + DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics); +#if IS_ENABLED(CONFIG_IPV6) + DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6); +#endif + + DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); + DEFINE_SNMP_STAT_ATOMIC(struct 
icmpmsg_mib, icmpmsg_statistics); +#if IS_ENABLED(CONFIG_IPV6) + DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); + DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib, icmpv6msg_statistics); + struct proc_dir_entry *proc_net_devsnmp6; +#endif }; #endif -- cgit v1.2.3 From 82506665179209e43d3c9d39ffa42f8c8ff968bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Apr 2021 11:10:37 -0700 Subject: tcp: reorder tcp_congestion_ops for better cache locality Group all the often used fields in the first cache line, to reduce cache line misses. Signed-off-by: Eric Dumazet Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 31b1696c62ba..eaea43afcc97 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1035,44 +1035,56 @@ struct rate_sample { }; struct tcp_congestion_ops { - struct list_head list; - u32 key; - u32 flags; - - /* initialize private data (optional) */ - void (*init)(struct sock *sk); - /* cleanup private data (optional) */ - void (*release)(struct sock *sk); +/* fast path fields are put first to fill one cache line */ /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); + /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); + /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); + /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); - /* new value of cwnd after loss (required) */ - u32 (*undo_cwnd)(struct sock *sk); + /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + /* override sysctl_tcp_min_tso_segs */ u32 
(*min_tso_segs)(struct sock *sk); - /* returns the multiplier used in tcp_sndbuf_expand (optional) */ - u32 (*sndbuf_expand)(struct sock *sk); + /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, const struct rate_sample *rs); + + + /* new value of cwnd after loss (required) */ + u32 (*undo_cwnd)(struct sock *sk); + /* returns the multiplier used in tcp_sndbuf_expand (optional) */ + u32 (*sndbuf_expand)(struct sock *sk); + +/* control/slow paths put last */ /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); - char name[TCP_CA_NAME_MAX]; - struct module *owner; -}; + char name[TCP_CA_NAME_MAX]; + struct module *owner; + struct list_head list; + u32 key; + u32 flags; + + /* initialize private data (optional) */ + void (*init)(struct sock *sk); + /* cleanup private data (optional) */ + void (*release)(struct sock *sk); +} ____cacheline_aligned_in_smp; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); -- cgit v1.2.3 From 4c4c0a89abd5c08e91df9bcce4ebcb3433bbb9bf Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 19 Feb 2021 08:18:12 +0200 Subject: net/mlx5: Pack mlx5_rl_entry structure mlx5_rl_entry structure is not properly packed as shown below. Due to this an array of size 9144 bytes allocated which is aligned to 16Kbytes. Hence, pack the structure and avoid the wastage. This offers 8Kbytes of saving per mlx5_core_dev struct. 
pahole -C mlx5_rl_entry drivers/net/ethernet/mellanox/mlx5/core/en_main.o Existing layout: struct mlx5_rl_entry { u8 rl_raw[48]; /* 0 48 */ u16 index; /* 48 2 */ /* XXX 6 bytes hole, try to pack */ u64 refcount; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ u16 uid; /* 64 2 */ u8 dedicated:1; /* 66: 0 1 */ /* size: 72, cachelines: 2, members: 5 */ /* sum members: 60, holes: 1, sum holes: 6 */ /* sum bitfield members: 1 bits (0 bytes) */ /* padding: 5 */ /* bit_padding: 7 bits */ /* last cacheline: 8 bytes */ }; After alignment: struct mlx5_rl_entry { u8 rl_raw[48]; /* 0 48 */ u64 refcount; /* 48 8 */ u16 index; /* 56 2 */ u16 uid; /* 58 2 */ u8 dedicated:1; /* 60: 0 1 */ /* size: 64, cachelines: 1, members: 5 */ /* padding: 3 */ /* bit_padding: 7 bits */ }; Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 23bb01d7c9b9..a9bd7e3bd554 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -517,8 +517,8 @@ struct mlx5_rate_limit { struct mlx5_rl_entry { u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)]; - u16 index; u64 refcount; + u16 index; u16 uid; u8 dedicated : 1; }; -- cgit v1.2.3 From 6b30b6d4d36c978e0ab0f22e85bf3c646732e98b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 19 Feb 2021 12:06:54 +0200 Subject: net/mlx5: Allocate rate limit table when rate is configured A device supports 128 rate limiters. A static table allocation consumes 8KB of memory even when rate is not configured. Instead, allocate the table when at least one rate is configured. 
Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/rl.c | 46 ++++++++++++++++++++++------ include/linux/mlx5/driver.h | 1 + 2 files changed, 38 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rl.c b/drivers/net/ethernet/mellanox/mlx5/core/rl.c index 08792fe701e3..0526e3798c09 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c @@ -117,6 +117,9 @@ static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table, bool empty_found = false; int i; + lockdep_assert_held(&table->rl_lock); + WARN_ON(!table->rl_entry); + for (i = 0; i < table->max_size; i++) { if (dedicated) { if (!table->rl_entry[i].refcount) @@ -172,10 +175,17 @@ bool mlx5_rl_are_equal(struct mlx5_rate_limit *rl_0, } EXPORT_SYMBOL(mlx5_rl_are_equal); -static int mlx5_rl_table_alloc(struct mlx5_rl_table *table) +static int mlx5_rl_table_get(struct mlx5_rl_table *table) { int i; + lockdep_assert_held(&table->rl_lock); + + if (table->rl_entry) { + table->refcount++; + return 0; + } + table->rl_entry = kcalloc(table->max_size, sizeof(struct mlx5_rl_entry), GFP_KERNEL); if (!table->rl_entry) @@ -187,13 +197,27 @@ static int mlx5_rl_table_alloc(struct mlx5_rl_table *table) for (i = 0; i < table->max_size; i++) table->rl_entry[i].index = i + 1; + table->refcount++; return 0; } +static void mlx5_rl_table_put(struct mlx5_rl_table *table) +{ + lockdep_assert_held(&table->rl_lock); + if (--table->refcount) + return; + + kfree(table->rl_entry); + table->rl_entry = NULL; +} + static void mlx5_rl_table_free(struct mlx5_core_dev *dev, struct mlx5_rl_table *table) { int i; + if (!table->rl_entry) + return; + /* Clear all configured rates */ for (i = 0; i < table->max_size; i++) if (table->rl_entry[i].refcount) @@ -219,8 +243,8 @@ int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid, { struct mlx5_rl_table *table = 
&dev->priv.rl_table; struct mlx5_rl_entry *entry; - int err = 0; u32 rate; + int err; if (!table->max_size) return -EOPNOTSUPP; @@ -233,13 +257,16 @@ int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid, } mutex_lock(&table->rl_lock); + err = mlx5_rl_table_get(table); + if (err) + goto out; entry = find_rl_entry(table, rl_in, uid, dedicated_entry); if (!entry) { mlx5_core_err(dev, "Max number of %u rates reached\n", table->max_size); err = -ENOSPC; - goto out; + goto rl_err; } if (!entry->refcount) { /* new rate limit */ @@ -255,14 +282,18 @@ int mlx5_rl_add_rate_raw(struct mlx5_core_dev *dev, void *rl_in, u16 uid, burst_upper_bound), MLX5_GET(set_pp_rate_limit_context, rl_in, typical_packet_size)); - goto out; + goto rl_err; } entry->dedicated = dedicated_entry; } mlx5_rl_entry_get(entry); *index = entry->index; + mutex_unlock(&table->rl_lock); + return 0; +rl_err: + mlx5_rl_table_put(table); out: mutex_unlock(&table->rl_lock); return err; @@ -277,6 +308,7 @@ void mlx5_rl_remove_rate_raw(struct mlx5_core_dev *dev, u16 index) mutex_lock(&table->rl_lock); entry = &table->rl_entry[index - 1]; mlx5_rl_entry_put(dev, entry); + mlx5_rl_table_put(table); mutex_unlock(&table->rl_lock); } EXPORT_SYMBOL(mlx5_rl_remove_rate_raw); @@ -325,6 +357,7 @@ void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, struct mlx5_rate_limit *rl) goto out; } mlx5_rl_entry_put(dev, entry); + mlx5_rl_table_put(table); out: mutex_unlock(&table->rl_lock); } @@ -333,7 +366,6 @@ EXPORT_SYMBOL(mlx5_rl_remove_rate); int mlx5_init_rl_table(struct mlx5_core_dev *dev) { struct mlx5_rl_table *table = &dev->priv.rl_table; - int err; mutex_init(&table->rl_lock); if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, packet_pacing)) { @@ -346,10 +378,6 @@ int mlx5_init_rl_table(struct mlx5_core_dev *dev) table->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate); table->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate); - err = mlx5_rl_table_alloc(table); - if (err) - return err; - 
mlx5_core_info(dev, "Rate limit: %u rates are supported, range: %uMbps to %uMbps\n", table->max_size, table->min_rate >> 10, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a9bd7e3bd554..baf38b5a2a8c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -530,6 +530,7 @@ struct mlx5_rl_table { u32 max_rate; u32 min_rate; struct mlx5_rl_entry *rl_entry; + u64 refcount; }; struct mlx5_core_roce { -- cgit v1.2.3 From 2daae89666ad253281bb3d6a027c00a702c02eff Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 1 Apr 2021 14:46:37 +0800 Subject: bpf, cgroup: Delete repeated struct bpf_prog declaration struct bpf_prog is declared twice. There is one declaration which is independent on the macro at 18th line. So the below one is not needed though. Remove the duplicate. Signed-off-by: Wan Jiabing Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210401064637.993327-1-wanjiabing@vivo.com --- include/linux/bpf-cgroup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 6a29fe11485d..8b77d08d4b47 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -458,7 +458,6 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); #else -struct bpf_prog; struct cgroup_bpf {}; static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} -- cgit v1.2.3 From 6ac4c6f887f5a8efb6a6952798c09a2562022966 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 1 Apr 2021 15:20:37 +0800 Subject: bpf: Remove repeated struct btf_type declaration struct btf_type is declared twice. One is declared at 35th line. The below one is not needed, hence remove the duplicate. 
Signed-off-by: Wan Jiabing Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210401072037.995849-1-wanjiabing@vivo.com --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9fdd839b418c..ff8cd68c01b3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -928,7 +928,6 @@ struct bpf_link_primer { }; struct bpf_struct_ops_value; -struct btf_type; struct btf_member; #define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64 -- cgit v1.2.3 From c3d5c2d96d69f2578d6fbf66e39cc2cf840d9812 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 4 Apr 2021 10:22:18 +0300 Subject: PCI/IOV: Add sysfs MSI-X vector assignment interface A typical cloud provider SR-IOV use case is to create many VFs for use by guest VMs. The VFs may not be assigned to a VM until a customer requests a VM of a certain size, e.g., number of CPUs. A VF may need MSI-X vectors proportional to the number of CPUs in the VM, but there is no standard way to change the number of MSI-X vectors supported by a VF. Some Mellanox ConnectX devices support dynamic assignment of MSI-X vectors to SR-IOV VFs. This can be done by the PF driver after VFs are enabled, and it can be done without affecting VFs that are already in use. The hardware supports a limited pool of MSI-X vectors that can be assigned to the PF or to individual VFs. This is device-specific behavior that requires support in the PF driver. Add a read-only "sriov_vf_total_msix" sysfs file for the PF and a writable "sriov_vf_msix_count" file for each VF. Management software may use these to learn how many MSI-X vectors are available and to dynamically assign them to VFs before the VFs are passed through to a VM. If the PF driver implements the ->sriov_get_vf_total_msix() callback, "sriov_vf_total_msix" contains the total number of MSI-X vectors available for distribution among VFs. 
If no driver is bound to the VF, writing "N" to "sriov_vf_msix_count" uses the PF driver ->sriov_set_msix_vec_count() callback to assign "N" MSI-X vectors to the VF. When a VF driver subsequently reads the MSI-X Message Control register, it will see the new Table Size "N". Link: https://lore.kernel.org/linux-pci/20210314124256.70253-2-leon@kernel.org Acked-by: Bjorn Helgaas Signed-off-by: Leon Romanovsky --- Documentation/ABI/testing/sysfs-bus-pci | 29 +++++++++ drivers/pci/iov.c | 102 ++++++++++++++++++++++++++++++-- drivers/pci/pci-sysfs.c | 3 +- drivers/pci/pci.h | 3 +- include/linux/pci.h | 8 +++ 5 files changed, 137 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index 25c9c39770c6..e5cfd170b491 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -375,3 +375,32 @@ Description: The value comes from the PCI kernel device state and can be one of: "unknown", "error", "D0", D1", "D2", "D3hot", "D3cold". The file is read only. + +What: /sys/bus/pci/devices/.../sriov_vf_total_msix +Date: January 2021 +Contact: Leon Romanovsky +Description: + This file is associated with a SR-IOV physical function (PF). + It contains the total number of MSI-X vectors available for + assignment to all virtual functions (VFs) associated with PF. + The value will be zero if the device doesn't support this + functionality. For supported devices, the value will be + constant and won't be changed after MSI-X vectors assignment. + +What: /sys/bus/pci/devices/.../sriov_vf_msix_count +Date: January 2021 +Contact: Leon Romanovsky +Description: + This file is associated with a SR-IOV virtual function (VF). + It allows configuration of the number of MSI-X vectors for + the VF. This allows devices that have a global pool of MSI-X + vectors to optimally divide them between VFs based on VF usage. 
+ + The values accepted are: + * > 0 - this number will be reported as the Table Size in the + VF's MSI-X capability + * < 0 - not valid + * = 0 - will reset to the device default value + + The file is writable if the PF is bound to a driver that + implements ->sriov_set_msix_vec_count(). diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 4afd4ee4f7f0..afc06e6ce115 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -31,6 +31,7 @@ int pci_iov_virtfn_devfn(struct pci_dev *dev, int vf_id) return (dev->devfn + dev->sriov->offset + dev->sriov->stride * vf_id) & 0xff; } +EXPORT_SYMBOL_GPL(pci_iov_virtfn_devfn); /* * Per SR-IOV spec sec 3.3.10 and 3.3.11, First VF Offset and VF Stride may @@ -157,6 +158,92 @@ failed: return rc; } +#ifdef CONFIG_PCI_MSI +static ssize_t sriov_vf_total_msix_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pci_dev *pdev = to_pci_dev(dev); + u32 vf_total_msix = 0; + + device_lock(dev); + if (!pdev->driver || !pdev->driver->sriov_get_vf_total_msix) + goto unlock; + + vf_total_msix = pdev->driver->sriov_get_vf_total_msix(pdev); +unlock: + device_unlock(dev); + return sysfs_emit(buf, "%u\n", vf_total_msix); +} +static DEVICE_ATTR_RO(sriov_vf_total_msix); + +static ssize_t sriov_vf_msix_count_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *vf_dev = to_pci_dev(dev); + struct pci_dev *pdev = pci_physfn(vf_dev); + int val, ret; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0) + return -EINVAL; + + device_lock(&pdev->dev); + if (!pdev->driver || !pdev->driver->sriov_set_msix_vec_count) { + ret = -EOPNOTSUPP; + goto err_pdev; + } + + device_lock(&vf_dev->dev); + if (vf_dev->driver) { + /* + * A driver is already attached to this VF and has configured + * itself based on the current MSI-X vector count. Changing + * the vector size could mess up the driver, so block it. 
+ */ + ret = -EBUSY; + goto err_dev; + } + + ret = pdev->driver->sriov_set_msix_vec_count(vf_dev, val); + +err_dev: + device_unlock(&vf_dev->dev); +err_pdev: + device_unlock(&pdev->dev); + return ret ? : count; +} +static DEVICE_ATTR_WO(sriov_vf_msix_count); +#endif + +static struct attribute *sriov_vf_dev_attrs[] = { +#ifdef CONFIG_PCI_MSI + &dev_attr_sriov_vf_msix_count.attr, +#endif + NULL, +}; + +static umode_t sriov_vf_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + + if (!pdev->is_virtfn) + return 0; + + return a->mode; +} + +const struct attribute_group sriov_vf_dev_attr_group = { + .attrs = sriov_vf_dev_attrs, + .is_visible = sriov_vf_attrs_are_visible, +}; + int pci_iov_add_virtfn(struct pci_dev *dev, int id) { int i; @@ -400,18 +487,21 @@ static DEVICE_ATTR_RO(sriov_stride); static DEVICE_ATTR_RO(sriov_vf_device); static DEVICE_ATTR_RW(sriov_drivers_autoprobe); -static struct attribute *sriov_dev_attrs[] = { +static struct attribute *sriov_pf_dev_attrs[] = { &dev_attr_sriov_totalvfs.attr, &dev_attr_sriov_numvfs.attr, &dev_attr_sriov_offset.attr, &dev_attr_sriov_stride.attr, &dev_attr_sriov_vf_device.attr, &dev_attr_sriov_drivers_autoprobe.attr, +#ifdef CONFIG_PCI_MSI + &dev_attr_sriov_vf_total_msix.attr, +#endif NULL, }; -static umode_t sriov_attrs_are_visible(struct kobject *kobj, - struct attribute *a, int n) +static umode_t sriov_pf_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) { struct device *dev = kobj_to_dev(kobj); @@ -421,9 +511,9 @@ static umode_t sriov_attrs_are_visible(struct kobject *kobj, return a->mode; } -const struct attribute_group sriov_dev_attr_group = { - .attrs = sriov_dev_attrs, - .is_visible = sriov_attrs_are_visible, +const struct attribute_group sriov_pf_dev_attr_group = { + .attrs = sriov_pf_dev_attrs, + .is_visible = sriov_pf_attrs_are_visible, }; int __weak pcibios_sriov_enable(struct pci_dev 
*pdev, u16 num_vfs) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index f8afd54ca3e1..a6b8fbbba6d2 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1567,7 +1567,8 @@ static const struct attribute_group *pci_dev_attr_groups[] = { &pci_dev_attr_group, &pci_dev_hp_attr_group, #ifdef CONFIG_PCI_IOV - &sriov_dev_attr_group, + &sriov_pf_dev_attr_group, + &sriov_vf_dev_attr_group, #endif &pci_bridge_attr_group, &pcie_dev_attr_group, diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ef7c4661314f..afb87b917f07 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -501,7 +501,8 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); -extern const struct attribute_group sriov_dev_attr_group; +extern const struct attribute_group sriov_pf_dev_attr_group; +extern const struct attribute_group sriov_vf_dev_attr_group; #else static inline int pci_iov_init(struct pci_dev *dev) { diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..9b575a676888 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -856,6 +856,12 @@ struct module; * e.g. drivers/net/e100.c. * @sriov_configure: Optional driver callback to allow configuration of * number of VFs to enable via sysfs "sriov_numvfs" file. + * @sriov_set_msix_vec_count: PF Driver callback to change number of MSI-X + * vectors on a VF. Triggered via sysfs "sriov_vf_msix_count". + * This will change MSI-X Table Size in the VF Message Control + * registers. + * @sriov_get_vf_total_msix: PF driver callback to get the total number of + * MSI-X vectors available for distribution to the VFs. * @err_handler: See Documentation/PCI/pci-error-recovery.rst * @groups: Sysfs attribute groups. * @driver: Driver model structure. 
@@ -871,6 +877,8 @@ struct pci_driver { int (*resume)(struct pci_dev *dev); /* Device woken up */ void (*shutdown)(struct pci_dev *dev); int (*sriov_configure)(struct pci_dev *dev, int num_vfs); /* On PF */ + int (*sriov_set_msix_vec_count)(struct pci_dev *vf, int msix_vec_count); /* On PF */ + u32 (*sriov_get_vf_total_msix)(struct pci_dev *pf); const struct pci_error_handlers *err_handler; const struct attribute_group **groups; struct device_driver driver; -- cgit v1.2.3 From 0b989c1e37053196676b2238f82195bd5a339d58 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 14 Mar 2021 14:42:54 +0200 Subject: net/mlx5: Add dynamic MSI-X capabilities bits These new fields declare the number of MSI-X vectors that is possible to allocate on the VF through PF configuration. Value must be in range defined by min_dynamic_vf_msix_table_size and max_dynamic_vf_msix_table_size. The driver should continue to query its MSI-X table through PCI configuration header. Link: https://lore.kernel.org/linux-pci/20210314124256.70253-3-leon@kernel.org Acked-by: Bjorn Helgaas Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3ee7a86f39e4..432290b58a0b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1683,7 +1683,16 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_6e0[0x10]; u8 sf_base_id[0x10]; - u8 reserved_at_700[0x80]; + u8 reserved_at_700[0x8]; + u8 num_total_dynamic_vf_msix[0x18]; + u8 reserved_at_720[0x14]; + u8 dynamic_msix_table_size[0xc]; + u8 reserved_at_740[0xc]; + u8 min_dynamic_vf_msix_table_size[0x4]; + u8 reserved_at_750[0x4]; + u8 max_dynamic_vf_msix_table_size[0xc]; + + u8 reserved_at_760[0x20]; u8 vhca_tunnel_commands[0x40]; u8 reserved_at_7c0[0x40]; }; -- cgit v1.2.3 From eeb85a14ee3494febb85ccfbee0772eda0823b13 Mon Sep 17 00:00:00 2001 From: Andrei 
Vagin Date: Mon, 5 Apr 2021 00:12:23 -0700 Subject: net: Allow to specify ifindex when device is moved to another namespace Currently, we can specify ifindex on link creation. This change allows to specify ifindex when a device is moved to another network namespace. Even now, a device ifindex can be changed if there is another device with the same ifindex in the target namespace. So this change doesn't introduce completely new behavior, it adds more control to the process. CRIU users want to restore containers with pre-created network devices. A user will provide network devices and instructions where they have to be restored, then CRIU will restore network namespaces and move devices into them. The problem is that devices have to be restored with the same indexes that they have before C/R. Cc: Alexander Mikhalitsyn Suggested-by: Christian Brauner Signed-off-by: Andrei Vagin Reviewed-by: Christian Brauner Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 2 +- include/linux/netdevice.h | 3 ++- net/core/dev.c | 24 +++++++++++++++++------- net/core/rtnetlink.c | 19 +++++++++++++++---- net/ieee802154/core.c | 4 ++-- net/wireless/core.c | 4 ++-- 6 files changed, 39 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 7349a70af083..8c0c70e1da77 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2354,7 +2354,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev) */ if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) { ret = dev_change_net_namespace(vf_netdev, - dev_net(ndev), "eth%d"); + dev_net(ndev), "eth%d", 0); if (ret) netdev_err(vf_netdev, "could not move to same namespace as %s: %d\n", diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f57b70fc251f..b482236c0e99 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4026,7 +4026,8 @@ void __dev_notify_flags(struct net_device *, 
unsigned int old_flags, int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); -int dev_change_net_namespace(struct net_device *, struct net *, const char *); +int dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex); int __dev_set_mtu(struct net_device *, int); int dev_validate_mtu(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); diff --git a/net/core/dev.c b/net/core/dev.c index b4c67a5be606..9d1a8fac793f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11067,6 +11067,8 @@ EXPORT_SYMBOL(unregister_netdev); * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. + * @new_ifindex: If not zero, specifies device index in the target + * namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on @@ -11075,10 +11077,11 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. */ -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex) { struct net *net_old = dev_net(dev); - int err, new_nsid, new_ifindex; + int err, new_nsid; ASSERT_RTNL(); @@ -11109,6 +11112,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char goto out; } + /* Check that new_ifindex isn't used yet. */ + err = -EBUSY; + if (new_ifindex && __dev_get_by_index(net, new_ifindex)) + goto out; + /* * And now a mini version of register_netdevice unregister_netdevice. 
*/ @@ -11136,10 +11144,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; + if (!new_ifindex) { + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); + else + new_ifindex = dev->ifindex; + } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11448,7 +11458,7 @@ static void __net_exit default_device_exit(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (__dev_get_by_name(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); - err = dev_change_net_namespace(dev, &init_net, fb_name); + err = dev_change_net_namespace(dev, &init_net, fb_name, 0); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1bdcb33fb561..d51252afde0a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2266,6 +2266,9 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return -EINVAL; } + if (tb[IFLA_NEW_IFINDEX] && nla_get_s32(tb[IFLA_NEW_IFINDEX]) <= 0) + return -EINVAL; + if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; @@ -2603,14 +2606,22 @@ static int do_setlink(const struct sk_buff *skb, return err; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { - struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), - tb, CAP_NET_ADMIN); + struct net *net; + int new_ifindex; + + net = rtnl_link_get_net_capable(skb, dev_net(dev), + tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } - err = dev_change_net_namespace(dev, net, ifname); + if (tb[IFLA_NEW_IFINDEX]) + new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); + else + new_ifindex = 0; + + err = 
dev_change_net_namespace(dev, net, ifname, new_ifindex); put_net(net); if (err) goto errout; @@ -3452,7 +3463,7 @@ replay: if (err < 0) goto out_unregister; if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); + err = dev_change_net_namespace(dev, dest_net, ifname, 0); if (err < 0) goto out_unregister; } diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index de259b5170ab..ec3068937fc3 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -205,7 +205,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, if (!wpan_dev->netdev) continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; - err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); + err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d", 0); if (err) break; wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; @@ -222,7 +222,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wpan_dev->netdev, net, - "wpan%d"); + "wpan%d", 0); WARN_ON(err); wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; } diff --git a/net/wireless/core.c b/net/wireless/core.c index a2785379df6e..fabb677b7d58 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -165,7 +165,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, if (!wdev->netdev) continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; - err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); + err = dev_change_net_namespace(wdev->netdev, net, "wlan%d", 0); if (err) break; wdev->netdev->features |= NETIF_F_NETNS_LOCAL; @@ -182,7 +182,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wdev->netdev, net, - "wlan%d"); + "wlan%d", 0); WARN_ON(err); wdev->netdev->features |= NETIF_F_NETNS_LOCAL; } -- cgit v1.2.3 From 
237c609f8744a8d5415f40a7ee731957934b0eef Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 1 Apr 2021 16:11:04 +0200 Subject: netfilter: nfnetlink: add and use nfnetlink_broadcast This removes the only reference to net->nfnl outside of the nfnetlink module, which makes it possible to move net->nfnl to the net_generic infrastructure. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 2 ++ net/netfilter/nfnetlink.c | 7 +++++++ net/netfilter/nfnetlink_acct.c | 3 +-- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 791d516e1e88..d4c14257db5d 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -51,6 +51,8 @@ int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid); +void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, + __u32 group, gfp_t allocation); static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index d3df66a39b5e..06e106b3ed85 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -178,6 +178,13 @@ int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid) } EXPORT_SYMBOL_GPL(nfnetlink_unicast); +void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, + __u32 group, gfp_t allocation) +{ + netlink_broadcast(net->nfnl, skb, portid, group, allocation); +} +EXPORT_SYMBOL_GPL(nfnetlink_broadcast); + /* Process one complete nfnetlink message.
*/ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index bb930f3b06c7..6895f31c5fbb 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -469,8 +469,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct) kfree_skb(skb); return; } - netlink_broadcast(net->nfnl, skb, 0, NFNLGRP_ACCT_QUOTA, - GFP_ATOMIC); + nfnetlink_broadcast(net, skb, 0, NFNLGRP_ACCT_QUOTA, GFP_ATOMIC); } int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct) -- cgit v1.2.3 From 8b0adbe3e38dbe5aae9edf6f5159ffdca7cfbdf1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 1 Apr 2021 16:11:07 +0200 Subject: netfilter: nf_defrag_ipv6: use net_generic infra This allows followup patch to remove these members from struct net. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv6/nf_defrag_ipv6.h | 6 +++ net/ipv6/netfilter/nf_conntrack_reasm.c | 68 +++++++++++++++-------------- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 15 ++++--- 3 files changed, 52 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index 6d31cd041143..ece923e2035b 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -13,4 +13,10 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user); struct inet_frags_ctl; +struct nft_ct_frag6_pernet { + struct ctl_table_header *nf_frag_frags_hdr; + struct fqdir *fqdir; + unsigned int users; +}; + #endif /* _NF_DEFRAG_IPV6_H */ diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c129ad334eb3..a0108415275f 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -15,28 +15,13 @@ #include 
#include #include -#include -#include -#include #include -#include #include -#include #include -#include -#include #include -#include -#include #include -#include -#include -#include -#include -#include -#include #include #include #include @@ -44,11 +29,18 @@ #include #include #include +#include static const char nf_frags_cache_name[] = "nf-frags"; +unsigned int nf_frag_pernet_id __read_mostly; static struct inet_frags nf_frags; +static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net) +{ + return net_generic(net, nf_frag_pernet_id); +} + #ifdef CONFIG_SYSCTL static struct ctl_table nf_ct_frag6_sysctl_table[] = { @@ -75,6 +67,7 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = { static int nf_ct_frag6_sysctl_register(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag; struct ctl_table *table; struct ctl_table_header *hdr; @@ -86,18 +79,20 @@ static int nf_ct_frag6_sysctl_register(struct net *net) goto err_alloc; } - table[0].data = &net->nf_frag.fqdir->timeout; - table[1].data = &net->nf_frag.fqdir->low_thresh; - table[1].extra2 = &net->nf_frag.fqdir->high_thresh; - table[2].data = &net->nf_frag.fqdir->high_thresh; - table[2].extra1 = &net->nf_frag.fqdir->low_thresh; - table[2].extra2 = &init_net.nf_frag.fqdir->high_thresh; + nf_frag = nf_frag_pernet(net); + + table[0].data = &nf_frag->fqdir->timeout; + table[1].data = &nf_frag->fqdir->low_thresh; + table[1].extra2 = &nf_frag->fqdir->high_thresh; + table[2].data = &nf_frag->fqdir->high_thresh; + table[2].extra1 = &nf_frag->fqdir->low_thresh; + table[2].extra2 = &nf_frag->fqdir->high_thresh; hdr = register_net_sysctl(net, "net/netfilter", table); if (hdr == NULL) goto err_reg; - net->nf_frag_frags_hdr = hdr; + nf_frag->nf_frag_frags_hdr = hdr; return 0; err_reg: @@ -109,10 +104,11 @@ err_alloc: static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); struct ctl_table *table; - table = 
net->nf_frag_frags_hdr->ctl_table_arg; - unregister_net_sysctl_table(net->nf_frag_frags_hdr); + table = nf_frag->nf_frag_frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } @@ -149,6 +145,7 @@ static void nf_ct_frag6_expire(struct timer_list *t) static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, const struct ipv6hdr *hdr, int iif) { + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); struct frag_v6_compare_key key = { .id = id, .saddr = hdr->saddr, @@ -158,7 +155,7 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, }; struct inet_frag_queue *q; - q = inet_frag_find(net->nf_frag.fqdir, &key); + q = inet_frag_find(nf_frag->fqdir, &key); if (!q) return NULL; @@ -495,37 +492,44 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); static int nf_ct_net_init(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); int res; - res = fqdir_init(&net->nf_frag.fqdir, &nf_frags, net); + res = fqdir_init(&nf_frag->fqdir, &nf_frags, net); if (res < 0) return res; - net->nf_frag.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; - net->nf_frag.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; - net->nf_frag.fqdir->timeout = IPV6_FRAG_TIMEOUT; + nf_frag->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; + nf_frag->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; + nf_frag->fqdir->timeout = IPV6_FRAG_TIMEOUT; res = nf_ct_frag6_sysctl_register(net); if (res < 0) - fqdir_exit(net->nf_frag.fqdir); + fqdir_exit(nf_frag->fqdir); return res; } static void nf_ct_net_pre_exit(struct net *net) { - fqdir_pre_exit(net->nf_frag.fqdir); + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); + + fqdir_pre_exit(nf_frag->fqdir); } static void nf_ct_net_exit(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net); + nf_ct_frags6_sysctl_unregister(net); - fqdir_exit(net->nf_frag.fqdir); + fqdir_exit(nf_frag->fqdir); } static struct pernet_operations 
nf_ct_net_ops = { .init = nf_ct_net_init, .pre_exit = nf_ct_net_pre_exit, .exit = nf_ct_net_exit, + .id = &nf_frag_pernet_id, + .size = sizeof(struct nft_ct_frag6_pernet), }; static const struct rhashtable_params nfct_rhash_params = { diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 6646a87fb5dc..402dc4ca9504 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -25,6 +25,8 @@ #include #include +extern unsigned int nf_frag_pernet_id; + static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, @@ -89,10 +91,12 @@ static const struct nf_hook_ops ipv6_defrag_ops[] = { static void __net_exit defrag6_net_exit(struct net *net) { - if (net->nf.defrag_ipv6) { + struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); + + if (nf_frag->users) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); - net->nf.defrag_ipv6 = false; + nf_frag->users = 0; } } @@ -130,21 +134,22 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv6_enable(struct net *net) { + struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); int err = 0; might_sleep(); - if (net->nf.defrag_ipv6) + if (nf_frag->users) return 0; mutex_lock(&defrag6_mutex); - if (net->nf.defrag_ipv6) + if (nf_frag->users) goto out_unlock; err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) - net->nf.defrag_ipv6 = true; + nf_frag->users = 1; out_unlock: mutex_unlock(&defrag6_mutex); -- cgit v1.2.3 From 0854db2aaef3fcdd3498a9d299c60adea2aa3dc6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 1 Apr 2021 16:11:10 +0200 Subject: netfilter: nf_tables: use net_generic infra for transaction data This moves all nf_tables pernet data from struct net to a net_generic extension, with the exception of the gencursor. 
The latter is used in the data path and also outside of the nf_tables core. All others are only used from the configuration plane. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 11 ++ net/netfilter/nf_tables_api.c | 313 ++++++++++++++++++++++++-------------- net/netfilter/nf_tables_offload.c | 30 ++-- net/netfilter/nft_chain_filter.c | 11 +- net/netfilter/nft_dynset.c | 6 +- 5 files changed, 243 insertions(+), 128 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 8fefa112ae89..f0f7a3c5da6a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1567,4 +1567,15 @@ __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); #else static inline int nft_request_module(struct net *net, const char *fmt, ...) { return -ENOENT; } #endif + +struct nftables_pernet { + struct list_head tables; + struct list_head commit_list; + struct list_head module_list; + struct list_head notify_list; + struct mutex commit_mutex; + unsigned int base_seq; + u8 validate_state; +}; + #endif /* _NET_NF_TABLES_H */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a24de59e6c69..1b881a84bd01 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -21,10 +21,13 @@ #include #include #include +#include #include #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) +unsigned int nf_tables_net_id __read_mostly; + static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); @@ -103,7 +106,9 @@ static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types static void nft_validate_state_update(struct net *net, u8 new_validate_state) { - switch (net->nft.validate_state) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + switch 
(nft_net->validate_state) { case NFT_VALIDATE_SKIP: WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO); break; @@ -114,7 +119,7 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) return; } - net->nft.validate_state = new_validate_state; + nft_net->validate_state = new_validate_state; } static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); @@ -169,13 +174,15 @@ static void nft_trans_destroy(struct nft_trans *trans) static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) { + struct nftables_pernet *nft_net; struct net *net = ctx->net; struct nft_trans *trans; if (!nft_set_is_anonymous(set)) return; - list_for_each_entry_reverse(trans, &net->nft.commit_list, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWSET: if (nft_trans_set(trans) == set) @@ -269,6 +276,14 @@ static void nf_tables_unregister_hook(struct net *net, nf_unregister_net_hook(net, &basechain->ops); } +static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) +{ + struct nftables_pernet *nft_net; + + nft_net = net_generic(net, nf_tables_net_id); + list_add_tail(&trans->list, &nft_net->commit_list); +} + static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) { struct nft_trans *trans; @@ -280,7 +295,7 @@ static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type) if (msg_type == NFT_MSG_NEWTABLE) nft_activate_next(ctx->net, ctx->table); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -313,7 +328,7 @@ static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type) } } - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } @@ -386,7 +401,7 @@ static 
struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type, ntohl(nla_get_be32(ctx->nla[NFTA_RULE_ID])); } nft_trans_rule(trans) = rule; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return trans; } @@ -452,7 +467,7 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, set); } nft_trans_set(trans) = set; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -484,7 +499,7 @@ static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, obj); nft_trans_obj(trans) = obj; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -517,7 +532,7 @@ static int nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type, nft_activate_next(ctx->net, flowtable); nft_trans_flowtable(trans) = flowtable; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; } @@ -545,13 +560,15 @@ static struct nft_table *nft_table_lookup(const struct net *net, const struct nlattr *nla, u8 family, u8 genmask, u32 nlpid) { + struct nftables_pernet *nft_net; struct nft_table *table; if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(table, &net->nft.tables, list, - lockdep_is_held(&net->nft.commit_mutex)) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry_rcu(table, &nft_net->tables, list, + lockdep_is_held(&nft_net->commit_mutex)) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) { @@ -570,9 +587,11 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, const struct nlattr *nla, u8 genmask) { + struct nftables_pernet *nft_net; struct nft_table *table; - list_for_each_entry(table, &net->nft.tables, list) { + nft_net = net_generic(net, 
nf_tables_net_id); + list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && nft_active_genmask(table, genmask)) return table; @@ -625,6 +644,7 @@ __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...) { char module_name[MODULE_NAME_LEN]; + struct nftables_pernet *nft_net; struct nft_module_request *req; va_list args; int ret; @@ -635,7 +655,8 @@ __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, if (ret >= MODULE_NAME_LEN) return 0; - list_for_each_entry(req, &net->nft.module_list, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry(req, &nft_net->module_list, list) { if (!strcmp(req->module, module_name)) { if (req->done) return 0; @@ -651,7 +672,7 @@ __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, req->done = false; strlcpy(req->module, module_name, MODULE_NAME_LEN); - list_add_tail(&req->list, &net->nft.module_list); + list_add_tail(&req->list, &nft_net->module_list); return -EAGAIN; } @@ -690,7 +711,9 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, static __be16 nft_base_seq(const struct net *net) { - return htons(net->nft.base_seq & 0xffff); + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + return htons(nft_net->base_seq & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -751,6 +774,7 @@ static void nft_notify_enqueue(struct sk_buff *skb, bool report, static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { + struct nftables_pernet *nft_net; struct sk_buff *skb; int err; @@ -769,7 +793,8 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_net = net_generic(ctx->net, nf_tables_net_id); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, 
NFNLGRP_NFTABLES, -ENOBUFS); @@ -779,15 +804,17 @@ static int nf_tables_dump_tables(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); + struct nftables_pernet *nft_net; const struct nft_table *table; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -972,7 +999,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) nft_trans_table_flags(trans) = flags; nft_trans_table_update(trans) = true; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err: nft_trans_destroy(trans); @@ -1035,6 +1062,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -1044,7 +1072,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, u32 flags = 0; int err; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); attr = nla[NFTA_TABLE_NAME]; table = nft_table_lookup(net, attr, family, genmask, NETLINK_CB(skb).portid); @@ -1105,7 +1133,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (err < 0) goto err_trans; - list_add_tail_rcu(&table->list, &net->nft.tables); + list_add_tail_rcu(&table->list, &nft_net->tables); return 0; err_trans: rhltable_destroy(&table->chains_ht); @@ -1193,11 +1221,12 @@ out: static int nft_flush(struct nft_ctx *ctx, int 
family) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct nft_table *table, *nt; const struct nlattr * const *nla = ctx->nla; int err = 0; - list_for_each_entry_safe(table, nt, &ctx->net->nft.tables, list) { + list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (family != AF_UNSPEC && table->family != family) continue; @@ -1316,7 +1345,9 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask) static bool lockdep_commit_lock_is_held(const struct net *net) { #ifdef CONFIG_PROVE_LOCKING - return lockdep_is_held(&net->nft.commit_mutex); + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + return lockdep_is_held(&nft_net->commit_mutex); #else return true; #endif @@ -1519,6 +1550,7 @@ nla_put_failure: static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { + struct nftables_pernet *nft_net; struct sk_buff *skb; int err; @@ -1538,7 +1570,8 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_net = net_generic(ctx->net, nf_tables_net_id); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -1553,11 +1586,13 @@ static int nf_tables_dump_chains(struct sk_buff *skb, unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -1873,11 +1908,12 @@ static int nft_chain_parse_hook(struct net *net, struct nft_chain_hook *hook, u8 family, bool autoload) { + struct 
nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; int err; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX, @@ -2266,6 +2302,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct nft_trans *tmp; char *name; @@ -2275,7 +2312,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, goto err; err = -EEXIST; - list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) { + list_for_each_entry(tmp, &nft_net->commit_list, list) { if (tmp->msg_type == NFT_MSG_NEWCHAIN && tmp->ctx.table == table && nft_trans_chain_update(tmp) && @@ -2289,7 +2326,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nft_trans_chain_name(trans) = name; } - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err: @@ -2301,10 +2338,11 @@ err: static struct nft_chain *nft_chain_lookup_byid(const struct net *net, const struct nlattr *nla) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { struct nft_chain *chain = trans->ctx.chain; if (trans->msg_type == NFT_MSG_NEWCHAIN && @@ -2319,6 +2357,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = 
nfmsg->nfgen_family; @@ -2330,7 +2369,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, u64 handle = 0; u32 flags = 0; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, NETLINK_CB(skb).portid); @@ -2866,6 +2905,7 @@ nla_put_failure: static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct sk_buff *skb; int err; @@ -2885,7 +2925,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -2943,11 +2983,13 @@ static int nf_tables_dump_rules(struct sk_buff *skb, unsigned int idx = 0; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -3178,6 +3220,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); struct nft_expr_info *info = NULL; @@ -3195,7 +3238,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, int err, rem; u64 handle, pos_handle; - lockdep_assert_held(&net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); table = nft_table_lookup(net, 
nla[NFTA_RULE_TABLE], family, genmask, NETLINK_CB(skb).portid); @@ -3367,7 +3410,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, kvfree(info); chain->use++; - if (net->nft.validate_state == NFT_VALIDATE_DO) + if (nft_net->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, table); if (chain->flags & NFT_CHAIN_HW_OFFLOAD) { @@ -3396,10 +3439,11 @@ err1: static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nlattr *nla) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { struct nft_rule *rule = nft_trans_rule(trans); if (trans->msg_type == NFT_MSG_NEWRULE && @@ -3512,13 +3556,14 @@ nft_select_set_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, enum nft_set_policies policy) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; u32 flags = 0; int i; - lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); if (nla[NFTA_SET_FLAGS] != NULL) @@ -3656,10 +3701,11 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, static struct nft_set *nft_set_lookup_byid(const struct net *net, const struct nlattr *nla, u8 genmask) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWSET) { struct nft_set *set = nft_trans_set(trans); @@ -3893,6 +3939,7 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, const struct nft_set *set, int event, 
gfp_t gfp_flags) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct sk_buff *skb; u32 portid = ctx->portid; int err; @@ -3911,7 +3958,7 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -3924,14 +3971,16 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2]; struct net *net = sock_net(skb->sk); struct nft_ctx *ctx = cb->data, ctx_set; + struct nftables_pernet *nft_net; if (cb->args[1]) return skb->len; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && ctx->family != table->family) continue; @@ -4770,6 +4819,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; struct net *net = sock_net(skb->sk); + struct nftables_pernet *nft_net; struct nft_table *table; struct nft_set *set; struct nft_set_dump_args args; @@ -4780,7 +4830,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) int event; rcu_read_lock(); - list_for_each_entry_rcu(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && dump_ctx->ctx.family != table->family) continue; @@ -5064,6 +5115,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set_elem *elem, int event, u16 flags) { + struct nftables_pernet *nft_net; struct net *net = ctx->net; u32 
portid = ctx->portid; struct sk_buff *skb; @@ -5083,7 +5135,8 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_net = net_generic(net, nf_tables_net_id); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -5551,7 +5604,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } nft_trans_elem(trans) = elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err_set_full: @@ -5582,6 +5635,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; @@ -5610,7 +5664,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return err; } - if (net->nft.validate_state == NFT_VALIDATE_DO) + if (nft_net->validate_state == NFT_VALIDATE_DO) return nft_table_validate(net, ctx.table); return 0; @@ -5746,7 +5800,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_set_elem_deactivate(ctx->net, set, &elem); nft_trans_elem(trans) = elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; fail_ops: @@ -5780,7 +5834,7 @@ static int nft_flush_set(const struct nft_ctx *ctx, nft_set_elem_deactivate(ctx->net, set, elem); nft_trans_elem_set(trans) = set; nft_trans_elem(trans) = *elem; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; err1: @@ -6074,7 +6128,7 @@ static int nf_tables_updobj(const struct nft_ctx *ctx, nft_trans_obj(trans) = obj; nft_trans_obj_update(trans) = true; nft_trans_obj_newobj(trans) 
= newobj; - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; @@ -6236,6 +6290,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) struct nft_obj_filter *filter = cb->data; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; + struct nftables_pernet *nft_net; struct nft_object *obj; bool reset = false; @@ -6243,9 +6298,10 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) reset = true; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -6268,7 +6324,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, - net->nft.base_seq); + nft_net->base_seq); audit_log_nfcfg(buf, family, @@ -6389,8 +6445,11 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, reset = true; if (reset) { - char *buf = kasprintf(GFP_ATOMIC, "%s:%u", - table->name, net->nft.base_seq); + const struct nftables_pernet *nft_net; + char *buf; + + nft_net = net_generic(net, nf_tables_net_id); + buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq); audit_log_nfcfg(buf, family, @@ -6476,10 +6535,11 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, int family, int report, gfp_t gfp) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct sk_buff *skb; int err; char *buf = kasprintf(gfp, "%s:%u", - table->name, net->nft.base_seq); + table->name, nft_net->base_seq); audit_log_nfcfg(buf, family, @@ -6505,7 +6565,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, goto 
err; } - nft_notify_enqueue(skb, report, &net->nft.notify_list); + nft_notify_enqueue(skb, report, &nft_net->notify_list); return; err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -6837,7 +6897,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans)); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; @@ -7025,7 +7085,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); nft_flowtable_hook_release(&flowtable_hook); - list_add_tail(&trans->list, &ctx->net->nft.commit_list); + nft_trans_commit_list_add_tail(ctx->net, trans); return 0; @@ -7157,12 +7217,14 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; + struct nftables_pernet *nft_net; const struct nft_table *table; rcu_read_lock(); - cb->seq = net->nft.base_seq; + nft_net = net_generic(net, nf_tables_net_id); + cb->seq = nft_net->base_seq; - list_for_each_entry_rcu(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) continue; @@ -7297,6 +7359,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct list_head *hook_list, int event) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct sk_buff *skb; int err; @@ -7316,7 +7379,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, goto err; } - nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); + nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); @@ -7341,6 +7404,7 @@ static void 
nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); @@ -7350,7 +7414,7 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, if (!nlh) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -7385,6 +7449,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_flowtable *flowtable; + struct nftables_pernet *nft_net; struct nft_table *table; struct net *net; @@ -7392,13 +7457,14 @@ static int nf_tables_flowtable_event(struct notifier_block *this, return 0; net = dev_net(dev); - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { nft_flowtable_event(event, dev, flowtable); } } - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } @@ -7579,16 +7645,17 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { static int nf_tables_validate(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_table *table; - switch (net->nft.validate_state) { + switch (nft_net->validate_state) { case NFT_VALIDATE_SKIP: break; case NFT_VALIDATE_NEED: nft_validate_state_update(net, NFT_VALIDATE_DO); 
fallthrough; case NFT_VALIDATE_DO: - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_validate(net, table) < 0) return -EAGAIN; } @@ -7763,9 +7830,10 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha static void nf_tables_commit_chain_prepare_cancel(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { struct nft_chain *chain = trans->ctx.chain; if (trans->msg_type == NFT_MSG_NEWRULE || @@ -7874,10 +7942,11 @@ static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, static void nf_tables_module_autoload_cleanup(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_module_request *req, *next; - WARN_ON_ONCE(!list_empty(&net->nft.commit_list)); - list_for_each_entry_safe(req, next, &net->nft.module_list, list) { + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); + list_for_each_entry_safe(req, next, &nft_net->module_list, list) { WARN_ON_ONCE(!req->done); list_del(&req->list); kfree(req); @@ -7886,6 +7955,7 @@ static void nf_tables_module_autoload_cleanup(struct net *net) static void nf_tables_commit_release(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans; /* all side effects have to be made visible. @@ -7895,35 +7965,36 @@ static void nf_tables_commit_release(struct net *net) * Memory reclaim happens asynchronously from work queue * to prevent expensive synchronize_rcu() in commit phase. 
*/ - if (list_empty(&net->nft.commit_list)) { + if (list_empty(&nft_net->commit_list)) { nf_tables_module_autoload_cleanup(net); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return; } - trans = list_last_entry(&net->nft.commit_list, + trans = list_last_entry(&nft_net->commit_list, struct nft_trans, list); get_net(trans->ctx.net); WARN_ON_ONCE(trans->put_net); trans->put_net = true; spin_lock(&nf_tables_destroy_list_lock); - list_splice_tail_init(&net->nft.commit_list, &nf_tables_destroy_list); + list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list); spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); schedule_work(&trans_destroy_work); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); } static void nft_commit_notify(struct net *net, u32 portid) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct sk_buff *batch_skb = NULL, *nskb, *skb; unsigned char *data; int len; - list_for_each_entry_safe(skb, nskb, &net->nft.notify_list, list) { + list_for_each_entry_safe(skb, nskb, &nft_net->notify_list, list) { if (!batch_skb) { new_batch: batch_skb = skb; @@ -7949,7 +8020,7 @@ new_batch: NFT_CB(batch_skb).report, GFP_KERNEL); } - WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); + WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } static int nf_tables_commit_audit_alloc(struct list_head *adl, @@ -8005,6 +8076,7 @@ static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation) static int nf_tables_commit(struct net *net, struct sk_buff *skb) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_chain *chain; @@ -8012,8 +8084,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) LIST_HEAD(adl); int err; - if (list_empty(&net->nft.commit_list)) { - mutex_unlock(&net->nft.commit_mutex); + if 
(list_empty(&nft_net->commit_list)) { + mutex_unlock(&nft_net->commit_mutex); return 0; } @@ -8026,7 +8098,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) return err; /* 1. Allocate space for next generation rules_gen_X[] */ - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { int ret; ret = nf_tables_commit_audit_alloc(&adl, trans->ctx.table); @@ -8047,7 +8119,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) } /* step 2. Make rules_gen_X visible to packet path */ - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(chain, &table->chains, list) nf_tables_commit_chain(net, chain); } @@ -8056,12 +8128,13 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. */ - while (++net->nft.base_seq == 0); + while (++nft_net->base_seq == 0) + ; /* step 3. Start new generation, rules_gen_X now in use. 
*/ net->nft.gencursor = nft_gencursor_next(net); - list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { nf_tables_commit_audit_collect(&adl, trans->ctx.table, trans->msg_type); switch (trans->msg_type) { @@ -8216,7 +8289,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - nf_tables_commit_audit_log(&adl, net->nft.base_seq); + nf_tables_commit_audit_log(&adl, nft_net->base_seq); nf_tables_commit_release(net); return 0; @@ -8224,17 +8297,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) static void nf_tables_module_autoload(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_module_request *req, *next; LIST_HEAD(module_list); - list_splice_init(&net->nft.module_list, &module_list); - mutex_unlock(&net->nft.commit_mutex); + list_splice_init(&nft_net->module_list, &module_list); + mutex_unlock(&nft_net->commit_mutex); list_for_each_entry_safe(req, next, &module_list, list) { request_module("%s", req->module); req->done = true; } - mutex_lock(&net->nft.commit_mutex); - list_splice(&module_list, &net->nft.module_list); + mutex_lock(&nft_net->commit_mutex); + list_splice(&module_list, &nft_net->module_list); } static void nf_tables_abort_release(struct nft_trans *trans) @@ -8271,6 +8345,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_hook *hook; @@ -8279,7 +8354,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) nf_tables_validate(net) < 0) return -EAGAIN; - list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, + 
list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWTABLE: @@ -8403,7 +8478,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) synchronize_rcu(); list_for_each_entry_safe_reverse(trans, next, - &net->nft.commit_list, list) { + &nft_net->commit_list, list) { list_del(&trans->list); nf_tables_abort_release(trans); } @@ -8424,22 +8499,24 @@ static void nf_tables_cleanup(struct net *net) static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); int ret = __nf_tables_abort(net, action); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return ret; } static bool nf_tables_valid_genid(struct net *net, u32 genid) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); bool genid_ok; - mutex_lock(&net->nft.commit_mutex); + mutex_lock(&nft_net->commit_mutex); - genid_ok = genid == 0 || net->nft.base_seq == genid; + genid_ok = genid == 0 || nft_net->base_seq == genid; if (!genid_ok) - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); /* else, commit mutex has to be released by commit or abort function */ return genid_ok; @@ -8994,9 +9071,10 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) static void __nft_release_hooks(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_table *table; - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table)) continue; @@ -9053,9 +9131,10 @@ static void __nft_release_table(struct net *net, struct nft_table *table) static void __nft_release_tables(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_table *table, *nt; - list_for_each_entry_safe(table, nt, 
&net->nft.tables, list) { + list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (nft_table_has_owner(table)) continue; @@ -9066,6 +9145,7 @@ static void __nft_release_tables(struct net *net) static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct nftables_pernet *nft_net; struct netlink_notify *n = ptr; struct nft_table *table, *nt; struct net *net = n->net; @@ -9074,8 +9154,9 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { __nft_release_hook(net, table); @@ -9084,13 +9165,13 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, } if (release) { synchronize_rcu(); - list_for_each_entry_safe(table, nt, &net->nft.tables, list) { + list_for_each_entry_safe(table, nt, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) __nft_release_table(net, table); } } - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } @@ -9101,13 +9182,15 @@ static struct notifier_block nft_nl_notifier = { static int __net_init nf_tables_init_net(struct net *net) { - INIT_LIST_HEAD(&net->nft.tables); - INIT_LIST_HEAD(&net->nft.commit_list); - INIT_LIST_HEAD(&net->nft.module_list); - INIT_LIST_HEAD(&net->nft.notify_list); - mutex_init(&net->nft.commit_mutex); - net->nft.base_seq = 1; - net->nft.validate_state = NFT_VALIDATE_SKIP; + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + INIT_LIST_HEAD(&nft_net->tables); + INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->module_list); + 
INIT_LIST_HEAD(&nft_net->notify_list); + mutex_init(&nft_net->commit_mutex); + nft_net->base_seq = 1; + nft_net->validate_state = NFT_VALIDATE_SKIP; return 0; } @@ -9119,20 +9202,24 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { - mutex_lock(&net->nft.commit_mutex); - if (!list_empty(&net->nft.commit_list)) + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + + mutex_lock(&nft_net->commit_mutex); + if (!list_empty(&nft_net->commit_list)) __nf_tables_abort(net, NFNL_ABORT_NONE); __nft_release_tables(net); - mutex_unlock(&net->nft.commit_mutex); - WARN_ON_ONCE(!list_empty(&net->nft.tables)); - WARN_ON_ONCE(!list_empty(&net->nft.module_list)); - WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); + mutex_unlock(&nft_net->commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->tables)); + WARN_ON_ONCE(!list_empty(&nft_net->module_list)); + WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .id = &nf_tables_net_id, + .size = sizeof(struct nftables_pernet), }; static int __init nf_tables_module_init(void) diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 9ae14270c543..43b56eff3b04 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -7,6 +7,8 @@ #include #include +extern unsigned int nf_tables_net_id; + static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; @@ -307,16 +309,18 @@ static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; + struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, 
dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); - mutex_lock(&net->nft.commit_mutex); + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, @@ -412,9 +416,10 @@ static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); int err = 0; - list_for_each_entry_continue_reverse(trans, &net->nft.commit_list, list) { + list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; @@ -460,11 +465,12 @@ static void nft_flow_rule_offload_abort(struct net *net, int nft_flow_rule_offload_commit(struct net *net) { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); struct nft_trans *trans; int err = 0; u8 policy; - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; @@ -516,7 +522,7 @@ int nft_flow_rule_offload_commit(struct net *net) } } - list_for_each_entry(trans, &net->nft.commit_list, list) { + list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->ctx.family != NFPROTO_NETDEV) continue; @@ -536,15 +542,15 @@ int nft_flow_rule_offload_commit(struct net *net) return err; } -static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) +static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, + struct net_device *dev) { struct nft_base_chain *basechain; - struct net *net = dev_net(dev); struct nft_hook *hook, *found; const struct nft_table *table; struct 
nft_chain *chain; - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -576,19 +582,21 @@ static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); + nft_net = net_generic(net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_flow_block_chain(nft_base_chain(chain), dev, FLOW_BLOCK_UNBIND); - mutex_unlock(&net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index ff8528ad3dc6..7a9aa57b195b 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -10,6 +11,8 @@ #include #include +extern unsigned int nf_tables_net_id; + #ifdef CONFIG_NF_TABLES_IPV4 static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, @@ -355,6 +358,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; struct nft_table *table; struct nft_chain *chain, *nr; struct nft_ctx ctx = { @@ -365,8 +369,9 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - mutex_lock(&ctx.net->nft.commit_mutex); - list_for_each_entry(table, &ctx.net->nft.tables, list) { + nft_net = net_generic(ctx.net, nf_tables_net_id); + mutex_lock(&nft_net->commit_mutex); + list_for_each_entry(table, 
&nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; @@ -380,7 +385,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, nft_netdev_event(event, dev, &ctx); } } - mutex_unlock(&ctx.net->nft.commit_mutex); + mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index d44a70c11b3f..f9437a0dcfef 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -11,6 +11,9 @@ #include #include #include +#include + +extern unsigned int nf_tables_net_id; struct nft_dynset { struct nft_set *set; @@ -161,13 +164,14 @@ static int nft_dynset_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); struct nft_dynset *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; u64 timeout; int err, i; - lockdep_assert_held(&ctx->net->nft.commit_mutex); + lockdep_assert_held(&nft_net->commit_mutex); if (tb[NFTA_DYNSET_SET_NAME] == NULL || tb[NFTA_DYNSET_OP] == NULL || -- cgit v1.2.3 From 7b5974709faf7628a036d3f0f14d4f403f536eac Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 1 Apr 2021 16:11:12 +0200 Subject: netfilter: conntrack: move sysctl pointer to net_generic infra No need to keep this in struct net, place it in the net_generic data. The sysctl pointer is removed from struct net in a followup patch. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 3 +++ net/netfilter/nf_conntrack_standalone.c | 10 ++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 439379ca9ffa..ef405a134307 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -47,6 +47,9 @@ struct nf_conntrack_net { unsigned int users4; unsigned int users6; unsigned int users_bridge; +#ifdef CONFIG_SYSCTL + struct ctl_table_header *sysctl_header; +#endif }; #include diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0ee702d374b0..3f2cc7b04b20 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -1027,6 +1027,7 @@ static void nf_conntrack_standalone_init_gre_sysctl(struct net *net, static int nf_conntrack_standalone_init_sysctl(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct nf_udp_net *un = nf_udp_pernet(net); struct ctl_table *table; @@ -1072,8 +1073,8 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_BUCKETS].mode = 0444; } - net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table); - if (!net->ct.sysctl_header) + cnet->sysctl_header = register_net_sysctl(net, "net/netfilter", table); + if (!cnet->sysctl_header) goto out_unregister_netfilter; return 0; @@ -1085,10 +1086,11 @@ out_unregister_netfilter: static void nf_conntrack_standalone_fini_sysctl(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct ctl_table *table; - table = net->ct.sysctl_header->ctl_table_arg; - unregister_net_sysctl_table(net->ct.sysctl_header); + table = cnet->sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(cnet->sysctl_header); kfree(table); } #else -- 
cgit v1.2.3 From 1379940bf809ba643eb10950c932f72d0191aa43 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 1 Apr 2021 16:11:13 +0200 Subject: netfilter: conntrack: move ecache dwork to net_generic infra dwork struct is large (>128 byte) and not needed when conntrack module is not loaded. Place it in net_generic data instead. The struct net dwork member is now obsolete and will be removed in a followup patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 4 ++++ include/net/netfilter/nf_conntrack_ecache.h | 33 +++++++++++------------------ net/netfilter/nf_conntrack_core.c | 7 ++++-- net/netfilter/nf_conntrack_ecache.c | 31 ++++++++++++++++++++++----- 4 files changed, 47 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index ef405a134307..86d86c860ede 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -50,6 +50,10 @@ struct nf_conntrack_net { #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; #endif +#ifdef CONFIG_NF_CONNTRACK_EVENTS + struct delayed_work ecache_dwork; + struct netns_ct *ct_net; +#endif }; #include diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index eb81f9195e28..d00ba6048e44 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -171,12 +171,18 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp, u32 portid, int report); +void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state); + void nf_conntrack_ecache_pernet_init(struct net *net); void nf_conntrack_ecache_pernet_fini(struct net *net); int nf_conntrack_ecache_init(void); void nf_conntrack_ecache_fini(void); +static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) +{ + return 
net->ct.ecache_dwork_pending; +} #else /* CONFIG_NF_CONNTRACK_EVENTS */ static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e, @@ -186,6 +192,11 @@ static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e, { } +static inline void nf_conntrack_ecache_work(struct net *net, + enum nf_ct_ecache_state s) +{ +} + static inline void nf_conntrack_ecache_pernet_init(struct net *net) { } @@ -203,26 +214,6 @@ static inline void nf_conntrack_ecache_fini(void) { } +static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return false; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ - -static inline void nf_conntrack_ecache_delayed_work(struct net *net) -{ -#ifdef CONFIG_NF_CONNTRACK_EVENTS - if (!delayed_work_pending(&net->ct.ecache_dwork)) { - schedule_delayed_work(&net->ct.ecache_dwork, HZ); - net->ct.ecache_dwork_pending = true; - } -#endif -} - -static inline void nf_conntrack_ecache_work(struct net *net) -{ -#ifdef CONFIG_NF_CONNTRACK_EVENTS - if (net->ct.ecache_dwork_pending) { - net->ct.ecache_dwork_pending = false; - mod_delayed_work(system_wq, &net->ct.ecache_dwork, 0); - } -#endif -} - #endif /*_NF_CONNTRACK_ECACHE_H*/ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ff0168736f6e..ace3e8265e0a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -656,6 +656,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) { struct nf_conn_tstamp *tstamp; + struct net *net; if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) return false; @@ -670,11 +671,13 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) * be done by event cache worker on redelivery. 
*/ nf_ct_delete_from_lists(ct); - nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); + nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); return false; } - nf_conntrack_ecache_work(nf_ct_net(ct)); + net = nf_ct_net(ct); + if (nf_conntrack_ecache_dwork_pending(net)) + nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); nf_ct_delete_from_lists(ct); nf_ct_put(ct); return true; diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 7956c9f19899..759d87aef95f 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -27,6 +27,8 @@ #include #include +extern unsigned int nf_conntrack_net_id; + static DEFINE_MUTEX(nf_ct_ecache_mutex); #define ECACHE_RETRY_WAIT (HZ/10) @@ -96,8 +98,8 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu) static void ecache_work(struct work_struct *work) { - struct netns_ct *ctnet = - container_of(work, struct netns_ct, ecache_dwork.work); + struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache_dwork.work); + struct netns_ct *ctnet = cnet->ct_net; int cpu, delay = -1; struct ct_pcpu *pcpu; @@ -127,7 +129,7 @@ static void ecache_work(struct work_struct *work) ctnet->ecache_dwork_pending = delay > 0; if (delay >= 0) - schedule_delayed_work(&ctnet->ecache_dwork, delay); + schedule_delayed_work(&cnet->ecache_dwork, delay); } int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, @@ -344,6 +346,20 @@ void nf_ct_expect_unregister_notifier(struct net *net, } EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); +void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) +{ + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + if (state == NFCT_ECACHE_DESTROY_FAIL && + !delayed_work_pending(&cnet->ecache_dwork)) { + schedule_delayed_work(&cnet->ecache_dwork, HZ); + net->ct.ecache_dwork_pending = true; + } else if (state == NFCT_ECACHE_DESTROY_SENT) { 
+ net->ct.ecache_dwork_pending = false; + mod_delayed_work(system_wq, &cnet->ecache_dwork, 0); + } +} + #define NF_CT_EVENTS_DEFAULT 1 static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT; @@ -355,13 +371,18 @@ static const struct nf_ct_ext_type event_extend = { void nf_conntrack_ecache_pernet_init(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + net->ct.sysctl_events = nf_ct_events; - INIT_DELAYED_WORK(&net->ct.ecache_dwork, ecache_work); + cnet->ct_net = &net->ct; + INIT_DELAYED_WORK(&cnet->ecache_dwork, ecache_work); } void nf_conntrack_ecache_pernet_fini(struct net *net) { - cancel_delayed_work_sync(&net->ct.ecache_dwork); + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + cancel_delayed_work_sync(&cnet->ecache_dwork); } int nf_conntrack_ecache_init(void) -- cgit v1.2.3 From db3685b4046f8b629bbf73caa33751ce567ea8ff Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 1 Apr 2021 16:11:14 +0200 Subject: net: remove obsolete members from struct net all have been moved to generic_net infra. On x86_64, this reduces struct net size from 70 to 63 cache lines (4480 to 4032 byte). 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/net_namespace.h | 9 --------- include/net/netns/conntrack.h | 4 ---- include/net/netns/netfilter.h | 6 ------ include/net/netns/nftables.h | 7 ------- include/net/netns/x_tables.h | 1 - 5 files changed, 27 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index dcaee24a4d87..fdb16dc33703 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -142,15 +142,6 @@ struct net { #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE) struct netns_nftables nft; #endif -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - struct netns_nf_frag nf_frag; - struct ctl_table_header *nf_frag_frags_hdr; -#endif - struct sock *nfnl; - struct sock *nfnl_stash; -#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT) - struct list_head nfct_timeout_list; -#endif #endif #ifdef CONFIG_WEXT_CORE struct sk_buff_head wext_nlevents; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 806454e767bf..e5f664d69ead 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -96,13 +96,9 @@ struct netns_ct { atomic_t count; unsigned int expect_count; #ifdef CONFIG_NF_CONNTRACK_EVENTS - struct delayed_work ecache_dwork; bool ecache_dwork_pending; #endif bool auto_assign_helper_warned; -#ifdef CONFIG_SYSCTL - struct ctl_table_header *sysctl_header; -#endif unsigned int sysctl_log_invalid; /* Log invalid packets */ int sysctl_events; int sysctl_acct; diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index ca043342c0eb..15e2b13fb0c0 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -28,11 +28,5 @@ struct netns_nf { #if IS_ENABLED(CONFIG_DECNET) struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS]; #endif -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) - bool defrag_ipv4; -#endif -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - bool defrag_ipv6; -#endif }; 
#endif diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 6c0806bd8d1e..8c77832d0240 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -5,14 +5,7 @@ #include struct netns_nftables { - struct list_head tables; - struct list_head commit_list; - struct list_head module_list; - struct list_head notify_list; - struct mutex commit_mutex; - unsigned int base_seq; u8 gencursor; - u8 validate_state; }; #endif diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h index 9bc5a12fdbb0..83c8ea2e87a6 100644 --- a/include/net/netns/x_tables.h +++ b/include/net/netns/x_tables.h @@ -8,7 +8,6 @@ struct ebt_table; struct netns_xt { - struct list_head tables[NFPROTO_NUMPROTO]; bool notrack_deprecated_warning; bool clusterip_deprecated_warning; #if defined(CONFIG_BRIDGE_NF_EBTABLES) || \ -- cgit v1.2.3 From f67743f9e03a67dbbf931d1787e6faf50766e521 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 6 Apr 2021 21:55:52 +0200 Subject: Bluetooth: Add support for reading AOSP vendor capabilities When drivers indicate support for AOSP vendor extension, initialize them and read its capabilities. 
Signed-off-by: Marcel Holtmann Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 11 +++++++++++ net/bluetooth/Kconfig | 7 +++++++ net/bluetooth/Makefile | 1 + net/bluetooth/aosp.c | 35 +++++++++++++++++++++++++++++++++++ net/bluetooth/aosp.h | 16 ++++++++++++++++ net/bluetooth/hci_core.c | 3 +++ 6 files changed, 73 insertions(+) create mode 100644 net/bluetooth/aosp.c create mode 100644 net/bluetooth/aosp.h (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ca4ac6603b9a..aa2879a3b0dd 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -586,6 +586,10 @@ struct hci_dev { void *msft_data; #endif +#if IS_ENABLED(CONFIG_BT_AOSPEXT) + bool aosp_capable; +#endif + int (*open)(struct hci_dev *hdev); int (*close)(struct hci_dev *hdev); int (*flush)(struct hci_dev *hdev); @@ -1239,6 +1243,13 @@ static inline void hci_set_msft_opcode(struct hci_dev *hdev, __u16 opcode) #endif } +static inline void hci_set_aosp_capable(struct hci_dev *hdev) +{ +#if IS_ENABLED(CONFIG_BT_AOSPEXT) + hdev->aosp_capable = true; +#endif +} + int hci_dev_open(__u16 dev); int hci_dev_close(__u16 dev); int hci_dev_do_close(struct hci_dev *hdev); diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 400c5130dc0a..e0ab4cd7afc3 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -99,6 +99,13 @@ config BT_MSFTEXT This options enables support for the Microsoft defined HCI vendor extensions. +config BT_AOSPEXT + bool "Enable Android Open Source Project extensions" + depends on BT + help + This options enables support for the Android Open Source + Project defined HCI vendor extensions. 
+ config BT_DEBUGFS bool "Export Bluetooth internals in debugfs" depends on BT && DEBUG_FS diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 1c645fba8c49..cc0995301f93 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -20,5 +20,6 @@ bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o bluetooth-$(CONFIG_BT_LEDS) += leds.o bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o +bluetooth-$(CONFIG_BT_AOSPEXT) += aosp.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/net/bluetooth/aosp.c b/net/bluetooth/aosp.c new file mode 100644 index 000000000000..a1b7762335a5 --- /dev/null +++ b/net/bluetooth/aosp.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Intel Corporation + */ + +#include +#include + +#include "aosp.h" + +void aosp_do_open(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + if (!hdev->aosp_capable) + return; + + bt_dev_dbg(hdev, "Initialize AOSP extension"); + + /* LE Get Vendor Capabilities Command */ + skb = __hci_cmd_sync(hdev, hci_opcode_pack(0x3f, 0x153), 0, NULL, + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) + return; + + kfree_skb(skb); +} + +void aosp_do_close(struct hci_dev *hdev) +{ + if (!hdev->aosp_capable) + return; + + bt_dev_dbg(hdev, "Cleanup of AOSP extension"); +} diff --git a/net/bluetooth/aosp.h b/net/bluetooth/aosp.h new file mode 100644 index 000000000000..328fc6d39f70 --- /dev/null +++ b/net/bluetooth/aosp.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Intel Corporation + */ + +#if IS_ENABLED(CONFIG_BT_AOSPEXT) + +void aosp_do_open(struct hci_dev *hdev); +void aosp_do_close(struct hci_dev *hdev); + +#else + +static inline void aosp_do_open(struct hci_dev *hdev) {} +static inline void aosp_do_close(struct hci_dev *hdev) {} + +#endif diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b0d9c36acc03..0da9b3274986 100644 --- 
a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -44,6 +44,7 @@ #include "smp.h" #include "leds.h" #include "msft.h" +#include "aosp.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -1586,6 +1587,7 @@ setup_failed: ret = hdev->set_diag(hdev, true); msft_do_open(hdev); + aosp_do_open(hdev); clear_bit(HCI_INIT, &hdev->flags); @@ -1782,6 +1784,7 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_DOWN); + aosp_do_close(hdev); msft_do_close(hdev); if (hdev->flush) -- cgit v1.2.3 From 77651900cede54930cd8a039dcd4583bfa308807 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 5 Apr 2021 16:13:41 -0700 Subject: usbnet: add _mii suffix to usbnet_set/get_link_ksettings The generic functions assumed devices provided an MDIO interface (accessed via older mii code, not phylib). This is true only for genuine ethernet. Devices with a higher level of abstraction or based on different technologies do not have MDIO. To support this case, first rename the existing functions with _mii suffix. v2: rebased on changed upstream v3: changed names to clearly say that this does NOT use phylib v4: moved hunks to correct patch; reworded commit messages Signed-off-by: Oliver Neukum Tested-by: Roland Dreier Reviewed-by: Grant Grundler Tested-by: Grant Grundler Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/usb/asix_devices.c | 12 ++++++------ drivers/net/usb/cdc_ncm.c | 4 ++-- drivers/net/usb/dm9601.c | 4 ++-- drivers/net/usb/mcs7830.c | 4 ++-- drivers/net/usb/sierra_net.c | 4 ++-- drivers/net/usb/smsc75xx.c | 4 ++-- drivers/net/usb/sr9700.c | 4 ++-- drivers/net/usb/sr9800.c | 4 ++-- drivers/net/usb/usbnet.c | 15 +++++++++------ include/linux/usb/usbnet.h | 4 ++-- 10 files changed, 31 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index 6e13d8165852..19a8fafb8f04 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -125,8 +125,8 @@ static const struct ethtool_ops ax88172_ethtool_ops = { .get_eeprom = asix_get_eeprom, .set_eeprom = asix_set_eeprom, .nway_reset = usbnet_nway_reset, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static void ax88172_set_multicast(struct net_device *net) @@ -291,8 +291,8 @@ static const struct ethtool_ops ax88772_ethtool_ops = { .get_eeprom = asix_get_eeprom, .set_eeprom = asix_set_eeprom, .nway_reset = usbnet_nway_reset, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static int ax88772_link_reset(struct usbnet *dev) @@ -782,8 +782,8 @@ static const struct ethtool_ops ax88178_ethtool_ops = { .get_eeprom = asix_get_eeprom, .set_eeprom = asix_set_eeprom, .nway_reset = usbnet_nway_reset, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static int marvell_phy_init(struct usbnet *dev) diff --git 
a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 8ae565a801b5..04f3851dd48b 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -142,8 +142,8 @@ static const struct ethtool_ops cdc_ncm_ethtool_ops = { .get_sset_count = cdc_ncm_get_sset_count, .get_strings = cdc_ncm_get_strings, .get_ethtool_stats = cdc_ncm_get_ethtool_stats, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static u32 cdc_ncm_check_rx_max(struct usbnet *dev, u32 new_rx) diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c index b5d2ac55a874..89cc61d7a675 100644 --- a/drivers/net/usb/dm9601.c +++ b/drivers/net/usb/dm9601.c @@ -282,8 +282,8 @@ static const struct ethtool_ops dm9601_ethtool_ops = { .get_eeprom_len = dm9601_get_eeprom_len, .get_eeprom = dm9601_get_eeprom, .nway_reset = usbnet_nway_reset, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static void dm9601_set_multicast(struct net_device *net) diff --git a/drivers/net/usb/mcs7830.c b/drivers/net/usb/mcs7830.c index fc512b780d15..9f9352a4522f 100644 --- a/drivers/net/usb/mcs7830.c +++ b/drivers/net/usb/mcs7830.c @@ -452,8 +452,8 @@ static const struct ethtool_ops mcs7830_ethtool_ops = { .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .nway_reset = usbnet_nway_reset, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static const struct net_device_ops mcs7830_netdev_ops = { diff --git a/drivers/net/usb/sierra_net.c b/drivers/net/usb/sierra_net.c index 55a244eca5ca..55025202dc4f 
100644 --- a/drivers/net/usb/sierra_net.c +++ b/drivers/net/usb/sierra_net.c @@ -629,8 +629,8 @@ static const struct ethtool_ops sierra_net_ethtool_ops = { .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .nway_reset = usbnet_nway_reset, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static int sierra_net_get_fw_attr(struct usbnet *dev, u16 *datap) diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c index 4353b370249f..f8cdabb9ef5a 100644 --- a/drivers/net/usb/smsc75xx.c +++ b/drivers/net/usb/smsc75xx.c @@ -741,8 +741,8 @@ static const struct ethtool_ops smsc75xx_ethtool_ops = { .set_eeprom = smsc75xx_ethtool_set_eeprom, .get_wol = smsc75xx_ethtool_get_wol, .set_wol = smsc75xx_ethtool_set_wol, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static int smsc75xx_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd) diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c index 878557ad03ad..ce29261263cd 100644 --- a/drivers/net/usb/sr9700.c +++ b/drivers/net/usb/sr9700.c @@ -250,8 +250,8 @@ static const struct ethtool_ops sr9700_ethtool_ops = { .get_eeprom_len = sr9700_get_eeprom_len, .get_eeprom = sr9700_get_eeprom, .nway_reset = usbnet_nway_reset, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static void sr9700_set_multicast(struct net_device *netdev) diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c index da56735d7755..a822d81310d5 100644 --- a/drivers/net/usb/sr9800.c +++ b/drivers/net/usb/sr9800.c 
@@ -527,8 +527,8 @@ static const struct ethtool_ops sr9800_ethtool_ops = { .get_eeprom_len = sr_get_eeprom_len, .get_eeprom = sr_get_eeprom, .nway_reset = usbnet_nway_reset, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, }; static int sr9800_link_reset(struct usbnet *dev) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index f4f37ecfed58..5b4629c80b4b 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -944,7 +944,10 @@ EXPORT_SYMBOL_GPL(usbnet_open); * they'll probably want to use this base set. */ -int usbnet_get_link_ksettings(struct net_device *net, +/* These methods are written on the assumption that the device + * uses MII + */ +int usbnet_get_link_ksettings_mii(struct net_device *net, struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); @@ -956,9 +959,9 @@ int usbnet_get_link_ksettings(struct net_device *net, return 0; } -EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings); +EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_mii); -int usbnet_set_link_ksettings(struct net_device *net, +int usbnet_set_link_ksettings_mii(struct net_device *net, const struct ethtool_link_ksettings *cmd) { struct usbnet *dev = netdev_priv(net); @@ -978,7 +981,7 @@ int usbnet_set_link_ksettings(struct net_device *net, return retval; } -EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings); +EXPORT_SYMBOL_GPL(usbnet_set_link_ksettings_mii); u32 usbnet_get_link (struct net_device *net) { @@ -1043,8 +1046,8 @@ static const struct ethtool_ops usbnet_ethtool_ops = { .get_msglevel = usbnet_get_msglevel, .set_msglevel = usbnet_set_msglevel, .get_ts_info = ethtool_op_get_ts_info, - .get_link_ksettings = usbnet_get_link_ksettings, - .set_link_ksettings = usbnet_set_link_ksettings, + .get_link_ksettings = usbnet_get_link_ksettings_mii, + .set_link_ksettings = usbnet_set_link_ksettings_mii, 
}; /*-------------------------------------------------------------------------*/ diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index cfbfd6fe01df..a89e1452107d 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -267,9 +267,9 @@ extern void usbnet_pause_rx(struct usbnet *); extern void usbnet_resume_rx(struct usbnet *); extern void usbnet_purge_paused_rxq(struct usbnet *); -extern int usbnet_get_link_ksettings(struct net_device *net, +extern int usbnet_get_link_ksettings_mii(struct net_device *net, struct ethtool_link_ksettings *cmd); -extern int usbnet_set_link_ksettings(struct net_device *net, +extern int usbnet_set_link_ksettings_mii(struct net_device *net, const struct ethtool_link_ksettings *cmd); extern u32 usbnet_get_link(struct net_device *net); extern u32 usbnet_get_msglevel(struct net_device *); -- cgit v1.2.3 From 956baa99571bbaf88f3e91190dfb498c685b0e21 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Mon, 5 Apr 2021 16:13:42 -0700 Subject: usbnet: add method for reporting speed without MII The old method for reporting link speed assumed a driver uses the generic phy (mii) MDIO read/write functions. CDC devices don't expose the phy. Add a primitive internal version reporting back directly what the CDC notification/status operations recorded. v2: rebased on upstream v3: changed names and made clear which units are used v4: moved hunks to correct patch; rewrote commit messages Signed-off-by: Oliver Neukum Tested-by: Roland Dreier Reviewed-by: Grant Grundler Tested-by: Grant Grundler Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/usb/usbnet.c | 23 +++++++++++++++++++++++ include/linux/usb/usbnet.h | 7 +++++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 5b4629c80b4b..ecf62849f4c1 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -961,6 +961,27 @@ int usbnet_get_link_ksettings_mii(struct net_device *net, } EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_mii); +int usbnet_get_link_ksettings_internal(struct net_device *net, + struct ethtool_link_ksettings *cmd) +{ + struct usbnet *dev = netdev_priv(net); + + /* the assumption that speed is equal on tx and rx + * is deeply engrained into the networking layer. + * For wireless stuff it is not true. + * We assume that rx_speed matters more. + */ + if (dev->rx_speed != SPEED_UNSET) + cmd->base.speed = dev->rx_speed / 1000000; + else if (dev->tx_speed != SPEED_UNSET) + cmd->base.speed = dev->tx_speed / 1000000; + else + cmd->base.speed = SPEED_UNKNOWN; + + return 0; +} +EXPORT_SYMBOL_GPL(usbnet_get_link_ksettings_internal); + int usbnet_set_link_ksettings_mii(struct net_device *net, const struct ethtool_link_ksettings *cmd) { @@ -1664,6 +1685,8 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) dev->intf = udev; dev->driver_info = info; dev->driver_name = name; + dev->rx_speed = SPEED_UNSET; + dev->tx_speed = SPEED_UNSET; net->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!net->tstats) diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index a89e1452107d..8336e86ce606 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -53,6 +53,9 @@ struct usbnet { u32 hard_mtu; /* count any extra framing */ size_t rx_urb_size; /* size for rx urbs */ struct mii_if_info mii; + long rx_speed; /* If MII not used */ + long tx_speed; /* If MII not used */ +# define SPEED_UNSET -1 /* various kinds of pending driver work */ struct sk_buff_head rxq; 
@@ -81,8 +84,6 @@ struct usbnet { # define EVENT_LINK_CHANGE 11 # define EVENT_SET_RX_MODE 12 # define EVENT_NO_IP_ALIGN 13 - u32 rx_speed; /* in bps - NOT Mbps */ - u32 tx_speed; /* in bps - NOT Mbps */ }; static inline struct usb_driver *driver_of(struct usb_interface *intf) @@ -271,6 +272,8 @@ extern int usbnet_get_link_ksettings_mii(struct net_device *net, struct ethtool_link_ksettings *cmd); extern int usbnet_set_link_ksettings_mii(struct net_device *net, const struct ethtool_link_ksettings *cmd); +extern int usbnet_get_link_ksettings_internal(struct net_device *net, + struct ethtool_link_ksettings *cmd); extern u32 usbnet_get_link(struct net_device *net); extern u32 usbnet_get_msglevel(struct net_device *); extern void usbnet_set_msglevel(struct net_device *, u32); -- cgit v1.2.3 From a460513ed4b6994bfeb7bd86f72853140bc1ac12 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 6 Apr 2021 13:22:51 +0300 Subject: time64.h: Consolidated PSEC_PER_SEC definition We have currently three users of the PSEC_PER_SEC each of them defining it individually. Instead, move it to time64.h to be available for everyone. There is a new user coming with the same constant in use. It will also make its life easier. Signed-off-by: Andy Shevchenko Acked-by: Heiko Stuebner Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mscc/ocelot_ptp.c | 2 ++ drivers/phy/phy-core-mipi-dphy.c | 2 -- drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c | 8 ++++---- include/soc/mscc/ocelot_ptp.h | 2 -- include/vdso/time64.h | 1 + 5 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c index a33ab315cc6b..87ad2137ba06 100644 --- a/drivers/net/ethernet/mscc/ocelot_ptp.c +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -4,6 +4,8 @@ * Copyright (c) 2017 Microsemi Corporation * Copyright 2020 NXP */ +#include + #include #include #include diff --git a/drivers/phy/phy-core-mipi-dphy.c b/drivers/phy/phy-core-mipi-dphy.c index 14e0551cd319..77fe65367ce5 100644 --- a/drivers/phy/phy-core-mipi-dphy.c +++ b/drivers/phy/phy-core-mipi-dphy.c @@ -12,8 +12,6 @@ #include #include -#define PSEC_PER_SEC 1000000000000LL - /* * Minimum D-PHY timings based on MIPI D-PHY specification. Derived * from the valid ranges specified in Section 6.9, Table 14, Page 41 diff --git a/drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c b/drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c index 8af8c6c5cc02..347dc79a18c1 100644 --- a/drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c +++ b/drivers/phy/rockchip/phy-rockchip-inno-dsidphy.c @@ -11,16 +11,16 @@ #include #include #include +#include #include #include #include +#include #include +#include + #include #include -#include -#include - -#define PSEC_PER_SEC 1000000000000LL #define UPDATE(x, h, l) (((x) << (l)) & GENMASK((h), (l))) diff --git a/include/soc/mscc/ocelot_ptp.h b/include/soc/mscc/ocelot_ptp.h index 6a7388fa7cc5..ded497d72bdb 100644 --- a/include/soc/mscc/ocelot_ptp.h +++ b/include/soc/mscc/ocelot_ptp.h @@ -37,8 +37,6 @@ enum { #define PTP_CFG_MISC_PTP_EN BIT(2) -#define PSEC_PER_SEC 1000000000000LL - #define PTP_CFG_CLK_ADJ_CFG_ENA BIT(0) #define PTP_CFG_CLK_ADJ_CFG_DIR BIT(1) diff --git a/include/vdso/time64.h b/include/vdso/time64.h index 
9d43c3f5e89d..b40cfa2aa33c 100644 --- a/include/vdso/time64.h +++ b/include/vdso/time64.h @@ -9,6 +9,7 @@ #define NSEC_PER_MSEC 1000000L #define USEC_PER_SEC 1000000L #define NSEC_PER_SEC 1000000000L +#define PSEC_PER_SEC 1000000000000LL #define FSEC_PER_SEC 1000000000000000LL #endif /* __VDSO_TIME64_H */ -- cgit v1.2.3 From a91d98a0a2b8e4c433b7341708f7d706e0cf1c8e Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Thu, 10 Sep 2020 15:28:02 +0800 Subject: net/mlx5: Map register values to restore objects Currently reg_c0 lower 16 bits and reg_b are used to store the chain id that missed in FDB and NIC tables accordingly. However, the registers' values may index a restore object, rather than a single u32 value. Different object types can be used to restore mutually exclusive contexts such as chain id and sample group id. Use the mapping object to associate an index with a restore object as a prestep for supporting additional restore types. Signed-off-by: Chris Mi Reviewed-by: Oz Shlomo Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/rep/tc.c | 38 +++++++++++++--------- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 9 +++-- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 13 ++++++-- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 14 +++----- .../ethernet/mellanox/mlx5/core/lib/fs_chains.c | 20 ++++++++---- .../ethernet/mellanox/mlx5/core/lib/fs_chains.h | 3 +- include/linux/mlx5/eswitch.h | 9 +++-- 7 files changed, 63 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index 11a44d30adc7..dde83cba85c3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -618,9 +618,10 @@ bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, struct mlx5e_tc_update_priv *tc_priv) { #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - u32 chain = 0, 
reg_c0, reg_c1, tunnel_id, zone_restore_id; + u32 reg_c0, reg_c1, tunnel_id, zone_restore_id; struct mlx5_rep_uplink_priv *uplink_priv; struct mlx5e_rep_priv *uplink_rpriv; + struct mlx5_mapped_obj mapped_obj; struct tc_skb_ext *tc_skb_ext; struct mlx5_eswitch *esw; struct mlx5e_priv *priv; @@ -640,30 +641,35 @@ bool mlx5e_rep_tc_update_skb(struct mlx5_cqe64 *cqe, priv = netdev_priv(skb->dev); esw = priv->mdev->priv.eswitch; - err = mlx5_get_chain_for_tag(esw_chains(esw), reg_c0, &chain); + err = mlx5_get_mapped_object(esw_chains(esw), reg_c0, &mapped_obj); if (err) { netdev_dbg(priv->netdev, - "Couldn't find chain for chain tag: %d, err: %d\n", + "Couldn't find mapped object for reg_c0: %d, err: %d\n", reg_c0, err); return false; } - if (chain) { - tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); - if (!tc_skb_ext) { - WARN_ON(1); - return false; - } + if (mapped_obj.type == MLX5_MAPPED_OBJ_CHAIN) { + if (mapped_obj.chain) { + tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); + if (!tc_skb_ext) { + WARN_ON(1); + return false; + } - tc_skb_ext->chain = chain; + tc_skb_ext->chain = mapped_obj.chain; - zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK; + zone_restore_id = reg_c1 & ESW_ZONE_ID_MASK; - uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); - uplink_priv = &uplink_rpriv->uplink_priv; - if (!mlx5e_tc_ct_restore_flow(uplink_priv->ct_priv, skb, - zone_restore_id)) - return false; + uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH); + uplink_priv = &uplink_rpriv->uplink_priv; + if (!mlx5e_tc_ct_restore_flow(uplink_priv->ct_priv, skb, + zone_restore_id)) + return false; + } + } else { + netdev_dbg(priv->netdev, "Invalid mapped object type: %d\n", mapped_obj.type); + return false; } tunnel_id = reg_c1 >> ESW_TUN_OFFSET; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index bb1e0d442b5c..9b5607ddb9a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4973,6 +4973,7 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe, u32 chain = 0, chain_tag, reg_b, zone_restore_id; struct mlx5e_priv *priv = netdev_priv(skb->dev); struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5_mapped_obj mapped_obj; struct tc_skb_ext *tc_skb_ext; int err; @@ -4980,7 +4981,7 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe, chain_tag = reg_b & MLX5E_TC_TABLE_CHAIN_TAG_MASK; - err = mlx5_get_chain_for_tag(nic_chains(priv), chain_tag, &chain); + err = mlx5_get_mapped_object(nic_chains(priv), chain_tag, &mapped_obj); if (err) { netdev_dbg(priv->netdev, "Couldn't find chain for chain tag: %d, err: %d\n", @@ -4988,7 +4989,8 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe, return false; } - if (chain) { + if (mapped_obj.type == MLX5_MAPPED_OBJ_CHAIN) { + chain = mapped_obj.chain; tc_skb_ext = skb_ext_add(skb, TC_SKB_EXT); if (WARN_ON(!tc_skb_ext)) return false; @@ -5001,6 +5003,9 @@ bool mlx5e_tc_update_skb(struct mlx5_cqe64 *cqe, if (!mlx5e_tc_ct_restore_flow(tc->ct, skb, zone_restore_id)) return false; + } else { + netdev_dbg(priv->netdev, "Invalid mapped object type: %d\n", mapped_obj.type); + return false; } #endif /* CONFIG_NET_TC_SKB_EXT */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index e0415676821a..c5b35e7f8aed 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -47,6 +47,17 @@ #include "sf/sf.h" #include "en/tc_ct.h" +enum mlx5_mapped_obj_type { + MLX5_MAPPED_OBJ_CHAIN, +}; + +struct mlx5_mapped_obj { + enum mlx5_mapped_obj_type type; + union { + u32 chain; + }; +}; + #ifdef CONFIG_MLX5_ESWITCH #define ESW_OFFLOADS_DEFAULT_NUM_GROUPS 15 @@ -733,8 +744,6 @@ mlx5_esw_vporttbl_put(struct mlx5_eswitch *esw, struct mlx5_vport_tbl_attr *attr struct mlx5_flow_handle * esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag); -u32 
-esw_get_max_restore_tag(struct mlx5_eswitch *esw); int esw_offloads_load_rep(struct mlx5_eswitch *esw, u16 vport_num); void esw_offloads_unload_rep(struct mlx5_eswitch *esw, u16 vport_num); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 8ac4b60ea225..117d9fa93ff5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1286,7 +1286,7 @@ esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag) misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters_2); MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, - ESW_CHAIN_TAG_METADATA_MASK); + ESW_REG_C0_USER_DATA_METADATA_MASK); misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters_2); MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, tag); @@ -1312,12 +1312,6 @@ esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag) return flow_rule; } -u32 -esw_get_max_restore_tag(struct mlx5_eswitch *esw) -{ - return ESW_CHAIN_TAG_METADATA_MASK; -} - #define MAX_PF_SQ 256 #define MAX_SQ_NVPORTS 32 @@ -1434,7 +1428,7 @@ esw_chains_create(struct mlx5_eswitch *esw, struct mlx5_flow_table *miss_fdb) attr.max_ft_sz = fdb_max; attr.max_grp_num = esw->params.large_group_num; attr.default_ft = miss_fdb; - attr.max_restore_tag = esw_get_max_restore_tag(esw); + attr.max_restore_tag = ESW_REG_C0_USER_DATA_METADATA_MASK; chains = mlx5_chains_create(dev, &attr); if (IS_ERR(chains)) { @@ -1928,7 +1922,7 @@ static int esw_create_restore_table(struct mlx5_eswitch *esw) goto out_free; } - ft_attr.max_fte = 1 << ESW_CHAIN_TAG_METADATA_BITS; + ft_attr.max_fte = 1 << ESW_REG_C0_USER_DATA_METADATA_BITS; ft = mlx5_create_flow_table(ns, &ft_attr); if (IS_ERR(ft)) { err = PTR_ERR(ft); @@ -1943,7 +1937,7 @@ static int esw_create_restore_table(struct mlx5_eswitch *esw) misc_parameters_2); MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, - 
ESW_CHAIN_TAG_METADATA_MASK); + ESW_REG_C0_USER_DATA_METADATA_MASK); MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ft_attr.max_fte - 1); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c index 381325b4a863..00ff809dcfe8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.c @@ -832,8 +832,7 @@ mlx5_chains_init(struct mlx5_core_dev *dev, struct mlx5_chains_attr *attr) if (err) goto init_prios_ht_err; - mapping = mapping_create(sizeof(u32), attr->max_restore_tag, - true); + mapping = mapping_create(sizeof(struct mlx5_mapped_obj), attr->max_restore_tag, true); if (IS_ERR(mapping)) { err = PTR_ERR(mapping); goto mapping_err; @@ -884,21 +883,28 @@ int mlx5_chains_get_chain_mapping(struct mlx5_fs_chains *chains, u32 chain, u32 *chain_mapping) { - return mapping_add(chains_mapping(chains), &chain, chain_mapping); + struct mapping_ctx *ctx = chains->chains_mapping; + struct mlx5_mapped_obj mapped_obj = {}; + + mapped_obj.type = MLX5_MAPPED_OBJ_CHAIN; + mapped_obj.chain = chain; + return mapping_add(ctx, &mapped_obj, chain_mapping); } int mlx5_chains_put_chain_mapping(struct mlx5_fs_chains *chains, u32 chain_mapping) { - return mapping_remove(chains_mapping(chains), chain_mapping); + struct mapping_ctx *ctx = chains->chains_mapping; + + return mapping_remove(ctx, chain_mapping); } -int mlx5_get_chain_for_tag(struct mlx5_fs_chains *chains, u32 tag, - u32 *chain) +int +mlx5_get_mapped_object(struct mlx5_fs_chains *chains, u32 tag, struct mlx5_mapped_obj *obj) { int err; - err = mapping_find(chains_mapping(chains), tag, chain); + err = mapping_find(chains->chains_mapping, tag, obj); if (err) { mlx5_core_warn(chains->dev, "Can't find chain for tag: %d\n", tag); return -ENOENT; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h 
b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h index 6d5be31b05dd..75a3bba12a78 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_chains.h @@ -7,6 +7,7 @@ #include struct mlx5_fs_chains; +struct mlx5_mapped_obj; enum mlx5_chains_flags { MLX5_CHAINS_AND_PRIOS_SUPPORTED = BIT(0), @@ -64,7 +65,7 @@ mlx5_chains_create(struct mlx5_core_dev *dev, struct mlx5_chains_attr *attr); void mlx5_chains_destroy(struct mlx5_fs_chains *chains); int -mlx5_get_chain_for_tag(struct mlx5_fs_chains *chains, u32 tag, u32 *chain); +mlx5_get_mapped_object(struct mlx5_fs_chains *chains, u32 tag, struct mlx5_mapped_obj *obj); void mlx5_chains_set_end_ft(struct mlx5_fs_chains *chains, diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 994c2c8cb4fd..125ae482383b 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -74,20 +74,19 @@ bool mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw); bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw); /* Reg C0 usage: - * Reg C0 = < ESW_PFNUM_BITS(4) | ESW_VPORT BITS(12) | ESW_CHAIN_TAG(16) > + * Reg C0 = < ESW_PFNUM_BITS(4) | ESW_VPORT BITS(12) | ESW_REG_C0_OBJ(16) > * * Highest 4 bits of the reg c0 is the PF_NUM (range 0-15), 12 bits of * unique non-zero vport id (range 1-4095). The rest (lowest 16 bits) is left - * for tc chain tag restoration. + * for user data objects managed by a common mapping context. * PFNUM + VPORT comprise the SOURCE_PORT matching. 
*/ #define ESW_VPORT_BITS 12 #define ESW_PFNUM_BITS 4 #define ESW_SOURCE_PORT_METADATA_BITS (ESW_PFNUM_BITS + ESW_VPORT_BITS) #define ESW_SOURCE_PORT_METADATA_OFFSET (32 - ESW_SOURCE_PORT_METADATA_BITS) -#define ESW_CHAIN_TAG_METADATA_BITS (32 - ESW_SOURCE_PORT_METADATA_BITS) -#define ESW_CHAIN_TAG_METADATA_MASK GENMASK(ESW_CHAIN_TAG_METADATA_BITS - 1,\ - 0) +#define ESW_REG_C0_USER_DATA_METADATA_BITS (32 - ESW_SOURCE_PORT_METADATA_BITS) +#define ESW_REG_C0_USER_DATA_METADATA_MASK GENMASK(ESW_REG_C0_USER_DATA_METADATA_BITS - 1, 0) static inline u32 mlx5_eswitch_get_vport_metadata_mask(void) { -- cgit v1.2.3 From 30f60bae80922582a16e80b070171a865fce58cd Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 6 Apr 2021 17:15:58 -0700 Subject: mptcp: use mptcp_addr_info in mptcp_out_options This patch moved the mptcp_addr_info struct from protocol.h to mptcp.h, added a new struct mptcp_addr_info member addr in struct mptcp_out_options, and dropped the original addr, addr6, addr_id and port fields in it. Then we can use opts->addr to get the adding address from PM directly using mptcp_pm_add_addr_signal. Since the port number became big-endian now, use ntohs to convert it before sending it out with the ADD_ADDR suboption. Also convert it when passing it to add_addr_generate_hmac or printing it out. Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- include/net/mptcp.h | 21 +++++++++++++-------- net/mptcp/options.c | 44 ++++++++++++++++++++------------------------ net/mptcp/protocol.h | 12 ------------ 3 files changed, 33 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 16fe34d139c3..83f23774b908 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -41,20 +41,25 @@ struct mptcp_rm_list { u8 nr; }; +struct mptcp_addr_info { + u8 id; + sa_family_t family; + __be16 port; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; +}; + struct mptcp_out_options { #if IS_ENABLED(CONFIG_MPTCP) u16 suboptions; u64 sndr_key; u64 rcvr_key; - union { - struct in_addr addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - struct in6_addr addr6; -#endif - }; - u8 addr_id; - u16 port; u64 ahmac; + struct mptcp_addr_info addr; struct mptcp_rm_list rm_list; u8 join_id; u8 backup; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 4b7119eb2c31..352c128337a7 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -626,7 +626,6 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; - struct mptcp_addr_info saddr; bool echo; bool port; int len; @@ -643,45 +642,40 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * } if (!mptcp_pm_should_add_signal(msk) || - !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port))) + !(mptcp_pm_add_addr_signal(msk, remaining, &opts->addr, &echo, &port))) return false; - len = mptcp_add_addr_len(saddr.family, echo, port); + len = mptcp_add_addr_len(opts->addr.family, echo, port); if (remaining < len) return false; *size = len; if (drop_other_suboptions) *size -= opt_size; - opts->addr_id = saddr.id; - if (port) - opts->port = ntohs(saddr.port); - if (saddr.family == AF_INET) { + 
if (opts->addr.family == AF_INET) { opts->suboptions |= OPTION_MPTCP_ADD_ADDR; - opts->addr = saddr.addr; if (!echo) { opts->ahmac = add_addr_generate_hmac(msk->local_key, msk->remote_key, - opts->addr_id, - &opts->addr, - opts->port); + opts->addr.id, + &opts->addr.addr, + ntohs(opts->addr.port)); } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (saddr.family == AF_INET6) { + else if (opts->addr.family == AF_INET6) { opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; - opts->addr6 = saddr.addr6; if (!echo) { opts->ahmac = add_addr6_generate_hmac(msk->local_key, msk->remote_key, - opts->addr_id, - &opts->addr6, - opts->port); + opts->addr.id, + &opts->addr.addr6, + ntohs(opts->addr.port)); } } #endif pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", - opts->addr_id, opts->ahmac, echo, opts->port); + opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; } @@ -1217,7 +1211,7 @@ mp_capable_done: len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif - if (opts->port) + if (opts->addr.port) len += TCPOLEN_MPTCP_PORT_LEN; if (opts->ahmac) { @@ -1226,28 +1220,30 @@ mp_capable_done: } *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, - len, echo, opts->addr_id); + len, echo, opts->addr.id); if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { - memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4); ptr += 1; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { - memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16); ptr += 4; } #endif - if (!opts->port) { + if (!opts->addr.port) { if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } else { + u16 port = ntohs(opts->addr.port); + if (opts->ahmac) { u8 *bptr = (u8 *)ptr; - put_unaligned_be16(opts->port, bptr); + put_unaligned_be16(port, bptr); bptr += 2; put_unaligned_be64(opts->ahmac, bptr); bptr += 8; @@ -1256,7 +1252,7 @@ mp_capable_done: ptr += 3; } else { - put_unaligned_be32(opts->port << 16 
| + put_unaligned_be32(port << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); ptr += 1; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index cb5dad522f39..4890dbb9f710 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -169,18 +169,6 @@ static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) ((nib & 0xF) << 8) | field); } -struct mptcp_addr_info { - sa_family_t family; - __be16 port; - u8 id; - union { - struct in_addr addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - struct in6_addr addr6; -#endif - }; -}; - enum mptcp_pm_status { MPTCP_PM_ADD_ADDR_RECEIVED, MPTCP_PM_ADD_ADDR_SEND_ACK, -- cgit v1.2.3 From 56f15e2cb1f77fbcf9df38de7e5dcb4b37070196 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 6 Apr 2021 17:23:59 -0700 Subject: ethtool: document PHY tunable callbacks Add missing kdoc for phy tunable callbacks. Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 3583f7fc075c..5c631a298994 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -410,6 +410,8 @@ struct ethtool_pause_stats { * @get_ethtool_phy_stats: Return extended statistics about the PHY device. * This is only useful if the device maintains PHY statistics and * cannot use the standard PHY library helpers. + * @get_phy_tunable: Read the value of a PHY tunable. + * @set_phy_tunable: Set the value of a PHY tunable. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must -- cgit v1.2.3 From 0854fa82c96ca37a35e954b7079c0bfd795affb1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 6 Apr 2021 23:40:51 -0700 Subject: net: remove the new_ifindex argument from dev_change_net_namespace Here is only one place where we want to specify new_ifindex. 
In all other cases, callers pass 0 as new_ifindex. It looks reasonable to add a low-level function with new_ifindex and to convert dev_change_net_namespace to a static inline wrapper. Fixes: eeb85a14ee34 ("net: Allow to specify ifindex when device is moved to another namespace") Suggested-by: Jakub Kicinski Signed-off-by: Andrei Vagin Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 2 +- include/linux/netdevice.h | 8 +++++++- net/core/dev.c | 10 +++++----- net/core/rtnetlink.c | 4 ++-- net/ieee802154/core.c | 4 ++-- net/wireless/core.c | 4 ++-- 6 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 8c0c70e1da77..7349a70af083 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2354,7 +2354,7 @@ static int netvsc_register_vf(struct net_device *vf_netdev) */ if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) { ret = dev_change_net_namespace(vf_netdev, - dev_net(ndev), "eth%d", 0); + dev_net(ndev), "eth%d"); if (ret) netdev_err(vf_netdev, "could not move to same namespace as %s: %d\n", diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b482236c0e99..5cbc950b34df 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4026,8 +4026,14 @@ void __dev_notify_flags(struct net_device *, unsigned int old_flags, int dev_change_name(struct net_device *, const char *); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex); +static inline int dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex); + const char *pat) +{ + return __dev_change_net_namespace(dev, net, pat, 0); +} int __dev_set_mtu(struct net_device *, int); int 
dev_validate_mtu(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); diff --git a/net/core/dev.c b/net/core/dev.c index 9d1a8fac793f..33ff4a944109 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11062,7 +11062,7 @@ void unregister_netdev(struct net_device *dev) EXPORT_SYMBOL(unregister_netdev); /** - * dev_change_net_namespace - move device to different nethost namespace + * __dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name @@ -11077,8 +11077,8 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. */ -int dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex) { struct net *net_old = dev_net(dev); int err, new_nsid; @@ -11202,7 +11202,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, out: return err; } -EXPORT_SYMBOL_GPL(dev_change_net_namespace); +EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { @@ -11458,7 +11458,7 @@ static void __net_exit default_device_exit(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (__dev_get_by_name(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); - err = dev_change_net_namespace(dev, &init_net, fb_name, 0); + err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9108a7e6c0c0..9f1f55785a6f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2619,7 +2619,7 @@ static int do_setlink(const struct sk_buff *skb, else new_ifindex = 0; - err = dev_change_net_namespace(dev, net, ifname, new_ifindex); + err = __dev_change_net_namespace(dev, net, ifname, 
new_ifindex); put_net(net); if (err) goto errout; @@ -3461,7 +3461,7 @@ replay: if (err < 0) goto out_unregister; if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname, 0); + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; } diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index ec3068937fc3..de259b5170ab 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -205,7 +205,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, if (!wpan_dev->netdev) continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; - err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d", 0); + err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; @@ -222,7 +222,7 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, continue; wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wpan_dev->netdev, net, - "wpan%d", 0); + "wpan%d"); WARN_ON(err); wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL; } diff --git a/net/wireless/core.c b/net/wireless/core.c index fabb677b7d58..a2785379df6e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -165,7 +165,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, if (!wdev->netdev) continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; - err = dev_change_net_namespace(wdev->netdev, net, "wlan%d", 0); + err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); if (err) break; wdev->netdev->features |= NETIF_F_NETNS_LOCAL; @@ -182,7 +182,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; err = dev_change_net_namespace(wdev->netdev, net, - "wlan%d", 0); + "wlan%d"); WARN_ON(err); wdev->netdev->features |= NETIF_F_NETNS_LOCAL; } -- cgit v1.2.3 From 0750cfd8b7fd491c9d7c8bd19d9ac380cd3c84ee Mon Sep 17 00:00:00 2001 From: 
James Prestwood Date: Thu, 11 Mar 2021 15:03:33 -0800 Subject: nl80211: better document CMD_ROAM behavior The docs were very sparse with how exactly CMD_ROAM should be used. Specifically related to BSS information normally obtained through a user space scan. Signed-off-by: James Prestwood Link: https://lore.kernel.org/r/20210311230333.103934-1-prestwoj@gmail.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ac78da99fccd..5e30c7f6c484 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -655,6 +655,9 @@ * When a security association was established on an 802.1X network using * fast transition, this event should be followed by an * %NL80211_CMD_PORT_AUTHORIZED event. + * Following a %NL80211_CMD_ROAM event userspace can issue + * %NL80211_CMD_GET_SCAN in order to obtain the scan information for the + * new BSS the card/driver roamed to. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other * reasons, for this the %NL80211_ATTR_DISCONNECTED_BY_AP and -- cgit v1.2.3 From afd2daa26c7abd734d78bd274fc6c59a15e61063 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 6 Apr 2021 21:55:53 +0200 Subject: Bluetooth: Add support for virtio transport driver This adds support for Bluetooth HCI transport over virtio. 
Signed-off-by: Marcel Holtmann Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/Kconfig | 10 + drivers/bluetooth/Makefile | 2 + drivers/bluetooth/virtio_bt.c | 401 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/virtio_bt.h | 31 ++++ include/uapi/linux/virtio_ids.h | 1 + 5 files changed, 445 insertions(+) create mode 100644 drivers/bluetooth/virtio_bt.c create mode 100644 include/uapi/linux/virtio_bt.h (limited to 'include') diff --git a/drivers/bluetooth/Kconfig b/drivers/bluetooth/Kconfig index 4e73a531b377..851842372c9b 100644 --- a/drivers/bluetooth/Kconfig +++ b/drivers/bluetooth/Kconfig @@ -425,4 +425,14 @@ config BT_HCIRSI Say Y here to compile support for HCI over Redpine into the kernel or say M to compile as a module. +config BT_VIRTIO + tristate "Virtio Bluetooth driver" + depends on VIRTIO + help + Virtio Bluetooth support driver. + This driver supports Virtio Bluetooth devices. + + Say Y here to compile support for HCI over Virtio into the + kernel or say M to compile as a module. 
+ endmenu diff --git a/drivers/bluetooth/Makefile b/drivers/bluetooth/Makefile index 1a58a3ae142c..16286ea2655d 100644 --- a/drivers/bluetooth/Makefile +++ b/drivers/bluetooth/Makefile @@ -26,6 +26,8 @@ obj-$(CONFIG_BT_BCM) += btbcm.o obj-$(CONFIG_BT_RTL) += btrtl.o obj-$(CONFIG_BT_QCA) += btqca.o +obj-$(CONFIG_BT_VIRTIO) += virtio_bt.o + obj-$(CONFIG_BT_HCIUART_NOKIA) += hci_nokia.o obj-$(CONFIG_BT_HCIRSI) += btrsi.o diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c new file mode 100644 index 000000000000..c804db7e90f8 --- /dev/null +++ b/drivers/bluetooth/virtio_bt.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#define VERSION "0.1" + +enum { + VIRTBT_VQ_TX, + VIRTBT_VQ_RX, + VIRTBT_NUM_VQS, +}; + +struct virtio_bluetooth { + struct virtio_device *vdev; + struct virtqueue *vqs[VIRTBT_NUM_VQS]; + struct work_struct rx; + struct hci_dev *hdev; +}; + +static int virtbt_add_inbuf(struct virtio_bluetooth *vbt) +{ + struct virtqueue *vq = vbt->vqs[VIRTBT_VQ_RX]; + struct scatterlist sg[1]; + struct sk_buff *skb; + int err; + + skb = alloc_skb(1000, GFP_KERNEL); + sg_init_one(sg, skb->data, 1000); + + err = virtqueue_add_inbuf(vq, sg, 1, skb, GFP_KERNEL); + if (err < 0) { + kfree_skb(skb); + return err; + } + + return 0; +} + +static int virtbt_open(struct hci_dev *hdev) +{ + struct virtio_bluetooth *vbt = hci_get_drvdata(hdev); + + if (virtbt_add_inbuf(vbt) < 0) + return -EIO; + + virtqueue_kick(vbt->vqs[VIRTBT_VQ_RX]); + return 0; +} + +static int virtbt_close(struct hci_dev *hdev) +{ + struct virtio_bluetooth *vbt = hci_get_drvdata(hdev); + int i; + + cancel_work_sync(&vbt->rx); + + for (i = 0; i < ARRAY_SIZE(vbt->vqs); i++) { + struct virtqueue *vq = vbt->vqs[i]; + struct sk_buff *skb; + + while ((skb = virtqueue_detach_unused_buf(vq))) + kfree_skb(skb); + } + + return 0; +} + +static int virtbt_flush(struct hci_dev *hdev) +{ + return 0; 
+} + +static int virtbt_send_frame(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct virtio_bluetooth *vbt = hci_get_drvdata(hdev); + struct scatterlist sg[1]; + int err; + + memcpy(skb_push(skb, 1), &hci_skb_pkt_type(skb), 1); + + sg_init_one(sg, skb->data, skb->len); + err = virtqueue_add_outbuf(vbt->vqs[VIRTBT_VQ_TX], sg, 1, skb, + GFP_KERNEL); + if (err) { + kfree_skb(skb); + return err; + } + + virtqueue_kick(vbt->vqs[VIRTBT_VQ_TX]); + return 0; +} + +static int virtbt_setup_zephyr(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + /* Read Build Information */ + skb = __hci_cmd_sync(hdev, 0xfc08, 0, NULL, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + bt_dev_info(hdev, "%s", (char *)(skb->data + 1)); + + hci_set_fw_info(hdev, "%s", skb->data + 1); + + kfree_skb(skb); + return 0; +} + +static int virtbt_set_bdaddr_zephyr(struct hci_dev *hdev, + const bdaddr_t *bdaddr) +{ + struct sk_buff *skb; + + /* Write BD_ADDR */ + skb = __hci_cmd_sync(hdev, 0xfc06, 6, bdaddr, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + return 0; +} + +static int virtbt_setup_intel(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + /* Intel Read Version */ + skb = __hci_cmd_sync(hdev, 0xfc05, 0, NULL, HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + return 0; +} + +static int virtbt_set_bdaddr_intel(struct hci_dev *hdev, const bdaddr_t *bdaddr) +{ + struct sk_buff *skb; + + /* Intel Write BD Address */ + skb = __hci_cmd_sync(hdev, 0xfc31, 6, bdaddr, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + return 0; +} + +static int virtbt_setup_realtek(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + /* Read ROM Version */ + skb = __hci_cmd_sync(hdev, 0xfc6d, 0, NULL, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + bt_dev_info(hdev, "ROM version %u", *((__u8 *) (skb->data + 1))); + + kfree_skb(skb); + return 0; +} + +static int 
virtbt_shutdown_generic(struct hci_dev *hdev) +{ + struct sk_buff *skb; + + /* Reset */ + skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, HCI_INIT_TIMEOUT); + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + return 0; +} + +static void virtbt_rx_handle(struct virtio_bluetooth *vbt, struct sk_buff *skb) +{ + __u8 pkt_type; + + pkt_type = *((__u8 *) skb->data); + skb_pull(skb, 1); + + switch (pkt_type) { + case HCI_EVENT_PKT: + case HCI_ACLDATA_PKT: + case HCI_SCODATA_PKT: + case HCI_ISODATA_PKT: + hci_skb_pkt_type(skb) = pkt_type; + hci_recv_frame(vbt->hdev, skb); + break; + } +} + +static void virtbt_rx_work(struct work_struct *work) +{ + struct virtio_bluetooth *vbt = container_of(work, + struct virtio_bluetooth, rx); + struct sk_buff *skb; + unsigned int len; + + skb = virtqueue_get_buf(vbt->vqs[VIRTBT_VQ_RX], &len); + if (!skb) + return; + + skb->len = len; + virtbt_rx_handle(vbt, skb); + + if (virtbt_add_inbuf(vbt) < 0) + return; + + virtqueue_kick(vbt->vqs[VIRTBT_VQ_RX]); +} + +static void virtbt_tx_done(struct virtqueue *vq) +{ + struct sk_buff *skb; + unsigned int len; + + while ((skb = virtqueue_get_buf(vq, &len))) + kfree_skb(skb); +} + +static void virtbt_rx_done(struct virtqueue *vq) +{ + struct virtio_bluetooth *vbt = vq->vdev->priv; + + schedule_work(&vbt->rx); +} + +static int virtbt_probe(struct virtio_device *vdev) +{ + vq_callback_t *callbacks[VIRTBT_NUM_VQS] = { + [VIRTBT_VQ_TX] = virtbt_tx_done, + [VIRTBT_VQ_RX] = virtbt_rx_done, + }; + const char *names[VIRTBT_NUM_VQS] = { + [VIRTBT_VQ_TX] = "tx", + [VIRTBT_VQ_RX] = "rx", + }; + struct virtio_bluetooth *vbt; + struct hci_dev *hdev; + int err; + __u8 type; + + if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) + return -ENODEV; + + type = virtio_cread8(vdev, offsetof(struct virtio_bt_config, type)); + + switch (type) { + case VIRTIO_BT_CONFIG_TYPE_PRIMARY: + case VIRTIO_BT_CONFIG_TYPE_AMP: + break; + default: + return -EINVAL; + } + + vbt = kzalloc(sizeof(*vbt), GFP_KERNEL); + if 
(!vbt) + return -ENOMEM; + + vdev->priv = vbt; + vbt->vdev = vdev; + + INIT_WORK(&vbt->rx, virtbt_rx_work); + + err = virtio_find_vqs(vdev, VIRTBT_NUM_VQS, vbt->vqs, callbacks, + names, NULL); + if (err) + return err; + + hdev = hci_alloc_dev(); + if (!hdev) { + err = -ENOMEM; + goto failed; + } + + vbt->hdev = hdev; + + hdev->bus = HCI_VIRTIO; + hdev->dev_type = type; + hci_set_drvdata(hdev, vbt); + + hdev->open = virtbt_open; + hdev->close = virtbt_close; + hdev->flush = virtbt_flush; + hdev->send = virtbt_send_frame; + + if (virtio_has_feature(vdev, VIRTIO_BT_F_VND_HCI)) { + __u16 vendor; + + virtio_cread(vdev, struct virtio_bt_config, vendor, &vendor); + + switch (vendor) { + case VIRTIO_BT_CONFIG_VENDOR_ZEPHYR: + hdev->manufacturer = 1521; + hdev->setup = virtbt_setup_zephyr; + hdev->shutdown = virtbt_shutdown_generic; + hdev->set_bdaddr = virtbt_set_bdaddr_zephyr; + break; + + case VIRTIO_BT_CONFIG_VENDOR_INTEL: + hdev->manufacturer = 2; + hdev->setup = virtbt_setup_intel; + hdev->shutdown = virtbt_shutdown_generic; + hdev->set_bdaddr = virtbt_set_bdaddr_intel; + set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks); + set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + break; + + case VIRTIO_BT_CONFIG_VENDOR_REALTEK: + hdev->manufacturer = 93; + hdev->setup = virtbt_setup_realtek; + hdev->shutdown = virtbt_shutdown_generic; + set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + break; + } + } + + if (virtio_has_feature(vdev, VIRTIO_BT_F_MSFT_EXT)) { + __u16 msft_opcode; + + virtio_cread(vdev, struct virtio_bt_config, + msft_opcode, &msft_opcode); + + hci_set_msft_opcode(hdev, msft_opcode); + } + + if (virtio_has_feature(vdev, VIRTIO_BT_F_AOSP_EXT)) + hci_set_aosp_capable(hdev); + + if (hci_register_dev(hdev) < 0) { + hci_free_dev(hdev); + err = -EBUSY; + goto failed; + } + + return 0; + +failed: + 
vdev->config->del_vqs(vdev); + return err; +} + +static void virtbt_remove(struct virtio_device *vdev) +{ + struct virtio_bluetooth *vbt = vdev->priv; + struct hci_dev *hdev = vbt->hdev; + + hci_unregister_dev(hdev); + vdev->config->reset(vdev); + + hci_free_dev(hdev); + vbt->hdev = NULL; + + vdev->config->del_vqs(vdev); + kfree(vbt); +} + +static struct virtio_device_id virtbt_table[] = { + { VIRTIO_ID_BT, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +MODULE_DEVICE_TABLE(virtio, virtbt_table); + +static const unsigned int virtbt_features[] = { + VIRTIO_BT_F_VND_HCI, + VIRTIO_BT_F_MSFT_EXT, + VIRTIO_BT_F_AOSP_EXT, +}; + +static struct virtio_driver virtbt_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .feature_table = virtbt_features, + .feature_table_size = ARRAY_SIZE(virtbt_features), + .id_table = virtbt_table, + .probe = virtbt_probe, + .remove = virtbt_remove, +}; + +module_virtio_driver(virtbt_driver); + +MODULE_AUTHOR("Marcel Holtmann "); +MODULE_DESCRIPTION("Generic Bluetooth VIRTIO driver ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/linux/virtio_bt.h b/include/uapi/linux/virtio_bt.h new file mode 100644 index 000000000000..a7bd48daa9a9 --- /dev/null +++ b/include/uapi/linux/virtio_bt.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ + +#ifndef _UAPI_LINUX_VIRTIO_BT_H +#define _UAPI_LINUX_VIRTIO_BT_H + +#include + +/* Feature bits */ +#define VIRTIO_BT_F_VND_HCI 0 /* Indicates vendor command support */ +#define VIRTIO_BT_F_MSFT_EXT 1 /* Indicates MSFT vendor support */ +#define VIRTIO_BT_F_AOSP_EXT 2 /* Indicates AOSP vendor support */ + +enum virtio_bt_config_type { + VIRTIO_BT_CONFIG_TYPE_PRIMARY = 0, + VIRTIO_BT_CONFIG_TYPE_AMP = 1, +}; + +enum virtio_bt_config_vendor { + VIRTIO_BT_CONFIG_VENDOR_NONE = 0, + VIRTIO_BT_CONFIG_VENDOR_ZEPHYR = 1, + VIRTIO_BT_CONFIG_VENDOR_INTEL = 2, + VIRTIO_BT_CONFIG_VENDOR_REALTEK = 3, +}; + +struct virtio_bt_config { + __u8 type; + __u16 
vendor; + __u16 msft_opcode; +} __attribute__((packed)); + +#endif /* _UAPI_LINUX_VIRTIO_BT_H */ diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index bc1c0621f5ed..b4f468e9441d 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -53,6 +53,7 @@ #define VIRTIO_ID_MEM 24 /* virtio mem */ #define VIRTIO_ID_FS 26 /* virtio filesystem */ #define VIRTIO_ID_PMEM 27 /* virtio pmem */ +#define VIRTIO_ID_BT 28 /* virtio bluetooth */ #define VIRTIO_ID_MAC80211_HWSIM 29 /* virtio mac80211-hwsim */ #endif /* _LINUX_VIRTIO_IDS_H */ -- cgit v1.2.3 From cfa15cca51ef10d9aa363ed1d6907c40343ce34a Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 6 Apr 2021 21:55:54 +0200 Subject: Bluetooth: Fix default values for advertising interval The DISCOV_LE_FAST_ADV_INT_{MIN,MAX} constants are in msec, but then used later on directly while it is supposed to be N * 0.625 ms according to the Bluetooth Core specification. Signed-off-by: Marcel Holtmann Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index aa2879a3b0dd..58f7eada26fd 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1754,8 +1754,8 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c); #define DISCOV_INTERLEAVED_INQUIRY_LEN 0x04 #define DISCOV_BREDR_INQUIRY_LEN 0x08 #define DISCOV_LE_RESTART_DELAY msecs_to_jiffies(200) /* msec */ -#define DISCOV_LE_FAST_ADV_INT_MIN 100 /* msec */ -#define DISCOV_LE_FAST_ADV_INT_MAX 150 /* msec */ +#define DISCOV_LE_FAST_ADV_INT_MIN 0x00A0 /* 100 msec */ +#define DISCOV_LE_FAST_ADV_INT_MAX 0x00F0 /* 150 msec */ void mgmt_fill_version_info(void *ver); int mgmt_new_settings(struct hci_dev *hdev); -- cgit v1.2.3 From a61d67188f29ff678e94fb3ffba6c6d292e852c7 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann 
Date: Tue, 6 Apr 2021 21:55:56 +0200 Subject: Bluetooth: Allow Microsoft extension to indicate curve validation Some controllers don't support the Simple Pairing Options feature that can indicate the support for P-192 and P-256 public key validation. However they might support the Microsoft vendor extension that can indicate the validation capability as well. Signed-off-by: Marcel Holtmann Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/mgmt.c | 5 ++++- net/bluetooth/msft.c | 8 ++++++++ net/bluetooth/msft.h | 6 ++++++ 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 58f7eada26fd..c73ac52af186 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -584,6 +584,7 @@ struct hci_dev { #if IS_ENABLED(CONFIG_BT_MSFTEXT) __u16 msft_opcode; void *msft_data; + bool msft_curve_validity; #endif #if IS_ENABLED(CONFIG_BT_AOSPEXT) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 59f8016c4866..f9be7f9084d6 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3730,8 +3730,11 @@ static int read_controller_cap(struct sock *sk, struct hci_dev *hdev, /* When the Read Simple Pairing Options command is supported, then * the remote public key validation is supported. + * + * Alternatively, when Microsoft extensions are available, they can + * indicate support for public key validation as well. 
*/ - if (hdev->commands[41] & 0x08) + if ((hdev->commands[41] & 0x08) || msft_curve_validity(hdev)) flags |= 0x01; /* Remote public key validation (BR/EDR) */ flags |= 0x02; /* Remote public key validation (LE) */ diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 47b104f318e9..e28f15439ce4 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -142,6 +142,9 @@ static bool read_supported_features(struct hci_dev *hdev, msft->evt_prefix_len = rp->evt_prefix_len; msft->features = __le64_to_cpu(rp->features); + if (msft->features & MSFT_FEATURE_MASK_CURVE_VALIDITY) + hdev->msft_curve_validity = true; + kfree_skb(skb); return true; @@ -605,3 +608,8 @@ int msft_set_filter_enable(struct hci_dev *hdev, bool enable) return err; } + +bool msft_curve_validity(struct hci_dev *hdev) +{ + return hdev->msft_curve_validity; +} diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index 88ed613dfa08..6e56d94b88d8 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -22,6 +22,7 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, u16 handle); void msft_req_add_set_filter_enable(struct hci_request *req, bool enable); int msft_set_filter_enable(struct hci_dev *hdev, bool enable); +bool msft_curve_validity(struct hci_dev *hdev); #else @@ -54,4 +55,9 @@ static inline int msft_set_filter_enable(struct hci_dev *hdev, bool enable) return -EOPNOTSUPP; } +static inline bool msft_curve_validity(struct hci_dev *hdev) +{ + return false; +} + #endif -- cgit v1.2.3 From 6f779a66dc84450ceb4825024d3e337f42e633de Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Mon, 22 Mar 2021 22:46:31 +0200 Subject: cfg80211: allow specifying a reason for hw_rfkill rfkill now allows to report a reason for the hw_rfkill state. Allow cfg80211 drivers to specify this reason. Keep the current API to use the default reason (RFKILL_HARD_BLOCK_SIGNAL). 
Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20210322204633.102581-4-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 ++++++++++- net/wireless/core.c | 7 ++++--- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 911fae42b0c0..3b296f2b7a2c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -6636,8 +6637,16 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, * wiphy_rfkill_set_hw_state - notify cfg80211 about hw block state * @wiphy: the wiphy * @blocked: block status + * @reason: one of reasons in &enum rfkill_hard_block_reasons */ -void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked); +void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, + enum rfkill_hard_block_reasons reason); + +static inline void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked) +{ + wiphy_rfkill_set_hw_state_reason(wiphy, blocked, + RFKILL_HARD_BLOCK_SIGNAL); +} /** * wiphy_rfkill_start_polling - start polling rfkill diff --git a/net/wireless/core.c b/net/wireless/core.c index a2785379df6e..adfbcb33fb8f 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1092,14 +1092,15 @@ void wiphy_free(struct wiphy *wiphy) } EXPORT_SYMBOL(wiphy_free); -void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked) +void wiphy_rfkill_set_hw_state_reason(struct wiphy *wiphy, bool blocked, + enum rfkill_hard_block_reasons reason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - if (rfkill_set_hw_state(rdev->rfkill, blocked)) + if (rfkill_set_hw_state_reason(rdev->rfkill, blocked, reason)) schedule_work(&rdev->rfkill_block); } -EXPORT_SYMBOL(wiphy_rfkill_set_hw_state); +EXPORT_SYMBOL(wiphy_rfkill_set_hw_state_reason); void cfg80211_cqm_config_free(struct wireless_dev *wdev) { -- cgit v1.2.3 
From d84d13d6f6e03a7aaac9e7d064a11cf6f5087da6 Mon Sep 17 00:00:00 2001 From: Vamsi Krishna Date: Tue, 2 Mar 2021 20:20:36 +0530 Subject: nl80211: Add interface to indicate TDLS peer's HE capability Enhance enum nl80211_tdls_peer_capability to configure TDLS peer's support for HE mode. Userspace decodes the TDLS setup response frame and configures the HE mode support to driver if the peer has advertised HE mode support in TDLS setup response frame. The driver uses this information to decide whether to include HE operation IE in TDLS setup confirmation frame. Signed-off-by: Vamsi Krishna Link: https://lore.kernel.org/r/1614696636-30144-1-git-send-email-vamsin@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5e30c7f6c484..18dfe744bcb5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6298,11 +6298,13 @@ struct nl80211_vendor_cmd_info { * @NL80211_TDLS_PEER_HT: TDLS peer is HT capable. * @NL80211_TDLS_PEER_VHT: TDLS peer is VHT capable. * @NL80211_TDLS_PEER_WMM: TDLS peer is WMM capable. + * @NL80211_TDLS_PEER_HE: TDLS peer is HE capable. */ enum nl80211_tdls_peer_capability { NL80211_TDLS_PEER_HT = 1<<0, NL80211_TDLS_PEER_VHT = 1<<1, NL80211_TDLS_PEER_WMM = 1<<2, + NL80211_TDLS_PEER_HE = 1<<3, }; /** -- cgit v1.2.3 From 55f8205e7dddb2151def733cefbbf63deba9e1a5 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Tue, 30 Mar 2021 07:05:16 +0530 Subject: mac80211: Allow concurrent monitor iface and ethernet rx decap Some HW/driver can support passing ethernet rx decap frames and raw 802.11 frames for the monitor interface concurrently and via separate RX calls to mac80211. Packets going to the monitor interface(s) would be in 802.11 format and thus not have the RX_FLAG_8023 set, and 802.11 format monitoring frames should have RX_FLAG_ONLY_MONITOR set. 
Drivers doing such can enable the SUPPORTS_CONC_MON_RX_DECAP to allow using ethernet decap offload while a monitor interface is active, currently RX decapsulation offload gets disabled when a monitor interface is added. Signed-off-by: Sriram R Link: https://lore.kernel.org/r/1617068116-32253-1-git-send-email-srirrama@codeaurora.org [add proper documentation, rewrite commit message] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 7 +++++++ net/mac80211/debugfs.c | 1 + net/mac80211/iface.c | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 2d1d629e5d14..c21a0e27b35e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2399,6 +2399,12 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD: Hardware supports rx decapsulation * offload * + * @IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP: Hardware supports concurrent rx + * decapsulation offload and passing raw 802.11 frames for monitor iface. + * If this is supported, the driver must pass both 802.3 frames for real + * usage and 802.11 frames with %RX_FLAG_ONLY_MONITOR set for monitor to + * the stack. 
+ * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2453,6 +2459,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT, IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD, IEEE80211_HW_SUPPORTS_RX_DECAP_OFFLOAD, + IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 5296898875ff..9245c0421bda 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -456,6 +456,7 @@ static const char *hw_flag_names[] = { FLAG(AMPDU_KEYBORDER_SUPPORT), FLAG(SUPPORTS_TX_ENCAP_OFFLOAD), FLAG(SUPPORTS_RX_DECAP_OFFLOAD), + FLAG(SUPPORTS_CONC_MON_RX_DECAP), #undef FLAG }; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index b80c9b016b2b..b1c170939e44 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -807,7 +807,8 @@ static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdat ieee80211_iftype_supports_hdr_offload(sdata->vif.type)) { flags |= IEEE80211_OFFLOAD_DECAP_ENABLED; - if (local->monitors) + if (local->monitors && + !ieee80211_hw_check(&local->hw, SUPPORTS_CONC_MON_RX_DECAP)) flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; } else { flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; -- cgit v1.2.3 From 9885d016ffa9465f91498e7a70c413c30446ad49 Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Wed, 7 Apr 2021 22:22:49 +0200 Subject: net: phy: marvell10g: add separate structure for 88X3340 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 88X3340 contains 4 cores similar to 88X3310, but there is a difference: it does not support xaui host mode. Instead the corresponding MACTYPE means rxaui / 5gbase-r / 2500base-x / sgmii without AN Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/marvell10g.c | 58 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/marvell_phy.h | 6 ++++- 2 files changed, 61 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 2dc1317e601e..f74dfd993d8b 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -557,6 +557,21 @@ static int mv3310_init_interface(struct phy_device *phydev, int mactype) return 0; } +static int mv3340_init_interface(struct phy_device *phydev, int mactype) +{ + struct mv3310_priv *priv = dev_get_drvdata(&phydev->mdio.dev); + int err = 0; + + priv->rate_match = false; + + if (mactype == MV_V2_3340_PORT_CTRL_MACTYPE_RXAUI_NO_SGMII_AN) + priv->const_interface = PHY_INTERFACE_MODE_RXAUI; + else + err = mv3310_init_interface(phydev, mactype); + + return err; +} + static int mv3310_config_init(struct phy_device *phydev) { struct mv3310_priv *priv = dev_get_drvdata(&phydev->mdio.dev); @@ -884,6 +899,16 @@ static void mv3310_init_supported_interfaces(unsigned long *mask) __set_bit(PHY_INTERFACE_MODE_USXGMII, mask); } +static void mv3340_init_supported_interfaces(unsigned long *mask) +{ + __set_bit(PHY_INTERFACE_MODE_SGMII, mask); + __set_bit(PHY_INTERFACE_MODE_2500BASEX, mask); + __set_bit(PHY_INTERFACE_MODE_5GBASER, mask); + __set_bit(PHY_INTERFACE_MODE_RXAUI, mask); + __set_bit(PHY_INTERFACE_MODE_10GBASER, mask); + __set_bit(PHY_INTERFACE_MODE_USXGMII, mask); +} + static void mv2110_init_supported_interfaces(unsigned long *mask) { __set_bit(PHY_INTERFACE_MODE_SGMII, mask); @@ -903,6 +928,16 @@ static const struct mv3310_chip mv3310_type = { #endif }; +static const struct mv3310_chip mv3340_type = { + .init_supported_interfaces = mv3340_init_supported_interfaces, + .get_mactype = mv3310_get_mactype, + .init_interface = mv3340_init_interface, + +#ifdef CONFIG_HWMON + .hwmon_read_temp_reg = mv3310_hwmon_read_temp_reg, +#endif +}; + static const struct mv3310_chip 
mv2110_type = { .init_supported_interfaces = mv2110_init_supported_interfaces, .get_mactype = mv2110_get_mactype, @@ -916,7 +951,7 @@ static const struct mv3310_chip mv2110_type = { static struct phy_driver mv3310_drivers[] = { { .phy_id = MARVELL_PHY_ID_88X3310, - .phy_id_mask = MARVELL_PHY_ID_MASK, + .phy_id_mask = MARVELL_PHY_ID_88X33X0_MASK, .name = "mv88x3310", .driver_data = &mv3310_type, .get_features = mv3310_get_features, @@ -932,6 +967,24 @@ static struct phy_driver mv3310_drivers[] = { .remove = mv3310_remove, .set_loopback = genphy_c45_loopback, }, + { + .phy_id = MARVELL_PHY_ID_88X3340, + .phy_id_mask = MARVELL_PHY_ID_88X33X0_MASK, + .name = "mv88x3340", + .driver_data = &mv3340_type, + .get_features = mv3310_get_features, + .config_init = mv3310_config_init, + .probe = mv3310_probe, + .suspend = mv3310_suspend, + .resume = mv3310_resume, + .config_aneg = mv3310_config_aneg, + .aneg_done = mv3310_aneg_done, + .read_status = mv3310_read_status, + .get_tunable = mv3310_get_tunable, + .set_tunable = mv3310_set_tunable, + .remove = mv3310_remove, + .set_loopback = genphy_c45_loopback, + }, { .phy_id = MARVELL_PHY_ID_88E2110, .phy_id_mask = MARVELL_PHY_ID_MASK, @@ -954,7 +1007,8 @@ static struct phy_driver mv3310_drivers[] = { module_phy_driver(mv3310_drivers); static struct mdio_device_id __maybe_unused mv3310_tbl[] = { - { MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_MASK }, + { MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_88X33X0_MASK }, + { MARVELL_PHY_ID_88X3340, MARVELL_PHY_ID_88X33X0_MASK }, { MARVELL_PHY_ID_88E2110, MARVELL_PHY_ID_MASK }, { }, }; diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index 274abd5fbac3..6b11a5411082 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -22,10 +22,14 @@ #define MARVELL_PHY_ID_88E1545 0x01410ea0 #define MARVELL_PHY_ID_88E1548P 0x01410ec0 #define MARVELL_PHY_ID_88E3016 0x01410e60 -#define MARVELL_PHY_ID_88X3310 0x002b09a0 #define MARVELL_PHY_ID_88E2110 0x002b09b0 #define 
MARVELL_PHY_ID_88X2222 0x01410f10 +/* PHY IDs and mask for Alaska 10G PHYs */ +#define MARVELL_PHY_ID_88X33X0_MASK 0xfffffff8 +#define MARVELL_PHY_ID_88X3310 0x002b09a0 +#define MARVELL_PHY_ID_88X3340 0x002b09a8 + /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */ #define MARVELL_PHY_ID_88E1111_FINISAR 0x01ff0cc0 -- cgit v1.2.3 From 53f111cbfac6f2000c604994ddf01d3299d7de6b Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Wed, 7 Apr 2021 22:22:51 +0200 Subject: net: phy: add constants for 2.5G and 5G speed in PCS speed register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add constants for 2.5G and 5G speed in PCS speed register into mdio.h. Signed-off-by: Marek Behún Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/mdio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 3f302e2523b2..bdf77dffa5a4 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -120,6 +120,8 @@ #define MDIO_PMA_SPEED_100 0x0020 /* 100M capable */ #define MDIO_PMA_SPEED_10 0x0040 /* 10M capable */ #define MDIO_PCS_SPEED_10P2B 0x0002 /* 10PASS-TS/2BASE-TL capable */ +#define MDIO_PCS_SPEED_2_5G 0x0040 /* 2.5G capable */ +#define MDIO_PCS_SPEED_5G 0x0080 /* 5G capable */ /* Device present registers. */ #define MDIO_DEVS_PRESENT(devad) (1 << (devad)) -- cgit v1.2.3 From fbe82b3db3e58edca33a7e7d9157eb7bdda6e537 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Thu, 8 Apr 2021 15:00:41 +0800 Subject: net: qed: remove unused including Remove including that don't need it. Signed-off-by: Tian Tao Signed-off-by: Zhiqi Song Signed-off-by: David S. 
Miller --- include/linux/qed/qed_ll2_if.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 2f64ed79cee9..ea273ba1c991 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From b98b33043c95a3886b5feb73814f8882a9cceaad Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 8 Apr 2021 19:45:02 +0200 Subject: net: dccp: use net_generic storage DCCP is virtually never used, so no need to use space in struct net for it. Put the pernet ipv4/v6 socket in the dccp ipv4/ipv6 modules instead. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20210408174502.1625-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 4 ---- include/net/netns/dccp.h | 12 ------------ net/dccp/ipv4.c | 24 ++++++++++++++++++++---- net/dccp/ipv6.c | 24 ++++++++++++++++++++---- 4 files changed, 40 insertions(+), 24 deletions(-) delete mode 100644 include/net/netns/dccp.h (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 3802c8322ab0..fa5887143f0d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) @@ -130,9 +129,6 @@ struct net { #if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE) struct netns_sctp sctp; #endif -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) - struct netns_dccp dccp; -#endif #ifdef CONFIG_NETFILTER struct netns_nf nf; struct netns_xt xt; diff --git a/include/net/netns/dccp.h b/include/net/netns/dccp.h deleted file mode 100644 index cdbc4f5b8390..000000000000 --- a/include/net/netns/dccp.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NETNS_DCCP_H__ 
-#define __NETNS_DCCP_H__ - -struct sock; - -struct netns_dccp { - struct sock *v4_ctl_sk; - struct sock *v6_ctl_sk; -}; - -#endif diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 2455b0c0e486..ffc601a3b329 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -23,14 +23,21 @@ #include #include #include +#include #include "ackvec.h" #include "ccid.h" #include "dccp.h" #include "feat.h" +struct dccp_v4_pernet { + struct sock *v4_ctl_sk; +}; + +static unsigned int dccp_v4_pernet_id __read_mostly; + /* - * The per-net dccp.v4_ctl_sk socket is used for responding to + * The per-net v4_ctl_sk socket is used for responding to * the Out-of-the-blue (OOTB) packets. A control sock will be created * for this socket at the initialization time. */ @@ -513,7 +520,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) struct sk_buff *skb; struct dst_entry *dst; struct net *net = dev_net(skb_dst(rxskb)->dev); - struct sock *ctl_sk = net->dccp.v4_ctl_sk; + struct dccp_v4_pernet *pn; + struct sock *ctl_sk; /* Never send a reset in response to a reset. 
*/ if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) @@ -522,6 +530,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) if (skb_rtable(rxskb)->rt_type != RTN_LOCAL) return; + pn = net_generic(net, dccp_v4_pernet_id); + ctl_sk = pn->v4_ctl_sk; dst = dccp_v4_route_skb(net, ctl_sk, rxskb); if (dst == NULL) return; @@ -1005,16 +1015,20 @@ static struct inet_protosw dccp_v4_protosw = { static int __net_init dccp_v4_init_net(struct net *net) { + struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); + if (dccp_hashinfo.bhash == NULL) return -ESOCKTNOSUPPORT; - return inet_ctl_sock_create(&net->dccp.v4_ctl_sk, PF_INET, + return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET, SOCK_DCCP, IPPROTO_DCCP, net); } static void __net_exit dccp_v4_exit_net(struct net *net) { - inet_ctl_sock_destroy(net->dccp.v4_ctl_sk); + struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); + + inet_ctl_sock_destroy(pn->v4_ctl_sk); } static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) @@ -1026,6 +1040,8 @@ static struct pernet_operations dccp_v4_ops = { .init = dccp_v4_init_net, .exit = dccp_v4_exit_net, .exit_batch = dccp_v4_exit_batch, + .id = &dccp_v4_pernet_id, + .size = sizeof(struct dccp_v4_pernet), }; static int __init dccp_v4_init(void) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 2be5c69824f9..6f5304db5a67 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -27,13 +27,20 @@ #include #include #include +#include #include #include "dccp.h" #include "ipv6.h" #include "feat.h" -/* The per-net dccp.v6_ctl_sk is used for sending RSTs and ACKs */ +struct dccp_v6_pernet { + struct sock *v6_ctl_sk; +}; + +static unsigned int dccp_v6_pernet_id __read_mostly; + +/* The per-net v6_ctl_sk is used for sending RSTs and ACKs */ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; @@ -254,7 +261,8 @@ static void dccp_v6_ctl_send_reset(const 
struct sock *sk, struct sk_buff *rxskb) struct sk_buff *skb; struct flowi6 fl6; struct net *net = dev_net(skb_dst(rxskb)->dev); - struct sock *ctl_sk = net->dccp.v6_ctl_sk; + struct dccp_v6_pernet *pn; + struct sock *ctl_sk; struct dst_entry *dst; if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) @@ -263,6 +271,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) if (!ipv6_unicast_destination(rxskb)) return; + pn = net_generic(net, dccp_v6_pernet_id); + ctl_sk = pn->v6_ctl_sk; skb = dccp_ctl_make_reset(ctl_sk, rxskb); if (skb == NULL) return; @@ -1089,16 +1099,20 @@ static struct inet_protosw dccp_v6_protosw = { static int __net_init dccp_v6_init_net(struct net *net) { + struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); + if (dccp_hashinfo.bhash == NULL) return -ESOCKTNOSUPPORT; - return inet_ctl_sock_create(&net->dccp.v6_ctl_sk, PF_INET6, + return inet_ctl_sock_create(&pn->v6_ctl_sk, PF_INET6, SOCK_DCCP, IPPROTO_DCCP, net); } static void __net_exit dccp_v6_exit_net(struct net *net) { - inet_ctl_sock_destroy(net->dccp.v6_ctl_sk); + struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); + + inet_ctl_sock_destroy(pn->v6_ctl_sk); } static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) @@ -1110,6 +1124,8 @@ static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, .exit_batch = dccp_v6_exit_batch, + .id = &dccp_v6_pernet_id, + .size = sizeof(struct dccp_v6_pernet), }; static int __init dccp_v6_init(void) -- cgit v1.2.3 From fba863b816049b03f3fbb07b10ebdcfe5c4141f7 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 7 Apr 2021 17:51:56 +0200 Subject: net: phy: make PHY PM ops a no-op if MAC driver manages PHY PM Resume callback of the PHY driver is called after the one for the MAC driver. The PHY driver resume callback calls phy_init_hw(), and this is potentially problematic if the MAC driver calls phy_start() in its resume callback. 
One issue was reported with the fec driver and a KSZ8081 PHY which seems to become unstable if a soft reset is triggered during aneg. The new flag allows MAC drivers to indicate that they take care of suspending/resuming the PHY. Then the MAC PM callbacks can handle any dependency between MAC and PHY PM. Signed-off-by: Heiner Kallweit Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 6 ++++++ include/linux/phy.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index a009d1769b08..73d29fd5e03d 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -273,6 +273,9 @@ static __maybe_unused int mdio_bus_phy_suspend(struct device *dev) { struct phy_device *phydev = to_phy_device(dev); + if (phydev->mac_managed_pm) + return 0; + /* We must stop the state machine manually, otherwise it stops out of * control, possibly with the phydev->lock held. Upon resume, netdev * may call phy routines that try to grab the same lock, and that may @@ -294,6 +297,9 @@ static __maybe_unused int mdio_bus_phy_resume(struct device *dev) struct phy_device *phydev = to_phy_device(dev); int ret; + if (phydev->mac_managed_pm) + return 0; + if (!phydev->suspended_by_mdio_bus) goto no_resume; diff --git a/include/linux/phy.h b/include/linux/phy.h index 8e2cf84b2318..98fb441dd72e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -493,6 +493,7 @@ struct macsec_ops; * @loopback_enabled: Set true if this PHY has been loopbacked successfully. * @downshifted_rate: Set true if link speed has been downshifted. * @is_on_sfp_module: Set true if PHY is located on an SFP module. + * @mac_managed_pm: Set true if MAC driver takes of suspending/resuming PHY * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. 
* @irq: IRQ number of the PHY's interrupt (-1 if none) @@ -567,6 +568,7 @@ struct phy_device { unsigned loopback_enabled:1; unsigned downshifted_rate:1; unsigned is_on_sfp_module:1; + unsigned mac_managed_pm:1; unsigned autoneg:1; /* The most recently read link state */ -- cgit v1.2.3 From c781ff12a2f37a9795e13bf328e5053d3e69f9e0 Mon Sep 17 00:00:00 2001 From: Vladyslav Tarasiuk Date: Fri, 9 Apr 2021 11:06:34 +0300 Subject: ethtool: Allow network drivers to dump arbitrary EEPROM data Define get_module_eeprom_by_page() ethtool callback and implement netlink infrastructure. get_module_eeprom_by_page() allows network drivers to dump a part of module's EEPROM specified by page and bank numbers along with offset and length. It is effectively a netlink replacement for get_module_info() and get_module_eeprom() pair, which is needed due to emergence of complex non-linear EEPROM layouts. Signed-off-by: Vladyslav Tarasiuk Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 36 +++++- include/linux/ethtool.h | 33 +++++- include/uapi/linux/ethtool_netlink.h | 19 +++ net/ethtool/Makefile | 2 +- net/ethtool/eeprom.c | 171 +++++++++++++++++++++++++++ net/ethtool/netlink.c | 11 ++ net/ethtool/netlink.h | 2 + 7 files changed, 270 insertions(+), 4 deletions(-) create mode 100644 net/ethtool/eeprom.c (limited to 'include') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index ce4a69f8308f..bbecffc7b11a 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1338,6 +1338,38 @@ in an implementation specific way. ``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP module parameters. This does not mean autonegotiation. +MODULE_EEPROM +============= + +Fetch module EEPROM data dump. +This interface is designed to allow dumps of at most 1/2 page at once. 
This +means only dumps of 128 (or less) bytes are allowed, without crossing half page +boundary located at offset 128. For pages other than 0 only high 128 bytes are +accessible. + +Request contents: + + ======================================= ====== ========================== + ``ETHTOOL_A_MODULE_EEPROM_HEADER`` nested request header + ``ETHTOOL_A_MODULE_EEPROM_OFFSET`` u32 offset within a page + ``ETHTOOL_A_MODULE_EEPROM_LENGTH`` u32 amount of bytes to read + ``ETHTOOL_A_MODULE_EEPROM_PAGE`` u8 page number + ``ETHTOOL_A_MODULE_EEPROM_BANK`` u8 bank number + ``ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS`` u8 page I2C address + ======================================= ====== ========================== + +Kernel response contents: + + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_MODULE_EEPROM_HEADER`` | nested | reply header | + +---------------------------------------------+--------+---------------------+ + | ``ETHTOOL_A_MODULE_EEPROM_DATA`` | nested | array of bytes from | + | | | module EEPROM | + +---------------------------------------------+--------+---------------------+ + +``ETHTOOL_A_MODULE_EEPROM_DATA`` has an attribute length equal to the amount of +bytes driver actually read. + Request translation =================== @@ -1415,8 +1447,8 @@ are netlink only. 
``ETHTOOL_GET_DUMP_FLAG`` n/a ``ETHTOOL_GET_DUMP_DATA`` n/a ``ETHTOOL_GET_TS_INFO`` ``ETHTOOL_MSG_TSINFO_GET`` - ``ETHTOOL_GMODULEINFO`` n/a - ``ETHTOOL_GMODULEEEPROM`` n/a + ``ETHTOOL_GMODULEINFO`` ``ETHTOOL_MSG_MODULE_EEPROM_GET`` + ``ETHTOOL_GMODULEEEPROM`` ``ETHTOOL_MSG_MODULE_EEPROM_GET`` ``ETHTOOL_GEEE`` ``ETHTOOL_MSG_EEE_GET`` ``ETHTOOL_SEEE`` ``ETHTOOL_MSG_EEE_SET`` ``ETHTOOL_GRSSH`` n/a diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 4290e2fa3117..9f6f323af59a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -81,6 +81,7 @@ enum { #define ETH_RSS_HASH_NO_CHANGE 0 struct net_device; +struct netlink_ext_ack; /* Some generic methods drivers may use in their ethtool_ops */ u32 ethtool_op_get_link(struct net_device *dev); @@ -262,6 +263,31 @@ struct ethtool_pause_stats { u64 rx_pause_frames; }; +#define ETH_MODULE_EEPROM_PAGE_LEN 128 +#define ETH_MODULE_MAX_I2C_ADDRESS 0x7f + +/** + * struct ethtool_module_eeprom - EEPROM dump from specified page + * @offset: Offset within the specified EEPROM page to begin read, in bytes. + * @length: Number of bytes to read. + * @page: Page number to read from. + * @bank: Page bank number to read from, if applicable by EEPROM spec. + * @i2c_address: I2C address of a page. Value less than 0x7f expected. Most + * EEPROMs use 0x50 or 0x51. + * @data: Pointer to buffer with EEPROM data of @length size. + * + * This can be used to manage pages during EEPROM dump in ethtool and pass + * required information to the driver. + */ +struct ethtool_module_eeprom { + __u32 offset; + __u32 length; + __u8 page; + __u8 bank; + __u8 i2c_address; + __u8 *data; +}; + /** * struct ethtool_ops - optional netdev operations * @cap_link_lanes_supported: indicates if the driver supports lanes @@ -414,6 +440,9 @@ struct ethtool_pause_stats { * cannot use the standard PHY library helpers. * @get_phy_tunable: Read the value of a PHY tunable. * @set_phy_tunable: Set the value of a PHY tunable. 
+ * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from + * specified page. Returns a negative error code or the amount of bytes + * read. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -519,6 +548,9 @@ struct ethtool_ops { const struct ethtool_tunable *, void *); int (*set_phy_tunable)(struct net_device *, const struct ethtool_tunable *, const void *); + int (*get_module_eeprom_by_page)(struct net_device *dev, + const struct ethtool_module_eeprom *page, + struct netlink_ext_ack *extack); }; int ethtool_check_ops(const struct ethtool_ops *ops); @@ -542,7 +574,6 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd, u32 *dev_speed, u8 *dev_duplex); -struct netlink_ext_ack; struct phy_device; struct phy_tdr_config; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 7f1bdb5b31ba..9612dcd48a6a 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -44,6 +44,7 @@ enum { ETHTOOL_MSG_TUNNEL_INFO_GET, ETHTOOL_MSG_FEC_GET, ETHTOOL_MSG_FEC_SET, + ETHTOOL_MSG_MODULE_EEPROM_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -84,6 +85,7 @@ enum { ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, ETHTOOL_MSG_FEC_GET_REPLY, ETHTOOL_MSG_FEC_NTF, + ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -646,6 +648,23 @@ enum { ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) }; +/* MODULE EEPROM */ + +enum { + ETHTOOL_A_MODULE_EEPROM_UNSPEC, + ETHTOOL_A_MODULE_EEPROM_HEADER, /* nest - _A_HEADER_* */ + + ETHTOOL_A_MODULE_EEPROM_OFFSET, /* u32 */ + ETHTOOL_A_MODULE_EEPROM_LENGTH, /* u32 */ + ETHTOOL_A_MODULE_EEPROM_PAGE, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_BANK, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS, /* u8 */ + ETHTOOL_A_MODULE_EEPROM_DATA, /* nested */ + + 
__ETHTOOL_A_MODULE_EEPROM_CNT, + ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index c2dc9033a8f7..83842685fd8c 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o + tunnels.o fec.o eeprom.o diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c new file mode 100644 index 000000000000..8536dd905da5 --- /dev/null +++ b/net/ethtool/eeprom.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool.h> +#include "netlink.h" +#include "common.h" + +struct eeprom_req_info { + struct ethnl_req_info base; + u32 offset; + u32 length; + u8 page; + u8 bank; + u8 i2c_address; +}; + +struct eeprom_reply_data { + struct ethnl_reply_data base; + u32 length; + u8 *data; +}; + +#define MODULE_EEPROM_REQINFO(__req_base) \ + container_of(__req_base, struct eeprom_req_info, base) + +#define MODULE_EEPROM_REPDATA(__reply_base) \ + container_of(__reply_base, struct eeprom_reply_data, base) + +static int eeprom_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); + struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); + struct ethtool_module_eeprom page_data = {0}; + struct net_device *dev = reply_base->dev; + int ret; + + if (!dev->ethtool_ops->get_module_eeprom_by_page) + return -EOPNOTSUPP; + + page_data.offset = request->offset; + page_data.length = request->length; + page_data.i2c_address = request->i2c_address; + page_data.page = request->page; + page_data.bank =
request->bank; + page_data.data = kmalloc(page_data.length, GFP_KERNEL); + if (!page_data.data) + return -ENOMEM; + + ret = ethnl_ops_begin(dev); + if (ret) + goto err_free; + + ret = dev->ethtool_ops->get_module_eeprom_by_page(dev, &page_data, + info->extack); + if (ret < 0) + goto err_ops; + + reply->length = ret; + reply->data = page_data.data; + + ethnl_ops_complete(dev); + return 0; + +err_ops: + ethnl_ops_complete(dev); +err_free: + kfree(page_data.data); + return ret; +} + +static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info); + + if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] || + !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] || + !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] || + !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]) + return -EINVAL; + + request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]); + request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]); + request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]); + + if (!request->length) + return -EINVAL; + + /* The following set of conditions limit the API to only dump 1/2 + * EEPROM page without crossing low page boundary located at offset 128. + * This means user may only request dumps of length limited to 128 from + * either low 128 bytes or high 128 bytes. + * For pages higher than 0 only high 128 bytes are accessible. 
+ */ + request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]); + if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE], + "reading from lower half page is allowed for page 0 only"); + return -EINVAL; + } + + if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN && + request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], + "reading cross half page boundary is illegal"); + return -EINVAL; + } else if (request->offset >= ETH_MODULE_EEPROM_PAGE_LEN * 2) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_OFFSET], + "offset is out of bounds"); + return -EINVAL; + } else if (request->offset + request->length > ETH_MODULE_EEPROM_PAGE_LEN * 2) { + NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH], + "reading cross page boundary is illegal"); + return -EINVAL; + } + + if (tb[ETHTOOL_A_MODULE_EEPROM_BANK]) + request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]); + + return 0; +} + +static int eeprom_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base); + + return nla_total_size(sizeof(u8) * request->length); /* _EEPROM_DATA */ +} + +static int eeprom_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); + + return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, reply->length, reply->data); +} + +static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base) +{ + struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base); + + kfree(reply->data); +} + +const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { + .request_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, + .reply_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, + .hdr_attr = 
ETHTOOL_A_MODULE_EEPROM_HEADER, + .req_info_size = sizeof(struct eeprom_req_info), + .reply_data_size = sizeof(struct eeprom_reply_data), + + .parse_request = eeprom_parse_request, + .prepare_data = eeprom_prepare_data, + .reply_size = eeprom_reply_size, + .fill_reply = eeprom_fill_reply, + .cleanup_data = eeprom_cleanup_data, +}; + +const struct nla_policy ethnl_module_eeprom_get_policy[] = { + [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_MODULE_EEPROM_OFFSET] = { .type = NLA_U32 }, + [ETHTOOL_A_MODULE_EEPROM_LENGTH] = { .type = NLA_U32 }, + [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 }, + [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 }, + [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] = + NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS), +}; + diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 705a4b201564..5f5d7c4b3d4a 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -246,6 +246,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_EEE_GET] = &ethnl_eee_request_ops, [ETHTOOL_MSG_FEC_GET] = &ethnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops, + [ETHTOOL_MSG_MODULE_EEPROM_GET] = &ethnl_module_eeprom_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -931,6 +932,16 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_fec_set_policy, .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_module_eeprom_get_policy, + .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 785f7ee45930..4305ac971bb0 100644 --- a/net/ethtool/netlink.h +++
b/net/ethtool/netlink.h @@ -345,6 +345,7 @@ extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; +extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -378,6 +379,7 @@ extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_T extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; +extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From e19b0a3474ab9ef90dd110af9f39fc87329755f1 Mon Sep 17 00:00:00 2001 From: Vladyslav Tarasiuk Date: Fri, 9 Apr 2021 11:06:35 +0300 Subject: net/mlx5: Refactor module EEPROM query Prepare for ethtool_ops::get_module_eeprom_data() implementation by extracting common part of mlx5_query_module_eeprom() into a separate function. Signed-off-by: Vladyslav Tarasiuk Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 79 +++++++++++++++----------- include/linux/mlx5/port.h | 9 +++ 2 files changed, 54 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 4bb219565c58..9b9f870d67a4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -353,67 +353,78 @@ static void mlx5_sfp_eeprom_params_set(u16 *i2c_addr, int *page_num, u16 *offset *offset -= MLX5_EEPROM_PAGE_LENGTH; } -int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, - u16 offset, u16 size, u8 *data) +static int mlx5_query_mcia(struct mlx5_core_dev *dev, + struct mlx5_module_eeprom_query_params *params, u8 *data) { - int module_num, status, err, page_num = 0; u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {}; u32 out[MLX5_ST_SZ_DW(mcia_reg)]; - u16 i2c_addr = 0; - u8 module_id; + int status, err; void *ptr; + u16 size; + + size = min_t(int, params->size, MLX5_EEPROM_MAX_BYTES); + + MLX5_SET(mcia_reg, in, l, 0); + MLX5_SET(mcia_reg, in, size, size); + MLX5_SET(mcia_reg, in, module, params->module_number); + MLX5_SET(mcia_reg, in, device_address, params->offset); + MLX5_SET(mcia_reg, in, page_number, params->page); + MLX5_SET(mcia_reg, in, i2c_device_address, params->i2c_address); - err = mlx5_query_module_num(dev, &module_num); + err = mlx5_core_access_reg(dev, in, sizeof(in), out, + sizeof(out), MLX5_REG_MCIA, 0, 0); if (err) return err; - err = mlx5_query_module_id(dev, module_num, &module_id); + status = MLX5_GET(mcia_reg, out, status); + if (status) { + mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n", + status); + return -EIO; + } + + ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0); + memcpy(data, ptr, size); + + return size; +} + +int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, + u16 offset, u16 size, u8 *data) +{ + struct mlx5_module_eeprom_query_params query = {0}; + u8 module_id; + 
int err; + + err = mlx5_query_module_num(dev, &query.module_number); + if (err) + return err; + + err = mlx5_query_module_id(dev, query.module_number, &module_id); if (err) return err; switch (module_id) { case MLX5_MODULE_ID_SFP: - mlx5_sfp_eeprom_params_set(&i2c_addr, &page_num, &offset); + mlx5_sfp_eeprom_params_set(&query.i2c_address, &query.page, &query.offset); break; case MLX5_MODULE_ID_QSFP: case MLX5_MODULE_ID_QSFP_PLUS: case MLX5_MODULE_ID_QSFP28: - mlx5_qsfp_eeprom_params_set(&i2c_addr, &page_num, &offset); + mlx5_qsfp_eeprom_params_set(&query.i2c_address, &query.page, &query.offset); break; default: mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id); return -EINVAL; } - if (offset + size > MLX5_EEPROM_PAGE_LENGTH) + if (query.offset + size > MLX5_EEPROM_PAGE_LENGTH) /* Cross pages read, read until offset 256 in low page */ size -= offset + size - MLX5_EEPROM_PAGE_LENGTH; - size = min_t(int, size, MLX5_EEPROM_MAX_BYTES); + query.size = size; - MLX5_SET(mcia_reg, in, l, 0); - MLX5_SET(mcia_reg, in, module, module_num); - MLX5_SET(mcia_reg, in, i2c_device_address, i2c_addr); - MLX5_SET(mcia_reg, in, page_number, page_num); - MLX5_SET(mcia_reg, in, device_address, offset); - MLX5_SET(mcia_reg, in, size, size); - - err = mlx5_core_access_reg(dev, in, sizeof(in), out, - sizeof(out), MLX5_REG_MCIA, 0, 0); - if (err) - return err; - - status = MLX5_GET(mcia_reg, out, status); - if (status) { - mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n", - status); - return -EIO; - } - - ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0); - memcpy(data, ptr, size); - - return size; + return mlx5_query_mcia(dev, &query, data); } EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom); diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 23edd2db4803..90b87aa82db3 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -62,6 +62,15 @@ enum mlx5_an_status { #define MLX5_EEPROM_PAGE_LENGTH 256 #define MLX5_EEPROM_HIGH_PAGE_LENGTH 128 
+struct mlx5_module_eeprom_query_params { + u16 size; + u16 offset; + u16 i2c_address; + u32 page; + u32 bank; + u32 module_number; +}; + enum mlx5e_link_mode { MLX5E_1000BASE_CX_SGMII = 0, MLX5E_1000BASE_KX = 1, -- cgit v1.2.3 From e109d2b204daa223e6d3cdaa369071c3ea96dcbf Mon Sep 17 00:00:00 2001 From: Vladyslav Tarasiuk Date: Fri, 9 Apr 2021 11:06:36 +0300 Subject: net/mlx5: Implement get_module_eeprom_by_page() Implement ethtool_ops::get_module_eeprom_by_page() to enable support of new SFP standards. Signed-off-by: Vladyslav Tarasiuk Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 44 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/port.c | 41 ++++++++++++++++++++ include/linux/mlx5/port.h | 2 + 3 files changed, 87 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index b185a0452629..c8057a44d5ab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1770,6 +1770,49 @@ static int mlx5e_get_module_eeprom(struct net_device *netdev, return 0; } +static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev, + const struct ethtool_module_eeprom *page_data, + struct netlink_ext_ack *extack) +{ + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_module_eeprom_query_params query; + struct mlx5_core_dev *mdev = priv->mdev; + u8 *data = page_data->data; + int size_read; + int i = 0; + + if (!page_data->length) + return -EINVAL; + + memset(data, 0, page_data->length); + + query.offset = page_data->offset; + query.i2c_address = page_data->i2c_address; + query.bank = page_data->bank; + query.page = page_data->page; + while (i < page_data->length) { + query.size = page_data->length - i; + size_read = mlx5_query_module_eeprom_by_page(mdev, &query, data + i); + + /* Done reading, return how many bytes was read */ + if 
(!size_read) + return i; + + if (size_read == -EINVAL) + return -EINVAL; + if (size_read < 0) { + netdev_err(priv->netdev, "%s: mlx5_query_module_eeprom_by_page failed:0x%x\n", + __func__, size_read); + return i; + } + + i += size_read; + query.offset += size_read; + } + + return i; +} + int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, struct ethtool_flash *flash) { @@ -2159,6 +2202,7 @@ const struct ethtool_ops mlx5e_ethtool_ops = { .set_wol = mlx5e_set_wol, .get_module_info = mlx5e_get_module_info, .get_module_eeprom = mlx5e_get_module_eeprom, + .get_module_eeprom_by_page = mlx5e_get_module_eeprom_by_page, .flash_device = mlx5e_flash_device, .get_priv_flags = mlx5e_get_priv_flags, .set_priv_flags = mlx5e_set_priv_flags, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 9b9f870d67a4..522a41f8f1e2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -428,6 +428,47 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom); +int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, + struct mlx5_module_eeprom_query_params *params, + u8 *data) +{ + u8 module_id; + int err; + + err = mlx5_query_module_num(dev, &params->module_number); + if (err) + return err; + + err = mlx5_query_module_id(dev, params->module_number, &module_id); + if (err) + return err; + + switch (module_id) { + case MLX5_MODULE_ID_SFP: + if (params->page > 0) + return -EINVAL; + break; + case MLX5_MODULE_ID_QSFP: + case MLX5_MODULE_ID_QSFP28: + case MLX5_MODULE_ID_QSFP_PLUS: + if (params->page > 3) + return -EINVAL; + break; + default: + mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id); + return -EINVAL; + } + + if (params->i2c_address != MLX5_I2C_ADDR_HIGH && + params->i2c_address != MLX5_I2C_ADDR_LOW) { + mlx5_core_err(dev, "I2C address not recognized: 0x%x\n", params->i2c_address); + return -EINVAL;
+ } + + return mlx5_query_mcia(dev, params, data); +} +EXPORT_SYMBOL_GPL(mlx5_query_module_eeprom_by_page); + static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc, int pvlc_size, u8 local_port) { diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 90b87aa82db3..58d56adb9842 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -209,6 +209,8 @@ void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported, bool *enabled); int mlx5_query_module_eeprom(struct mlx5_core_dev *dev, u16 offset, u16 size, u8 *data); +int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, + struct mlx5_module_eeprom_query_params *params, u8 *data); int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); -- cgit v1.2.3 From 4c88fa412a100f925b8ab1aa952a672895f69d35 Mon Sep 17 00:00:00 2001 From: Vladyslav Tarasiuk Date: Fri, 9 Apr 2021 11:06:37 +0300 Subject: net/mlx5: Add support for DSFP module EEPROM dumps Allow the driver to recognise DSFP transceiver module ID and therefore allow its EEPROM dumps using ethtool. Signed-off-by: Vladyslav Tarasiuk Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx5/core/port.c | 2 ++ include/linux/mlx5/port.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 522a41f8f1e2..1ef2b6a848c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -454,6 +454,8 @@ int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev, if (params->page > 3) return -EINVAL; break; + case MLX5_MODULE_ID_DSFP: + break; default: mlx5_core_err(dev, "Module ID not recognized: 0x%x\n", module_id); return -EINVAL; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 58d56adb9842..77ea4f9c5265 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -45,6 +45,7 @@ enum mlx5_module_id { MLX5_MODULE_ID_QSFP = 0xC, MLX5_MODULE_ID_QSFP_PLUS = 0xD, MLX5_MODULE_ID_QSFP28 = 0x11, + MLX5_MODULE_ID_DSFP = 0x1B, }; enum mlx5_an_status { -- cgit v1.2.3 From d740513f05a24b1a46722325974223980f068728 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 9 Apr 2021 11:06:40 +0300 Subject: phy: sfp: add netlink SFP support to generic SFP code The new netlink API for reading SFP data requires a new op to be implemented. The idea of the new netlink SFP code is that userspace is responsible to parsing the EEPROM data and requesting pages, rather than have the kernel decide what pages are interesting and returning them. This allows greater flexibility for newer formats. Currently the generic SFP code only supports simple SFPs. Allow i2c address 0x50 and 0x51 to be accessed with page and bank must always be 0. This interface will later be extended when for example QSFP support is added. Signed-off-by: Andrew Lunn Signed-off-by: Vladyslav Tarasiuk Signed-off-by: David S. 
Miller --- drivers/net/phy/sfp-bus.c | 20 ++++++++++++++++++++ drivers/net/phy/sfp.c | 25 +++++++++++++++++++++++++ drivers/net/phy/sfp.h | 3 +++ include/linux/sfp.h | 10 ++++++++++ 4 files changed, 58 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index 2e11176c6b94..e61de66e973b 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -555,6 +555,26 @@ int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, } EXPORT_SYMBOL_GPL(sfp_get_module_eeprom); +/** + * sfp_get_module_eeprom_by_page() - Read a page from the SFP module EEPROM + * @bus: a pointer to the &struct sfp_bus structure for the sfp module + * @page: a &struct ethtool_module_eeprom + * @extack: extack for reporting problems + * + * Read an EEPROM page as specified by the supplied @page. See the + * documentation for &struct ethtool_module_eeprom for the page to be read. + * + * Returns 0 on success or a negative errno number. More error + * information might be provided via extack + */ +int sfp_get_module_eeprom_by_page(struct sfp_bus *bus, + const struct ethtool_module_eeprom *page, + struct netlink_ext_ack *extack) +{ + return bus->socket_ops->module_eeprom_by_page(bus->sfp, page, extack); +} +EXPORT_SYMBOL_GPL(sfp_get_module_eeprom_by_page); + /** * sfp_upstream_start() - Inform the SFP that the network device is up * @bus: a pointer to the &struct sfp_bus structure for the sfp module diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 7998acc689b7..37f722c763d7 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -2330,6 +2330,30 @@ static int sfp_module_eeprom(struct sfp *sfp, struct ethtool_eeprom *ee, return 0; } +static int sfp_module_eeprom_by_page(struct sfp *sfp, + const struct ethtool_module_eeprom *page, + struct netlink_ext_ack *extack) +{ + if (page->bank) { + NL_SET_ERR_MSG(extack, "Banks not supported"); + return -EOPNOTSUPP; + } + + if (page->page) { + 
NL_SET_ERR_MSG(extack, "Only page 0 supported"); + return -EOPNOTSUPP; + } + + if (page->i2c_address != 0x50 && + page->i2c_address != 0x51) { + NL_SET_ERR_MSG(extack, "Only address 0x50 and 0x51 supported"); + return -EOPNOTSUPP; + } + + return sfp_read(sfp, page->i2c_address == 0x51, page->offset, + page->data, page->length); +}; + static const struct sfp_socket_ops sfp_module_ops = { .attach = sfp_attach, .detach = sfp_detach, @@ -2337,6 +2361,7 @@ static const struct sfp_socket_ops sfp_module_ops = { .stop = sfp_stop, .module_info = sfp_module_info, .module_eeprom = sfp_module_eeprom, + .module_eeprom_by_page = sfp_module_eeprom_by_page, }; static void sfp_timeout(struct work_struct *work) diff --git a/drivers/net/phy/sfp.h b/drivers/net/phy/sfp.h index b83f70526270..27226535c72b 100644 --- a/drivers/net/phy/sfp.h +++ b/drivers/net/phy/sfp.h @@ -14,6 +14,9 @@ struct sfp_socket_ops { int (*module_info)(struct sfp *sfp, struct ethtool_modinfo *modinfo); int (*module_eeprom)(struct sfp *sfp, struct ethtool_eeprom *ee, u8 *data); + int (*module_eeprom_by_page)(struct sfp *sfp, + const struct ethtool_module_eeprom *page, + struct netlink_ext_ack *extack); }; int sfp_add_phy(struct sfp_bus *bus, struct phy_device *phydev); diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 38893e4dd0f0..302094b855fb 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -542,6 +542,9 @@ phy_interface_t sfp_select_interface(struct sfp_bus *bus, int sfp_get_module_info(struct sfp_bus *bus, struct ethtool_modinfo *modinfo); int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, u8 *data); +int sfp_get_module_eeprom_by_page(struct sfp_bus *bus, + const struct ethtool_module_eeprom *page, + struct netlink_ext_ack *extack); void sfp_upstream_start(struct sfp_bus *bus); void sfp_upstream_stop(struct sfp_bus *bus); void sfp_bus_put(struct sfp_bus *bus); @@ -587,6 +590,13 @@ static inline int sfp_get_module_eeprom(struct sfp_bus *bus, return -EOPNOTSUPP; } 
+static inline int sfp_get_module_eeprom_by_page(struct sfp_bus *bus, + const struct ethtool_module_eeprom *page, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + static inline void sfp_upstream_start(struct sfp_bus *bus) { } -- cgit v1.2.3 From f3c45326ee71d1d3ec11e9ddb5afc04bca9ae492 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Sat, 10 Apr 2021 10:45:48 -0700 Subject: bpf: Document PROG_TEST_RUN limitations Per net/bpf/test_run.c, particular prog types have additional restrictions around the parameters that can be provided, so document these in the header. I didn't bother documenting the limitation on duration for raw tracepoints since that's an output parameter anyway. Tested with ./tools/testing/selftests/bpf/test_doc_build.sh. Suggested-by: Yonghong Song Signed-off-by: Joe Stringer Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20210410174549.816482-1-joe@cilium.io --- include/uapi/linux/bpf.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 49371eba98ba..e1ee1be7e49b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -312,6 +312,27 @@ union bpf_iter_link_info { * *ctx_out*, *data_out* (for example, packet data), result of the * execution *retval*, and *duration* of the test run. * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_XDP** + * *ctx_in* and *ctx_out* must be NULL. 
+ * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. -- cgit v1.2.3 From 51e0158a54321a48d260e95998393934bb0de52c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 6 Apr 2021 20:21:11 -0700 Subject: skmsg: Pass psock pointer to ->psock_update_sk_prot() Using sk_psock() to retrieve psock pointer from sock requires RCU read lock, but we already get psock pointer before calling ->psock_update_sk_prot() in both cases, so we can just pass it without bothering sk_psock(). Fixes: 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()") Reported-by: syzbot+320a3bc8d80f478c37e4@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: syzbot+320a3bc8d80f478c37e4@syzkaller.appspotmail.com Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210407032111.33398-1-xiyou.wangcong@gmail.com --- include/linux/skmsg.h | 5 +++-- include/net/sock.h | 5 ++++- include/net/tcp.h | 2 +- include/net/udp.h | 2 +- net/core/sock_map.c | 2 +- net/ipv4/tcp_bpf.c | 3 +-- net/ipv4/udp_bpf.c | 3 +-- 7 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index f78e90a04a69..e2fb0a5a101e 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -99,7 +99,8 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); - int (*psock_update_sk_prot)(struct sock *sk, bool restore); + int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, + bool restore); struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; @@ -405,7 +406,7 @@ static inline void sk_psock_restore_proto(struct 
sock *sk, { sk->sk_prot->unhash = psock->saved_unhash; if (psock->psock_update_sk_prot) - psock->psock_update_sk_prot(sk, true); + psock->psock_update_sk_prot(sk, psock, true); } static inline void sk_psock_set_state(struct sk_psock *psock, diff --git a/include/net/sock.h b/include/net/sock.h index 8b4155e756c2..c4bbdcd83f4d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1114,6 +1114,7 @@ struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; +struct sk_psock; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes @@ -1185,7 +1186,9 @@ struct proto { void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); #ifdef CONFIG_BPF_SYSCALL - int (*psock_update_sk_prot)(struct sock *sk, bool restore); + int (*psock_update_sk_prot)(struct sock *sk, + struct sk_psock *psock, + bool restore); #endif /* Keeping track of sockets in use */ diff --git a/include/net/tcp.h b/include/net/tcp.h index eaea43afcc97..d05193cb0d99 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2215,7 +2215,7 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -int tcp_bpf_update_proto(struct sock *sk, bool restore); +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/net/udp.h b/include/net/udp.h index f55aaeef7e91..360df454356c 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -543,7 +543,7 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb) #ifdef CONFIG_BPF_SYSCALL struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); -int udp_bpf_update_proto(struct sock *sk, bool restore); +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); #endif #endif /* _UDP_H */ diff --git a/net/core/sock_map.c 
b/net/core/sock_map.c index 3d190d22b0d8..f473c51cbc4b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -188,7 +188,7 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; - return sk->sk_prot->psock_update_sk_prot(sk, false); + return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3d622a0d0753..4930bc8ab47e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -499,9 +499,8 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendpage == tcp_sendpage ? 0 : -ENOTSUPP; } -int tcp_bpf_update_proto(struct sock *sk, bool restore) +int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { - struct sk_psock *psock = sk_psock(sk); int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 4a7e38c5d842..954c4591a6fd 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -103,10 +103,9 @@ static int __init udp_bpf_v4_build_proto(void) } core_initcall(udp_bpf_v4_build_proto); -int udp_bpf_update_proto(struct sock *sk, bool restore) +int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6; - struct sk_psock *psock = sk_psock(sk); if (restore) { sk->sk_write_space = psock->saved_write_space; -- cgit v1.2.3 From 5c507329000e282dce91e6c98ee6ffa61a8a5e49 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Mon, 12 Apr 2021 16:24:32 -0300 Subject: libbpf: Clarify flags in ringbuf helpers In 'bpf_ringbuf_reserve()' we require the flag to '0' at the moment. 
For 'bpf_ringbuf_{discard,submit,output}' a flag of '0' might send a notification to the process if needed. Signed-off-by: Pedro Tammela Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210412192434.944343-1-pctammela@mojatatu.com --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e1ee1be7e49b..85c924bc21b1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4082,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4099,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4109,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. 
* If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e1ee1be7e49b..85c924bc21b1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4082,12 +4082,20 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. * Return * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. * Return * Valid pointer with *size* bytes of memory available; NULL, * otherwise. @@ -4099,6 +4107,10 @@ union bpf_attr { * of new data availability is sent. * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * @@ -4109,6 +4121,10 @@ union bpf_attr { * of new data availability is sent. 
* If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. * Return * Nothing. Always succeeds. * -- cgit v1.2.3 From 3e1b0c168f6c8648f217c78ed6a4135af8c9d830 Mon Sep 17 00:00:00 2001 From: wenxu Date: Sat, 3 Apr 2021 21:59:42 +0800 Subject: netfilter: flowtable: add vlan match offload support This patch adds support for vlan_id, vlan_priority and vlan_proto match for flowtable offload. Signed-off-by: wenxu Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 2 ++ net/netfilter/nf_flow_table_offload.c | 37 +++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 583b327d8fc0..d46e422c9d10 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -21,6 +21,8 @@ struct nf_flow_key { struct flow_dissector_key_control control; struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; + struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 7d0d128407be..dc1d6b4e35f8 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -78,6 +78,16 @@ static void nf_flow_rule_lwt_match(struct nf_flow_match *match, match->dissector.used_keys |= enc_keys; } +static void nf_flow_rule_vlan_match(struct flow_dissector_key_vlan *key, + struct flow_dissector_key_vlan *mask, + u16 vlan_id, __be16 proto) +{ + key->vlan_id = vlan_id; + mask->vlan_id = VLAN_VID_MASK; + key->vlan_tpid = 
proto; + mask->vlan_tpid = 0xffff; +} + static int nf_flow_rule_match(struct nf_flow_match *match, const struct flow_offload_tuple *tuple, struct dst_entry *other_dst) @@ -85,6 +95,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match, struct nf_flow_key *mask = &match->mask; struct nf_flow_key *key = &match->key; struct ip_tunnel_info *tun_info; + bool vlan_encap = false; NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta); NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control); @@ -102,6 +113,32 @@ static int nf_flow_rule_match(struct nf_flow_match *match, key->meta.ingress_ifindex = tuple->iifidx; mask->meta.ingress_ifindex = 0xffffffff; + if (tuple->encap_num > 0 && !(tuple->in_vlan_ingress & BIT(0)) && + tuple->encap[0].proto == htons(ETH_P_8021Q)) { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, vlan); + nf_flow_rule_vlan_match(&key->vlan, &mask->vlan, + tuple->encap[0].id, + tuple->encap[0].proto); + vlan_encap = true; + } + + if (tuple->encap_num > 1 && !(tuple->in_vlan_ingress & BIT(1)) && + tuple->encap[1].proto == htons(ETH_P_8021Q)) { + if (vlan_encap) { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CVLAN, + cvlan); + nf_flow_rule_vlan_match(&key->cvlan, &mask->cvlan, + tuple->encap[1].id, + tuple->encap[1].proto); + } else { + NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_VLAN, + vlan); + nf_flow_rule_vlan_match(&key->vlan, &mask->vlan, + tuple->encap[1].id, + tuple->encap[1].proto); + } + } + switch (tuple->l3proto) { case AF_INET: key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; -- cgit v1.2.3 From 098b5d3565e2391ca260964807e7324d489dd10b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 12 Apr 2021 21:55:40 +0200 Subject: netfilter: conntrack: move autoassign warning member to net_generic data Not accessed in fast path, place this in net_generic data instead.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 4 ++++ net/netfilter/nf_conntrack_helper.c | 9 ++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 86d86c860ede..c532b629db7b 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -44,6 +44,10 @@ union nf_conntrack_expect_proto { }; struct nf_conntrack_net { + /* only used when new connection is allocated: */ + bool auto_assign_helper_warned; + + /* only used from work queues, configuration plane, and so on: */ unsigned int users4; unsigned int users6; unsigned int users_bridge; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index b055187235f8..ad91964eaa92 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -43,6 +43,8 @@ MODULE_PARM_DESC(nf_conntrack_helper, static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; +extern unsigned int nf_conntrack_net_id; + /* Stupid hash, but collision free for the default registrations of the * helpers currently in the kernel. */ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) @@ -212,8 +214,10 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); static struct nf_conntrack_helper * nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + if (!net->ct.sysctl_auto_assign_helper) { - if (net->ct.auto_assign_helper_warned) + if (cnet->auto_assign_helper_warned) return NULL; if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)) return NULL; @@ -221,7 +225,7 @@ nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) "has been turned off for security reasons and CT-based " "firewall rule not found. 
Use the iptables CT target " "to attach helpers instead.\n"); - net->ct.auto_assign_helper_warned = 1; + cnet->auto_assign_helper_warned = true; return NULL; } @@ -556,7 +560,6 @@ static const struct nf_ct_ext_type helper_extend = { void nf_conntrack_helper_pernet_init(struct net *net) { - net->ct.auto_assign_helper_warned = false; net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; } -- cgit v1.2.3 From 67f28216ca04b9ba965cd652fea08f670b99a0c6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 12 Apr 2021 21:55:41 +0200 Subject: netfilter: conntrack: move autoassign_helper sysctl to net_generic data While at it, make it an u8, no need to use an integer for a boolean. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 1 + net/netfilter/nf_conntrack_helper.c | 6 ++++-- net/netfilter/nf_conntrack_standalone.c | 7 +++---- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index c532b629db7b..db8f047eb75f 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -45,6 +45,7 @@ union nf_conntrack_expect_proto { struct nf_conntrack_net { /* only used when new connection is allocated: */ + u8 sysctl_auto_assign_helper; bool auto_assign_helper_warned; /* only used from work queues, configuration plane, and so on: */ diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index ad91964eaa92..ac396cc8bfae 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -216,7 +216,7 @@ nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) { struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); - if (!net->ct.sysctl_auto_assign_helper) { + if (!cnet->sysctl_auto_assign_helper) { if (cnet->auto_assign_helper_warned) return NULL; if 
(!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)) @@ -560,7 +560,9 @@ static const struct nf_ct_ext_type helper_extend = { void nf_conntrack_helper_pernet_init(struct net *net) { - net->ct.sysctl_auto_assign_helper = nf_ct_auto_assign_helper; + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper; } int nf_conntrack_helper_init(void) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 54d36d3eb905..a7538379cfca 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -662,10 +662,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, [NF_SYSCTL_CT_HELPER] = { .procname = "nf_conntrack_helper", - .data = &init_net.ct.sysctl_auto_assign_helper, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -1042,7 +1041,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; - table[NF_SYSCTL_CT_HELPER].data = &net->ct.sysctl_auto_assign_helper; + table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif -- cgit v1.2.3 From f6f2e580d5f7152fb5ab11232edecb7fbeca3759 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 12 Apr 2021 21:55:42 +0200 Subject: netfilter: conntrack: move expect counter to net_generic data Creation of a new conntrack entry isn't a frequent operation (compared to 'ct entry already exists'). Creation of a new entry that is also an expected (related) connection even less so. Place this counter in net_generic data. 
A followup patch will also move the conntrack count -- this will make netns_ct a read-mostly structure. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 1 + net/netfilter/nf_conntrack_core.c | 6 +++++- net/netfilter/nf_conntrack_expect.c | 22 ++++++++++++++++------ 3 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index db8f047eb75f..0578a905b1df 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -45,6 +45,7 @@ union nf_conntrack_expect_proto { struct nf_conntrack_net { /* only used when new connection is allocated: */ + unsigned int expect_count; u8 sysctl_auto_assign_helper; bool auto_assign_helper_warned; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ace3e8265e0a..5fa68f94ec65 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -55,6 +55,8 @@ #include "nf_internals.h" +extern unsigned int nf_conntrack_net_id; + __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; EXPORT_SYMBOL_GPL(nf_conntrack_locks); @@ -1570,6 +1572,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; struct nf_conntrack_zone tmp; + struct nf_conntrack_net *cnet; if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { pr_debug("Can't invert tuple.\n"); @@ -1603,7 +1606,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, GFP_ATOMIC); local_bh_disable(); - if (net->ct.expect_count) { + cnet = net_generic(net, nf_conntrack_net_id); + if (cnet->expect_count) { spin_lock(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); if (exp) { diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 42557d2b6a90..efdd391b3f72 100644 --- 
a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -43,18 +43,23 @@ unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static unsigned int nf_ct_expect_hashrnd __read_mostly; +extern unsigned int nf_conntrack_net_id; + /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); + struct nf_conntrack_net *cnet; WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); - net->ct.expect_count--; + + cnet = net_generic(net, nf_conntrack_net_id); + cnet->expect_count--; hlist_del_rcu(&exp->lnode); master_help->expecting[exp->class]--; @@ -118,10 +123,11 @@ __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct nf_conntrack_expect *i; unsigned int h; - if (!net->ct.expect_count) + if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); @@ -158,10 +164,11 @@ nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; - if (!net->ct.expect_count) + if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); @@ -368,6 +375,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { + struct nf_conntrack_net *cnet; struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); @@ -389,7 +397,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) master_help->expecting[exp->class]++; 
hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - net->ct.expect_count++; + cnet = net_generic(net, nf_conntrack_net_id); + cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); } @@ -415,6 +424,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; + struct nf_conntrack_net *cnet; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; @@ -458,7 +468,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, } } - if (net->ct.expect_count >= nf_ct_expect_max) { + cnet = net_generic(net, nf_conntrack_net_id); + if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } @@ -686,7 +697,6 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { - net->ct.expect_count = 0; return exp_proc_init(net); } -- cgit v1.2.3 From c53bd0e96662c2f77109e08a9889c9e1ee86c52d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 12 Apr 2021 21:55:43 +0200 Subject: netfilter: conntrack: move ct counter to net_generic data It's only needed from slowpath (sysctl, ctnetlink, gc worker) and when a new conntrack object is allocated. Furthermore, each write dirties the otherwise read-mostly pernet data in struct net.ct, which are accessed from packet path. Move it to the net_generic data. This makes struct netns_ct read-mostly.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 ++ net/netfilter/nf_conntrack_core.c | 40 +++++++++++++++++++++++---------- net/netfilter/nf_conntrack_netlink.c | 5 +++-- net/netfilter/nf_conntrack_standalone.c | 17 +++++++++++--- 4 files changed, 47 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 0578a905b1df..06dc6db70d18 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -45,6 +45,7 @@ union nf_conntrack_expect_proto { struct nf_conntrack_net { /* only used when new connection is allocated: */ + atomic_t count; unsigned int expect_count; u8 sysctl_auto_assign_helper; bool auto_assign_helper_warned; @@ -337,6 +338,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, void nf_ct_tmpl_free(struct nf_conn *tmpl); u32 nf_ct_get_id(const struct nf_conn *ct); +u32 nf_conntrack_count(const struct net *net); static inline void nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5fa68f94ec65..e0befcf8113a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -87,6 +87,8 @@ static __read_mostly bool nf_conntrack_locks_all; static struct conntrack_gc_work conntrack_gc_work; +extern unsigned int nf_conntrack_net_id; + void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { /* 1) Acquire the lock */ @@ -1381,6 +1383,7 @@ static void gc_worker(struct work_struct *work) i = 0; hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { + struct nf_conntrack_net *cnet; struct net *net; tmp = nf_ct_tuplehash_to_ctrack(h); @@ -1401,7 +1404,8 @@ static void gc_worker(struct work_struct *work) continue; net = nf_ct_net(tmp); - if (atomic_read(&net->ct.count) < nf_conntrack_max95) + cnet = net_generic(net, 
nf_conntrack_net_id); + if (atomic_read(&cnet->count) < nf_conntrack_max95) continue; /* need to take reference to avoid possible races */ @@ -1480,17 +1484,18 @@ __nf_conntrack_alloc(struct net *net, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + unsigned int ct_count; struct nf_conn *ct; /* We don't want any race condition at early drop stage */ - atomic_inc(&net->ct.count); + ct_count = atomic_inc_return(&cnet->count); - if (nf_conntrack_max && - unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { + if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; - atomic_dec(&net->ct.count); + atomic_dec(&cnet->count); net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); return ERR_PTR(-ENOMEM); } @@ -1525,7 +1530,7 @@ __nf_conntrack_alloc(struct net *net, atomic_set(&ct->ct_general.use, 0); return ct; out: - atomic_dec(&net->ct.count); + atomic_dec(&cnet->count); return ERR_PTR(-ENOMEM); } @@ -1542,6 +1547,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc); void nf_conntrack_free(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + struct nf_conntrack_net *cnet; /* A freed object has refcnt == 0, that's * the golden rule for SLAB_TYPESAFE_BY_RCU @@ -1550,8 +1556,10 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); kmem_cache_free(nf_conntrack_cachep, ct); + cnet = net_generic(net, nf_conntrack_net_id); + smp_mb__before_atomic(); - atomic_dec(&net->ct.count); + atomic_dec(&cnet->count); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -2309,9 +2317,11 @@ __nf_ct_unconfirmed_destroy(struct net *net) void nf_ct_unconfirmed_destroy(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + might_sleep(); - if (atomic_read(&net->ct.count) > 0) { + if (atomic_read(&cnet->count) > 0) { 
__nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); synchronize_net(); @@ -2323,11 +2333,12 @@ void nf_ct_iterate_cleanup_net(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); struct iter_data d; might_sleep(); - if (atomic_read(&net->ct.count) == 0) + if (atomic_read(&cnet->count) == 0) return; d.iter = iter; @@ -2356,7 +2367,9 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) down_read(&net_rwsem); for_each_net(net) { - if (atomic_read(&net->ct.count) == 0) + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + + if (atomic_read(&cnet->count) == 0) continue; __nf_ct_unconfirmed_destroy(net); nf_queue_nf_hook_drop(net); @@ -2436,8 +2449,10 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); + nf_ct_iterate_cleanup(kill_all, net, 0, 0); - if (atomic_read(&net->ct.count) != 0) + if (atomic_read(&cnet->count) != 0) busy = 1; } if (busy) { @@ -2718,12 +2733,13 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { + struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); int ret = -ENOMEM; int cpu; BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); - atomic_set(&net->ct.count, 0); + atomic_set(&cnet->count, 0); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c67a6ec22a74..44e3cb80e2e0 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2559,9 +2559,9 @@ static int ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct net *net) { - struct 
nlmsghdr *nlh; unsigned int flags = portid ? NLM_F_MULTI : 0, event; - unsigned int nr_conntracks = atomic_read(&net->ct.count); + unsigned int nr_conntracks; + struct nlmsghdr *nlh; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS); nlh = nfnl_msg_put(skb, portid, seq, event, flags, AF_UNSPEC, @@ -2569,6 +2569,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, if (!nlh) goto nlmsg_failure; + nr_conntracks = nf_conntrack_count(net); if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks))) goto nla_put_failure; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index a7538379cfca..fb89f6e5c8bc 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -424,14 +424,16 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { struct net *net = seq_file_net(seq); - unsigned int nr_conntracks = atomic_read(&net->ct.count); const struct ip_conntrack_stat *st = v; + unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } + nr_conntracks = nf_conntrack_count(net); + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", nr_conntracks, @@ -507,6 +509,16 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) } #endif /* CONFIG_NF_CONNTRACK_PROCFS */ +u32 nf_conntrack_count(const struct net *net) +{ + const struct nf_conntrack_net *cnet; + + cnet = net_generic(net, nf_conntrack_net_id); + + return atomic_read(&cnet->count); +} +EXPORT_SYMBOL_GPL(nf_conntrack_count); + /* Sysctl support */ #ifdef CONFIG_SYSCTL @@ -614,7 +626,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, [NF_SYSCTL_CT_COUNT] = { 
.procname = "nf_conntrack_count", - .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, @@ -1037,7 +1048,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) if (!table) return -ENOMEM; - table[NF_SYSCTL_CT_COUNT].data = &net->ct.count; + table[NF_SYSCTL_CT_COUNT].data = &cnet->count; table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; -- cgit v1.2.3 From 9b1a4d0f914b1186248fc88b1cb6ee49e336a2b2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 12 Apr 2021 21:55:44 +0200 Subject: netfilter: conntrack: convert sysctls to u8 log_invalid sysctl allows values of 0 to 255 inclusive so we no longer need a range check: the min/max values can be removed. This also removes all member variables that were moved to net_generic data in previous patches. This reduces size of netns_ct struct by one cache line. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/conntrack.h | 23 ++++++++---------- net/netfilter/nf_conntrack_proto_tcp.c | 34 +++++++++++++------------- net/netfilter/nf_conntrack_standalone.c | 42 ++++++++++++++------------------- 3 files changed, 45 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index e5f664d69ead..ad0a95c2335e 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -24,9 +24,9 @@ struct nf_generic_net { struct nf_tcp_net { unsigned int timeouts[TCP_CONNTRACK_TIMEOUT_MAX]; - int tcp_loose; - int tcp_be_liberal; - int tcp_max_retrans; + u8 tcp_loose; + u8 tcp_be_liberal; + u8 tcp_max_retrans; }; enum udp_conntrack { @@ -45,7 +45,7 @@ struct nf_icmp_net { #ifdef CONFIG_NF_CT_PROTO_DCCP struct nf_dccp_net { - int dccp_loose; + u8 dccp_loose; unsigned int dccp_timeout[CT_DCCP_MAX + 1]; }; #endif @@ -93,18 +93,15 @@ struct ct_pcpu { }; struct netns_ct { - atomic_t count; - unsigned int expect_count; #ifdef CONFIG_NF_CONNTRACK_EVENTS bool ecache_dwork_pending; #endif - bool auto_assign_helper_warned; - unsigned int sysctl_log_invalid; /* Log invalid packets */ - int sysctl_events; - int sysctl_acct; - int sysctl_auto_assign_helper; - int sysctl_tstamp; - int sysctl_checksum; + u8 sysctl_log_invalid; /* Log invalid packets */ + u8 sysctl_events; + u8 sysctl_acct; + u8 sysctl_auto_assign_helper; + u8 sysctl_tstamp; + u8 sysctl_checksum; struct ct_pcpu __percpu *pcpu_lists; struct ip_conntrack_stat __percpu *stat; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index ec23330687a5..318b8f723349 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -31,20 +31,6 @@ #include #include -/* "Be conservative in what you do, - be liberal in what you accept from others." 
- If it's non-zero, we mark only out of window RST segments as INVALID. */ -static int nf_ct_tcp_be_liberal __read_mostly = 0; - -/* If it is set to zero, we disable picking up already established - connections. */ -static int nf_ct_tcp_loose __read_mostly = 1; - -/* Max number of the retransmitted packets without receiving an (acceptable) - ACK from the destination. If this number is reached, a shorter timer - will be started. */ -static int nf_ct_tcp_max_retrans __read_mostly = 3; - /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ @@ -1436,9 +1422,23 @@ void nf_conntrack_tcp_init_net(struct net *net) * ->timeouts[0] contains 'new' timeout, like udp or icmp. */ tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; - tn->tcp_loose = nf_ct_tcp_loose; - tn->tcp_be_liberal = nf_ct_tcp_be_liberal; - tn->tcp_max_retrans = nf_ct_tcp_max_retrans; + + /* If it is set to zero, we disable picking up already established + * connections. + */ + tn->tcp_loose = 1; + + /* "Be conservative in what you do, + * be liberal in what you accept from others." + * If it's non-zero, we mark only out of window RST segments as INVALID. + */ + tn->tcp_be_liberal = 0; + + /* Max number of the retransmitted packets without receiving an (acceptable) + * ACK from the destination. If this number is reached, a shorter timer + * will be started. 
+ */ + tn->tcp_max_retrans = 3; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp = diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index fb89f6e5c8bc..fe99605444be 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -522,10 +522,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_count); /* Sysctl support */ #ifdef CONFIG_SYSCTL -/* Log invalid packets of a given protocol */ -static int log_invalid_proto_min __read_mostly; -static int log_invalid_proto_max __read_mostly = 255; - /* size the user *wants to set */ static unsigned int nf_conntrack_htable_size_user __read_mostly; @@ -640,20 +636,18 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_CHECKSUM] = { .procname = "nf_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_LOG_INVALID] = { .procname = "nf_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &log_invalid_proto_min, - .extra2 = &log_invalid_proto_max, + .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", @@ -665,9 +659,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", .data = &init_net.ct.sysctl_acct, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -683,9 +677,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", .data = &init_net.ct.sysctl_events, - .maxlen = sizeof(int), + .maxlen = 
sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -694,9 +688,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { [NF_SYSCTL_CT_TIMESTAMP] = { .procname = "nf_conntrack_timestamp", .data = &init_net.ct.sysctl_tstamp, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, @@ -769,25 +763,25 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_LIBERAL] = { .procname = "nf_conntrack_tcp_be_liberal", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, [NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = { .procname = "nf_conntrack_tcp_max_retrans", - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP] = { .procname = "nf_conntrack_udp_timeout", @@ -914,9 +908,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = { .procname = "nf_conntrack_dccp_loose", - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -- cgit v1.2.3 From 78ed0a9bc6db76f8e5f5f4cb0d2b2f0d1bb21b24 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 13 Apr 2021 11:06:05 +0300 Subject: netfilter: flowtable: Add FLOW_OFFLOAD_XMIT_UNSPEC xmit type It could be xmit type 
was not set and would default to FLOW_OFFLOAD_XMIT_NEIGH and in this type the gc expect to have a route info. Fix that by adding FLOW_OFFLOAD_XMIT_UNSPEC which defaults to 0. Fixes: 8b9229d15877 ("netfilter: flowtable: dst_check() from garbage collector path") Signed-off-by: Roi Dayan Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 3 ++- net/netfilter/nf_flow_table_core.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index d46e422c9d10..51d8eb99764d 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -92,7 +92,8 @@ enum flow_offload_tuple_dir { #define FLOW_OFFLOAD_DIR_MAX IP_CT_DIR_MAX enum flow_offload_xmit_type { - FLOW_OFFLOAD_XMIT_NEIGH = 0, + FLOW_OFFLOAD_XMIT_UNSPEC = 0, + FLOW_OFFLOAD_XMIT_NEIGH, FLOW_OFFLOAD_XMIT_XFRM, FLOW_OFFLOAD_XMIT_DIRECT, }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 76573bae6664..39c02d1aeedf 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -130,6 +130,9 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; + default: + WARN_ON_ONCE(1); + break; } flow_tuple->xmit_type = route->tuple[dir].xmit_type; -- cgit v1.2.3 From 83216e3988cd196183542937c9bd58b279f946af Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 12 Apr 2021 19:47:17 +0200 Subject: of: net: pass the dst buffer to of_get_mac_address() of_get_mac_address() returns a "const void*" pointer to a MAC address. Lately, support to fetch the MAC address by an NVMEM provider was added. But this will only work with platform devices. It will not work with PCI devices (e.g. of an integrated root complex) and esp. not with DSA ports. 
There is an of_* variant of the nvmem binding which works without devices. The data returned by nvmem_cell_read() has to be freed after use. On the other hand, the pointer returned by of_get_mac_address() points to some static data without a lifetime. The trick, for now, was to allocate a device-resource-managed buffer which is then returned. This will only work if we have an actual device. Change it, so that the caller of of_get_mac_address() has to supply a buffer where the MAC address is written to. Unfortunately, this will touch all drivers which use of_get_mac_address(). Usually the code looks like: const char *addr; addr = of_get_mac_address(np); if (!IS_ERR(addr)) ether_addr_copy(ndev->dev_addr, addr); This can then be simply rewritten as: of_get_mac_address(np, ndev->dev_addr); Sometimes is_valid_ether_addr() is used to test the MAC address. of_get_mac_address() already makes sure that it only returns a valid MAC address. Thus we can just test its return code. But we have to be careful if there are still other sources for the MAC address before the of_get_mac_address() call. In this case we have to keep the is_valid_ether_addr() call. The following coccinelle patch was used to convert common cases to the new style. Afterwards, I've manually gone over the drivers and fixed the return code variable: either used a new one or, if one was already available, used that. Mansour Moufid, thanks for that coccinelle patch! @a@ identifier x; expression y, z; @@ - x = of_get_mac_address(y); + x = of_get_mac_address(y, z); <... - ether_addr_copy(z, x); ...> @@ identifier a.x; @@ - if (<+... x ...+>) {} @@ identifier a.x; @@ if (<+... x ...+>) { ... } - else {} @@ identifier a.x; expression e; @@ - if (<+... x ...+>@e) - {} - else + if (!(e)) {...} @@ expression x, y, z; @@ - x = of_get_mac_address(y, z); + of_get_mac_address(y, z); ... when != x All drivers, except drivers/net/ethernet/aeroflex/greth.c, were compile-time tested. 
Suggested-by: Andrew Lunn Signed-off-by: Michael Walle Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- arch/arm/mach-mvebu/kirkwood.c | 3 +- arch/powerpc/sysdev/tsi108_dev.c | 5 +- drivers/net/ethernet/aeroflex/greth.c | 6 +-- drivers/net/ethernet/allwinner/sun4i-emac.c | 10 ++-- drivers/net/ethernet/altera/altera_tse_main.c | 7 +-- drivers/net/ethernet/arc/emac_main.c | 8 +-- drivers/net/ethernet/atheros/ag71xx.c | 7 +-- drivers/net/ethernet/broadcom/bcm4908_enet.c | 7 +-- drivers/net/ethernet/broadcom/bcmsysport.c | 7 +-- drivers/net/ethernet/broadcom/bgmac-bcma.c | 10 ++-- drivers/net/ethernet/broadcom/bgmac-platform.c | 11 ++-- drivers/net/ethernet/cadence/macb_main.c | 11 ++-- drivers/net/ethernet/cavium/octeon/octeon_mgmt.c | 8 +-- drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 5 +- drivers/net/ethernet/davicom/dm9000.c | 10 ++-- drivers/net/ethernet/ethoc.c | 6 +-- drivers/net/ethernet/ezchip/nps_enet.c | 7 +-- drivers/net/ethernet/freescale/fec_main.c | 7 +-- drivers/net/ethernet/freescale/fec_mpc52xx.c | 7 +-- drivers/net/ethernet/freescale/fman/mac.c | 9 ++-- .../net/ethernet/freescale/fs_enet/fs_enet-main.c | 5 +- drivers/net/ethernet/freescale/gianfar.c | 8 +-- drivers/net/ethernet/freescale/ucc_geth.c | 5 +- drivers/net/ethernet/hisilicon/hisi_femac.c | 7 +-- drivers/net/ethernet/hisilicon/hix5hd2_gmac.c | 7 +-- drivers/net/ethernet/lantiq_xrx200.c | 7 +-- drivers/net/ethernet/marvell/mv643xx_eth.c | 5 +- drivers/net/ethernet/marvell/mvneta.c | 6 +-- .../net/ethernet/marvell/prestera/prestera_main.c | 11 ++-- drivers/net/ethernet/marvell/pxa168_eth.c | 9 +--- drivers/net/ethernet/marvell/sky2.c | 8 ++- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 11 ++-- drivers/net/ethernet/micrel/ks8851_common.c | 7 ++- drivers/net/ethernet/microchip/lan743x_main.c | 5 +- drivers/net/ethernet/nxp/lpc_eth.c | 4 +- drivers/net/ethernet/qualcomm/qca_spi.c | 10 ++-- drivers/net/ethernet/qualcomm/qca_uart.c | 9 +--- 
drivers/net/ethernet/renesas/ravb_main.c | 12 +++-- drivers/net/ethernet/renesas/sh_eth.c | 5 +- .../net/ethernet/samsung/sxgbe/sxgbe_platform.c | 13 ++--- drivers/net/ethernet/socionext/sni_ave.c | 10 ++-- .../net/ethernet/stmicro/stmmac/dwmac-anarion.c | 2 +- .../ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-generic.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-intel-plat.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-ipq806x.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-mediatek.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-meson8b.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c | 2 +- .../ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-visconti.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 14 ++--- .../net/ethernet/stmicro/stmmac/stmmac_platform.h | 2 +- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 19 ++++--- drivers/net/ethernet/ti/cpsw.c | 7 +-- drivers/net/ethernet/ti/cpsw_new.c | 7 +-- drivers/net/ethernet/ti/davinci_emac.c | 8 +-- drivers/net/ethernet/ti/netcp_core.c | 7 +-- drivers/net/ethernet/wiznet/w5100-spi.c | 8 ++- drivers/net/ethernet/wiznet/w5100.c | 2 +- drivers/net/ethernet/xilinx/ll_temac_main.c | 8 +-- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 15 +++--- drivers/net/ethernet/xilinx/xilinx_emaclite.c | 8 +-- 
drivers/net/wireless/ath/ath9k/init.c | 5 +- drivers/net/wireless/mediatek/mt76/eeprom.c | 9 +--- drivers/net/wireless/ralink/rt2x00/rt2x00dev.c | 6 +-- drivers/of/of_net.c | 60 ++++++++++------------ drivers/staging/octeon/ethernet.c | 10 ++-- drivers/staging/wfx/main.c | 7 ++- include/linux/of_net.h | 6 +-- include/net/dsa.h | 2 +- net/dsa/dsa2.c | 2 +- net/dsa/slave.c | 2 +- net/ethernet/eth.c | 11 ++-- 85 files changed, 218 insertions(+), 364 deletions(-) (limited to 'include') diff --git a/arch/arm/mach-mvebu/kirkwood.c b/arch/arm/mach-mvebu/kirkwood.c index ceaad6d5927e..06b1706595f4 100644 --- a/arch/arm/mach-mvebu/kirkwood.c +++ b/arch/arm/mach-mvebu/kirkwood.c @@ -84,6 +84,7 @@ static void __init kirkwood_dt_eth_fixup(void) struct device_node *pnp = of_get_parent(np); struct clk *clk; struct property *pmac; + u8 tmpmac[ETH_ALEN]; void __iomem *io; u8 *macaddr; u32 reg; @@ -93,7 +94,7 @@ static void __init kirkwood_dt_eth_fixup(void) /* skip disabled nodes or nodes with valid MAC address*/ if (!of_device_is_available(pnp) || - !IS_ERR(of_get_mac_address(np))) + !of_get_mac_address(np, tmpmac)) goto eth_fixup_skip; clk = of_clk_get(pnp, 0); diff --git a/arch/powerpc/sysdev/tsi108_dev.c b/arch/powerpc/sysdev/tsi108_dev.c index 0baec82510b9..4c4a6efd5e5f 100644 --- a/arch/powerpc/sysdev/tsi108_dev.c +++ b/arch/powerpc/sysdev/tsi108_dev.c @@ -73,7 +73,6 @@ static int __init tsi108_eth_of_init(void) struct device_node *phy, *mdio; hw_info tsi_eth_data; const unsigned int *phy_id; - const void *mac_addr; const phandle *ph; memset(r, 0, sizeof(r)); @@ -101,9 +100,7 @@ static int __init tsi108_eth_of_init(void) goto err; } - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) - ether_addr_copy(tsi_eth_data.mac_addr, mac_addr); + of_get_mac_address(np, tsi_eth_data.mac_addr); ph = of_get_property(np, "mdio-handle", NULL); mdio = of_find_node_by_phandle(*ph); diff --git a/drivers/net/ethernet/aeroflex/greth.c b/drivers/net/ethernet/aeroflex/greth.c index 
9c5891bbfe61..d77fafbc1530 100644 --- a/drivers/net/ethernet/aeroflex/greth.c +++ b/drivers/net/ethernet/aeroflex/greth.c @@ -1449,10 +1449,10 @@ static int greth_of_probe(struct platform_device *ofdev) break; } if (i == 6) { - const u8 *addr; + u8 addr[ETH_ALEN]; - addr = of_get_mac_address(ofdev->dev.of_node); - if (!IS_ERR(addr)) { + err = of_get_mac_address(ofdev->dev.of_node, addr); + if (!err) { for (i = 0; i < 6; i++) macaddr[i] = (unsigned int) addr[i]; } else { diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c b/drivers/net/ethernet/allwinner/sun4i-emac.c index 5ed80d9a6b9f..f99ae317c188 100644 --- a/drivers/net/ethernet/allwinner/sun4i-emac.c +++ b/drivers/net/ethernet/allwinner/sun4i-emac.c @@ -790,7 +790,6 @@ static int emac_probe(struct platform_device *pdev) struct emac_board_info *db; struct net_device *ndev; int ret = 0; - const char *mac_addr; ndev = alloc_etherdev(sizeof(struct emac_board_info)); if (!ndev) { @@ -853,12 +852,9 @@ static int emac_probe(struct platform_device *pdev) } /* Read MAC-address from DT */ - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) - ether_addr_copy(ndev->dev_addr, mac_addr); - - /* Check if the MAC address is valid, if not get a random one */ - if (!is_valid_ether_addr(ndev->dev_addr)) { + ret = of_get_mac_address(np, ndev->dev_addr); + if (ret) { + /* if the MAC address is invalid get a random one */ eth_hw_addr_random(ndev); dev_warn(&pdev->dev, "using random MAC address %pM\n", ndev->dev_addr); diff --git a/drivers/net/ethernet/altera/altera_tse_main.c b/drivers/net/ethernet/altera/altera_tse_main.c index 907125abef2c..1c00d719e5d7 100644 --- a/drivers/net/ethernet/altera/altera_tse_main.c +++ b/drivers/net/ethernet/altera/altera_tse_main.c @@ -1351,7 +1351,6 @@ static int altera_tse_probe(struct platform_device *pdev) struct resource *control_port; struct resource *dma_res; struct altera_tse_private *priv; - const unsigned char *macaddr; void __iomem *descmap; const struct of_device_id *of_id 
= NULL; @@ -1525,10 +1524,8 @@ static int altera_tse_probe(struct platform_device *pdev) priv->rx_dma_buf_sz = ALTERA_RXDMABUFFER_SIZE; /* get default MAC address from device tree */ - macaddr = of_get_mac_address(pdev->dev.of_node); - if (!IS_ERR(macaddr)) - ether_addr_copy(ndev->dev_addr, macaddr); - else + ret = of_get_mac_address(pdev->dev.of_node, ndev->dev_addr); + if (ret) eth_hw_addr_random(ndev); /* get phy addr and create mdio */ diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c index b56a9e2aecd9..67b8113a2b53 100644 --- a/drivers/net/ethernet/arc/emac_main.c +++ b/drivers/net/ethernet/arc/emac_main.c @@ -857,7 +857,6 @@ int arc_emac_probe(struct net_device *ndev, int interface) struct device_node *phy_node; struct phy_device *phydev = NULL; struct arc_emac_priv *priv; - const char *mac_addr; unsigned int id, clock_frequency, irq; int err; @@ -942,11 +941,8 @@ int arc_emac_probe(struct net_device *ndev, int interface) } /* Get MAC address from device tree */ - mac_addr = of_get_mac_address(dev->of_node); - - if (!IS_ERR(mac_addr)) - ether_addr_copy(ndev->dev_addr, mac_addr); - else + err = of_get_mac_address(dev->of_node, ndev->dev_addr); + if (err) eth_hw_addr_random(ndev); arc_emac_set_address_internal(ndev); diff --git a/drivers/net/ethernet/atheros/ag71xx.c b/drivers/net/ethernet/atheros/ag71xx.c index 7352f98123c7..3a23b92ebfe3 100644 --- a/drivers/net/ethernet/atheros/ag71xx.c +++ b/drivers/net/ethernet/atheros/ag71xx.c @@ -1856,7 +1856,6 @@ static int ag71xx_probe(struct platform_device *pdev) const struct ag71xx_dcfg *dcfg; struct net_device *ndev; struct resource *res; - const void *mac_addr; int tx_size, err, i; struct ag71xx *ag; @@ -1957,10 +1956,8 @@ static int ag71xx_probe(struct platform_device *pdev) ag->stop_desc->ctrl = 0; ag->stop_desc->next = (u32)ag->stop_desc_dma; - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) - memcpy(ndev->dev_addr, mac_addr, ETH_ALEN); - if (IS_ERR(mac_addr) 
|| !is_valid_ether_addr(ndev->dev_addr)) { + err = of_get_mac_address(np, ndev->dev_addr); + if (err) { netif_err(ag, probe, ndev, "invalid MAC address, using random address\n"); eth_random_addr(ndev->dev_addr); } diff --git a/drivers/net/ethernet/broadcom/bcm4908_enet.c b/drivers/net/ethernet/broadcom/bcm4908_enet.c index b7afac5c7ca7..60d908507f51 100644 --- a/drivers/net/ethernet/broadcom/bcm4908_enet.c +++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c @@ -686,7 +686,6 @@ static int bcm4908_enet_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct net_device *netdev; struct bcm4908_enet *enet; - const u8 *mac; int err; netdev = devm_alloc_etherdev(dev, sizeof(*enet)); @@ -716,10 +715,8 @@ static int bcm4908_enet_probe(struct platform_device *pdev) return err; SET_NETDEV_DEV(netdev, &pdev->dev); - mac = of_get_mac_address(dev->of_node); - if (!IS_ERR(mac)) - ether_addr_copy(netdev->dev_addr, mac); - else + err = of_get_mac_address(dev->of_node, netdev->dev_addr); + if (err) eth_hw_addr_random(netdev); netdev->netdev_ops = &bcm4908_enet_netdev_ops; netdev->min_mtu = ETH_ZLEN; diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 777bbf6d2586..d9f0f0df8f7b 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2457,7 +2457,6 @@ static int bcm_sysport_probe(struct platform_device *pdev) struct bcm_sysport_priv *priv; struct device_node *dn; struct net_device *dev; - const void *macaddr; u32 txq, rxq; int ret; @@ -2552,12 +2551,10 @@ static int bcm_sysport_probe(struct platform_device *pdev) } /* Initialize netdevice members */ - macaddr = of_get_mac_address(dn); - if (IS_ERR(macaddr)) { + ret = of_get_mac_address(dn, dev->dev_addr); + if (ret) { dev_warn(&pdev->dev, "using random Ethernet MAC\n"); eth_hw_addr_random(dev); - } else { - ether_addr_copy(dev->dev_addr, macaddr); } SET_NETDEV_DEV(dev, &pdev->dev); diff --git 
a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c index a5fd161ab5ee..85fa0ab7201c 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c @@ -115,7 +115,7 @@ static int bgmac_probe(struct bcma_device *core) struct ssb_sprom *sprom = &core->bus->sprom; struct mii_bus *mii_bus; struct bgmac *bgmac; - const u8 *mac = NULL; + const u8 *mac; int err; bgmac = bgmac_alloc(&core->dev); @@ -128,11 +128,10 @@ static int bgmac_probe(struct bcma_device *core) bcma_set_drvdata(core, bgmac); - if (bgmac->dev->of_node) - mac = of_get_mac_address(bgmac->dev->of_node); + err = of_get_mac_address(bgmac->dev->of_node, bgmac->net_dev->dev_addr); /* If no MAC address assigned via device tree, check SPROM */ - if (IS_ERR_OR_NULL(mac)) { + if (err) { switch (core->core_unit) { case 0: mac = sprom->et0mac; @@ -149,10 +148,9 @@ static int bgmac_probe(struct bcma_device *core) err = -ENOTSUPP; goto err; } + ether_addr_copy(bgmac->net_dev->dev_addr, mac); } - ether_addr_copy(bgmac->net_dev->dev_addr, mac); - /* On BCM4706 we need common core to access PHY */ if (core->id.id == BCMA_CORE_4706_MAC_GBIT && !core->bus->drv_gmac_cmn.core) { diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index f37f1c58f368..9834b77cf4b6 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -173,7 +173,7 @@ static int bgmac_probe(struct platform_device *pdev) struct device_node *np = pdev->dev.of_node; struct bgmac *bgmac; struct resource *regs; - const u8 *mac_addr; + int ret; bgmac = bgmac_alloc(&pdev->dev); if (!bgmac) @@ -192,11 +192,10 @@ static int bgmac_probe(struct platform_device *pdev) bgmac->dev = &pdev->dev; bgmac->dma_dev = &pdev->dev; - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) - ether_addr_copy(bgmac->net_dev->dev_addr, mac_addr); - else - dev_warn(&pdev->dev, "MAC address 
not present in device tree\n"); + ret = of_get_mac_address(np, bgmac->net_dev->dev_addr); + if (ret) + dev_warn(&pdev->dev, + "MAC address not present in device tree\n"); bgmac->irq = platform_get_irq(pdev, 0); if (bgmac->irq < 0) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index ffd56a23f8b0..d6bde1748a22 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4649,7 +4649,6 @@ static int macb_probe(struct platform_device *pdev) struct net_device *dev; struct resource *regs; void __iomem *mem; - const char *mac; struct macb *bp; int err, val; @@ -4764,15 +4763,11 @@ static int macb_probe(struct platform_device *pdev) if (bp->caps & MACB_CAPS_NEEDS_RSTONUBR) bp->rx_intr_mask |= MACB_BIT(RXUBR); - mac = of_get_mac_address(np); - if (PTR_ERR(mac) == -EPROBE_DEFER) { - err = -EPROBE_DEFER; + err = of_get_mac_address(np, bp->dev->dev_addr); + if (err == -EPROBE_DEFER) goto err_out_free_netdev; - } else if (!IS_ERR_OR_NULL(mac)) { - ether_addr_copy(bp->dev->dev_addr, mac); - } else { + else if (err) macb_get_hwaddr(bp); - } err = of_get_phy_mode(np, &interface); if (err) diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c index ecffebd513be..48ff6fb0eed9 100644 --- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c +++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c @@ -1385,7 +1385,6 @@ static int octeon_mgmt_probe(struct platform_device *pdev) struct net_device *netdev; struct octeon_mgmt *p; const __be32 *data; - const u8 *mac; struct resource *res_mix; struct resource *res_agl; struct resource *res_agl_prt_ctl; @@ -1502,11 +1501,8 @@ static int octeon_mgmt_probe(struct platform_device *pdev) netdev->min_mtu = 64 - OCTEON_MGMT_RX_HEADROOM; netdev->max_mtu = 16383 - OCTEON_MGMT_RX_HEADROOM - VLAN_HLEN; - mac = of_get_mac_address(pdev->dev.of_node); - - if (!IS_ERR(mac)) - ether_addr_copy(netdev->dev_addr, 
mac); - else + result = of_get_mac_address(pdev->dev.of_node, netdev->dev_addr); + if (result) eth_hw_addr_random(netdev); p->phy_np = of_parse_phandle(pdev->dev.of_node, "phy-handle", 0); diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index 8ff28ed04b7f..0c783aadf393 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -1474,7 +1474,6 @@ static int bgx_init_of_phy(struct bgx *bgx) device_for_each_child_node(&bgx->pdev->dev, fwn) { struct phy_device *pd; struct device_node *phy_np; - const char *mac; /* Should always be an OF node. But if it is not, we * cannot handle it, so exit the loop. @@ -1483,9 +1482,7 @@ static int bgx_init_of_phy(struct bgx *bgx) if (!node) break; - mac = of_get_mac_address(node); - if (!IS_ERR(mac)) - ether_addr_copy(bgx->lmac[lmac].mac, mac); + of_get_mac_address(node, bgx->lmac[lmac].mac); SET_NETDEV_DEV(&bgx->lmac[lmac].netdev, &bgx->pdev->dev); bgx->lmac[lmac].lmacid = lmac; diff --git a/drivers/net/ethernet/davicom/dm9000.c b/drivers/net/ethernet/davicom/dm9000.c index 252adfa5d837..2374c51bf2b2 100644 --- a/drivers/net/ethernet/davicom/dm9000.c +++ b/drivers/net/ethernet/davicom/dm9000.c @@ -1385,7 +1385,7 @@ static struct dm9000_plat_data *dm9000_parse_dt(struct device *dev) { struct dm9000_plat_data *pdata; struct device_node *np = dev->of_node; - const void *mac_addr; + int ret; if (!IS_ENABLED(CONFIG_OF) || !np) return ERR_PTR(-ENXIO); @@ -1399,11 +1399,9 @@ static struct dm9000_plat_data *dm9000_parse_dt(struct device *dev) if (of_find_property(np, "davicom,no-eeprom", NULL)) pdata->flags |= DM9000_PLATF_NO_EEPROM; - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) - ether_addr_copy(pdata->dev_addr, mac_addr); - else if (PTR_ERR(mac_addr) == -EPROBE_DEFER) - return ERR_CAST(mac_addr); + ret = of_get_mac_address(np, pdata->dev_addr); + if (ret == -EPROBE_DEFER) + return ERR_PTR(ret); 
return pdata; } diff --git a/drivers/net/ethernet/ethoc.c b/drivers/net/ethernet/ethoc.c index 3d9b0b161e24..e1b43b07755b 100644 --- a/drivers/net/ethernet/ethoc.c +++ b/drivers/net/ethernet/ethoc.c @@ -1151,11 +1151,7 @@ static int ethoc_probe(struct platform_device *pdev) ether_addr_copy(netdev->dev_addr, pdata->hwaddr); priv->phy_id = pdata->phy_id; } else { - const void *mac; - - mac = of_get_mac_address(pdev->dev.of_node); - if (!IS_ERR(mac)) - ether_addr_copy(netdev->dev_addr, mac); + of_get_mac_address(pdev->dev.of_node, netdev->dev_addr); priv->phy_id = -1; } diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index 815fb62c4b02..e3954d8835e7 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -575,7 +575,6 @@ static s32 nps_enet_probe(struct platform_device *pdev) struct net_device *ndev; struct nps_enet_priv *priv; s32 err = 0; - const char *mac_addr; if (!dev->of_node) return -ENODEV; @@ -602,10 +601,8 @@ static s32 nps_enet_probe(struct platform_device *pdev) dev_dbg(dev, "Registers base address is 0x%p\n", priv->regs_base); /* set kernel MAC address to dev */ - mac_addr = of_get_mac_address(dev->of_node); - if (!IS_ERR(mac_addr)) - ether_addr_copy(ndev->dev_addr, mac_addr); - else + err = of_get_mac_address(dev->of_node, ndev->dev_addr); + if (err) eth_hw_addr_random(ndev); /* Get IRQ number */ diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 70aea9c274fe..aecc111fbe73 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1665,6 +1665,7 @@ static void fec_get_mac(struct net_device *ndev) { struct fec_enet_private *fep = netdev_priv(ndev); unsigned char *iap, tmpaddr[ETH_ALEN]; + int ret; /* * try to get mac address in following order: @@ -1680,9 +1681,9 @@ static void fec_get_mac(struct net_device *ndev) if (!is_valid_ether_addr(iap)) { struct device_node 
*np = fep->pdev->dev.of_node; if (np) { - const char *mac = of_get_mac_address(np); - if (!IS_ERR(mac)) - iap = (unsigned char *) mac; + ret = of_get_mac_address(np, tmpaddr); + if (!ret) + iap = tmpaddr; } } diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index b3bad429e03b..02c47658a215 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -813,7 +813,6 @@ static int mpc52xx_fec_probe(struct platform_device *op) const u32 *prop; int prop_size; struct device_node *np = op->dev.of_node; - const char *mac_addr; phys_addr_t rx_fifo; phys_addr_t tx_fifo; @@ -891,10 +890,8 @@ static int mpc52xx_fec_probe(struct platform_device *op) * * First try to read MAC address from DT */ - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) { - ether_addr_copy(ndev->dev_addr, mac_addr); - } else { + rv = of_get_mac_address(np, ndev->dev_addr); + if (rv) { struct mpc52xx_fec __iomem *fec = priv->fec; /* diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c index 901749a7a318..46ecb42f2ef8 100644 --- a/drivers/net/ethernet/freescale/fman/mac.c +++ b/drivers/net/ethernet/freescale/fman/mac.c @@ -605,7 +605,6 @@ static int mac_probe(struct platform_device *_of_dev) struct platform_device *of_dev; struct resource res; struct mac_priv_s *priv; - const u8 *mac_addr; u32 val; u8 fman_id; phy_interface_t phy_if; @@ -723,11 +722,9 @@ static int mac_probe(struct platform_device *_of_dev) priv->cell_index = (u8)val; /* Get the MAC address */ - mac_addr = of_get_mac_address(mac_node); - if (IS_ERR(mac_addr)) + err = of_get_mac_address(mac_node, mac_dev->addr); + if (err) dev_warn(dev, "of_get_mac_address(%pOF) failed\n", mac_node); - else - ether_addr_copy(mac_dev->addr, mac_addr); /* Get the port handles */ nph = of_count_phandle_with_args(mac_node, "fsl,fman-ports", NULL); @@ -853,7 +850,7 @@ static int mac_probe(struct 
platform_device *_of_dev) if (err < 0) dev_err(dev, "fman_set_mac_active_pause() = %d\n", err); - if (!IS_ERR(mac_addr)) + if (!is_zero_ether_addr(mac_dev->addr)) dev_info(dev, "FMan MAC address: %pM\n", mac_dev->addr); priv->eth_dev = dpaa_eth_add_device(fman_id, mac_dev); diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c index 78e008b81374..6ee325ad35c5 100644 --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c @@ -918,7 +918,6 @@ static int fs_enet_probe(struct platform_device *ofdev) const u32 *data; struct clk *clk; int err; - const u8 *mac_addr; const char *phy_connection_type; int privsize, len, ret = -ENODEV; @@ -1006,9 +1005,7 @@ static int fs_enet_probe(struct platform_device *ofdev) spin_lock_init(&fep->lock); spin_lock_init(&fep->tx_lock); - mac_addr = of_get_mac_address(ofdev->dev.of_node); - if (!IS_ERR(mac_addr)) - ether_addr_copy(ndev->dev_addr, mac_addr); + of_get_mac_address(ofdev->dev.of_node, ndev->dev_addr); ret = fep->ops->allocate_bd(ndev); if (ret) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 3ec4d9fddd52..339f9567ef9d 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -640,7 +640,6 @@ static phy_interface_t gfar_get_interface(struct net_device *dev) static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) { const char *model; - const void *mac_addr; int err = 0, i; phy_interface_t interface; struct net_device *dev = NULL; @@ -782,11 +781,8 @@ static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev) if (stash_len || stash_idx) priv->device_flags |= FSL_GIANFAR_DEV_HAS_BUF_STASHING; - mac_addr = of_get_mac_address(np); - - if (!IS_ERR(mac_addr)) { - ether_addr_copy(dev->dev_addr, mac_addr); - } else { + err = of_get_mac_address(np, dev->dev_addr); 
+ if (err) { eth_hw_addr_random(dev); dev_info(&ofdev->dev, "Using random MAC address: %pM\n", dev->dev_addr); } diff --git a/drivers/net/ethernet/freescale/ucc_geth.c b/drivers/net/ethernet/freescale/ucc_geth.c index ef4e2febeb5b..e0936510fa34 100644 --- a/drivers/net/ethernet/freescale/ucc_geth.c +++ b/drivers/net/ethernet/freescale/ucc_geth.c @@ -3562,7 +3562,6 @@ static int ucc_geth_probe(struct platform_device* ofdev) struct resource res; int err, ucc_num, max_speed = 0; const unsigned int *prop; - const void *mac_addr; phy_interface_t phy_interface; static const int enet_to_speed[] = { SPEED_10, SPEED_10, SPEED_10, @@ -3733,9 +3732,7 @@ static int ucc_geth_probe(struct platform_device* ofdev) goto err_free_netdev; } - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) - ether_addr_copy(dev->dev_addr, mac_addr); + of_get_mac_address(np, dev->dev_addr); ugeth->ug_info = ug_info; ugeth->dev = device; diff --git a/drivers/net/ethernet/hisilicon/hisi_femac.c b/drivers/net/ethernet/hisilicon/hisi_femac.c index 57c3bc4f7089..3c4db4a6b431 100644 --- a/drivers/net/ethernet/hisilicon/hisi_femac.c +++ b/drivers/net/ethernet/hisilicon/hisi_femac.c @@ -772,7 +772,6 @@ static int hisi_femac_drv_probe(struct platform_device *pdev) struct net_device *ndev; struct hisi_femac_priv *priv; struct phy_device *phy; - const char *mac_addr; int ret; ndev = alloc_etherdev(sizeof(*priv)); @@ -842,10 +841,8 @@ static int hisi_femac_drv_probe(struct platform_device *pdev) (unsigned long)phy->phy_id, phy_modes(phy->interface)); - mac_addr = of_get_mac_address(node); - if (!IS_ERR(mac_addr)) - ether_addr_copy(ndev->dev_addr, mac_addr); - if (!is_valid_ether_addr(ndev->dev_addr)) { + ret = of_get_mac_address(node, ndev->dev_addr); + if (ret) { eth_hw_addr_random(ndev); dev_warn(dev, "using random MAC address %pM\n", ndev->dev_addr); diff --git a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c index 8b2bf85039f1..c1aae0fca5e9 100644 --- 
a/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c +++ b/drivers/net/ethernet/hisilicon/hix5hd2_gmac.c @@ -1098,7 +1098,6 @@ static int hix5hd2_dev_probe(struct platform_device *pdev) struct net_device *ndev; struct hix5hd2_priv *priv; struct mii_bus *bus; - const char *mac_addr; int ret; ndev = alloc_etherdev(sizeof(struct hix5hd2_priv)); @@ -1220,10 +1219,8 @@ static int hix5hd2_dev_probe(struct platform_device *pdev) goto out_phy_node; } - mac_addr = of_get_mac_address(node); - if (!IS_ERR(mac_addr)) - ether_addr_copy(ndev->dev_addr, mac_addr); - if (!is_valid_ether_addr(ndev->dev_addr)) { + ret = of_get_mac_address(node, ndev->dev_addr); + if (ret) { eth_hw_addr_random(ndev); netdev_warn(ndev, "using random MAC address %pM\n", ndev->dev_addr); diff --git a/drivers/net/ethernet/lantiq_xrx200.c b/drivers/net/ethernet/lantiq_xrx200.c index 0f8ef8f1232c..41c2ad210bc9 100644 --- a/drivers/net/ethernet/lantiq_xrx200.c +++ b/drivers/net/ethernet/lantiq_xrx200.c @@ -435,7 +435,6 @@ static int xrx200_probe(struct platform_device *pdev) struct resource *res; struct xrx200_priv *priv; struct net_device *net_dev; - const u8 *mac; int err; /* alloc the network device */ @@ -477,10 +476,8 @@ static int xrx200_probe(struct platform_device *pdev) return PTR_ERR(priv->clk); } - mac = of_get_mac_address(np); - if (!IS_ERR(mac)) - ether_addr_copy(net_dev->dev_addr, mac); - else + err = of_get_mac_address(np, net_dev->dev_addr); + if (err) eth_hw_addr_random(net_dev); /* bring up the dma engine and IP core */ diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index ca1681aa951a..d207bfcaf31d 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2702,7 +2702,6 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, struct platform_device *ppdev; struct mv643xx_eth_platform_data ppd; struct resource res; - const char *mac_addr; int ret; int dev_num = 0; @@ -2733,9 
+2732,7 @@ static int mv643xx_eth_shared_of_add_port(struct platform_device *pdev, return -EINVAL; } - mac_addr = of_get_mac_address(pnp); - if (!IS_ERR(mac_addr)) - ether_addr_copy(ppd.mac_addr, mac_addr); + of_get_mac_address(pnp, ppd.mac_addr); mv643xx_eth_property(pnp, "tx-queue-size", ppd.tx_queue_size); mv643xx_eth_property(pnp, "tx-sram-addr", ppd.tx_sram_addr); diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index f20dfd1d7a6b..7d5cd9bc6c99 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -5141,7 +5141,6 @@ static int mvneta_probe(struct platform_device *pdev) struct net_device *dev; struct phylink *phylink; struct phy *comphy; - const char *dt_mac_addr; char hw_mac_addr[ETH_ALEN]; phy_interface_t phy_mode; const char *mac_from; @@ -5237,10 +5236,9 @@ static int mvneta_probe(struct platform_device *pdev) goto err_free_ports; } - dt_mac_addr = of_get_mac_address(dn); - if (!IS_ERR(dt_mac_addr)) { + err = of_get_mac_address(dn, dev->dev_addr); + if (!err) { mac_from = "device tree"; - ether_addr_copy(dev->dev_addr, dt_mac_addr); } else { mvneta_get_mac_addr(pp, hw_mac_addr); if (is_valid_ether_addr(hw_mac_addr)) { diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 25dd903a3e92..f08c420a5803 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -456,20 +456,17 @@ static int prestera_switch_set_base_mac_addr(struct prestera_switch *sw) { struct device_node *base_mac_np; struct device_node *np; - const char *base_mac; + int ret; np = of_find_compatible_node(NULL, NULL, "marvell,prestera"); base_mac_np = of_parse_phandle(np, "base-mac-provider", 0); - base_mac = of_get_mac_address(base_mac_np); - of_node_put(base_mac_np); - if (!IS_ERR(base_mac)) - ether_addr_copy(sw->base_mac, base_mac); - - if 
(!is_valid_ether_addr(sw->base_mac)) { + ret = of_get_mac_address(base_mac_np, sw->base_mac); + if (ret) { eth_random_addr(sw->base_mac); dev_info(prestera_dev(sw), "using random base mac address\n"); } + of_node_put(base_mac_np); return prestera_hw_switch_mac_set(sw, sw->base_mac); } diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index 3712e1786091..e967867828d8 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -1392,7 +1392,6 @@ static int pxa168_eth_probe(struct platform_device *pdev) struct resource *res; struct clk *clk; struct device_node *np; - const unsigned char *mac_addr = NULL; int err; printk(KERN_NOTICE "PXA168 10/100 Ethernet Driver\n"); @@ -1435,12 +1434,8 @@ static int pxa168_eth_probe(struct platform_device *pdev) INIT_WORK(&pep->tx_timeout_task, pxa168_eth_tx_timeout_task); - if (pdev->dev.of_node) - mac_addr = of_get_mac_address(pdev->dev.of_node); - - if (!IS_ERR_OR_NULL(mac_addr)) { - ether_addr_copy(dev->dev_addr, mac_addr); - } else { + err = of_get_mac_address(pdev->dev.of_node, dev->dev_addr); + if (err) { /* try reading the mac address, if set by the bootloader */ pxa168_eth_get_mac_address(dev, dev->dev_addr); if (!is_valid_ether_addr(dev->dev_addr)) { diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 68c154d715d6..222c32367b2c 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -4728,7 +4728,7 @@ static struct net_device *sky2_init_netdev(struct sky2_hw *hw, unsigned port, { struct sky2_port *sky2; struct net_device *dev = alloc_etherdev(sizeof(*sky2)); - const void *iap; + int ret; if (!dev) return NULL; @@ -4798,10 +4798,8 @@ static struct net_device *sky2_init_netdev(struct sky2_hw *hw, unsigned port, * 1) from device tree data * 2) from internal registers set by bootloader */ - iap = of_get_mac_address(hw->pdev->dev.of_node); - if 
(!IS_ERR(iap)) - ether_addr_copy(dev->dev_addr, iap); - else + ret = of_get_mac_address(hw->pdev->dev.of_node, dev->dev_addr); + if (ret) memcpy_fromio(dev->dev_addr, hw->regs + B2_MAC_1 + port * 8, ETH_ALEN); diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 810def064f11..6b00c12c6c43 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2484,14 +2484,11 @@ static int __init mtk_init(struct net_device *dev) { struct mtk_mac *mac = netdev_priv(dev); struct mtk_eth *eth = mac->hw; - const char *mac_addr; - - mac_addr = of_get_mac_address(mac->of_node); - if (!IS_ERR(mac_addr)) - ether_addr_copy(dev->dev_addr, mac_addr); + int ret; - /* If the mac address is invalid, use random mac address */ - if (!is_valid_ether_addr(dev->dev_addr)) { + ret = of_get_mac_address(mac->of_node, dev->dev_addr); + if (ret) { + /* If the mac address is invalid, use random mac address */ eth_hw_addr_random(dev); dev_err(eth->dev, "generated random MAC address %pM\n", dev->dev_addr); diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c index 2feed6ce19d3..13eef6e9bd2d 100644 --- a/drivers/net/ethernet/micrel/ks8851_common.c +++ b/drivers/net/ethernet/micrel/ks8851_common.c @@ -193,11 +193,10 @@ static void ks8851_read_mac_addr(struct net_device *dev) static void ks8851_init_mac(struct ks8851_net *ks, struct device_node *np) { struct net_device *dev = ks->netdev; - const u8 *mac_addr; + int ret; - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) { - ether_addr_copy(dev->dev_addr, mac_addr); + ret = of_get_mac_address(np, dev->dev_addr); + if (!ret) { ks8851_write_mac_addr(dev); return; } diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 11a1dc4c436d..dae10328c6cf 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ 
b/drivers/net/ethernet/microchip/lan743x_main.c @@ -2771,7 +2771,6 @@ static int lan743x_pcidev_probe(struct pci_dev *pdev, { struct lan743x_adapter *adapter = NULL; struct net_device *netdev = NULL; - const void *mac_addr; int ret = -ENODEV; netdev = devm_alloc_etherdev(&pdev->dev, @@ -2788,9 +2787,7 @@ static int lan743x_pcidev_probe(struct pci_dev *pdev, NETIF_MSG_IFDOWN | NETIF_MSG_TX_QUEUED; netdev->max_mtu = LAN743X_MAX_FRAME_SIZE; - mac_addr = of_get_mac_address(pdev->dev.of_node); - if (!IS_ERR(mac_addr)) - ether_addr_copy(adapter->mac_address, mac_addr); + of_get_mac_address(pdev->dev.of_node, adapter->mac_address); ret = lan743x_pci_init(adapter, pdev); if (ret) diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index e72fd33a214c..64c6842bd452 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -1350,9 +1350,7 @@ static int lpc_eth_drv_probe(struct platform_device *pdev) __lpc_get_mac(pldat, ndev->dev_addr); if (!is_valid_ether_addr(ndev->dev_addr)) { - const char *macaddr = of_get_mac_address(np); - if (!IS_ERR(macaddr)) - ether_addr_copy(ndev->dev_addr, macaddr); + of_get_mac_address(np, ndev->dev_addr); } if (!is_valid_ether_addr(ndev->dev_addr)) eth_hw_addr_random(ndev); diff --git a/drivers/net/ethernet/qualcomm/qca_spi.c b/drivers/net/ethernet/qualcomm/qca_spi.c index 5a3b65a6eb4f..ab9b02574a15 100644 --- a/drivers/net/ethernet/qualcomm/qca_spi.c +++ b/drivers/net/ethernet/qualcomm/qca_spi.c @@ -885,7 +885,7 @@ qca_spi_probe(struct spi_device *spi) struct net_device *qcaspi_devs = NULL; u8 legacy_mode = 0; u16 signature; - const char *mac; + int ret; if (!spi->dev.of_node) { dev_err(&spi->dev, "Missing device tree\n"); @@ -962,12 +962,8 @@ qca_spi_probe(struct spi_device *spi) spi_set_drvdata(spi, qcaspi_devs); - mac = of_get_mac_address(spi->dev.of_node); - - if (!IS_ERR(mac)) - ether_addr_copy(qca->net_dev->dev_addr, mac); - - if 
(!is_valid_ether_addr(qca->net_dev->dev_addr)) { + ret = of_get_mac_address(spi->dev.of_node, qca->net_dev->dev_addr); + if (ret) { eth_hw_addr_random(qca->net_dev); dev_info(&spi->dev, "Using random MAC address: %pM\n", qca->net_dev->dev_addr); diff --git a/drivers/net/ethernet/qualcomm/qca_uart.c b/drivers/net/ethernet/qualcomm/qca_uart.c index 362b4f5c162c..bcdeca7b3366 100644 --- a/drivers/net/ethernet/qualcomm/qca_uart.c +++ b/drivers/net/ethernet/qualcomm/qca_uart.c @@ -323,7 +323,6 @@ static int qca_uart_probe(struct serdev_device *serdev) { struct net_device *qcauart_dev = alloc_etherdev(sizeof(struct qcauart)); struct qcauart *qca; - const char *mac; u32 speed = 115200; int ret; @@ -348,12 +347,8 @@ static int qca_uart_probe(struct serdev_device *serdev) of_property_read_u32(serdev->dev.of_node, "current-speed", &speed); - mac = of_get_mac_address(serdev->dev.of_node); - - if (!IS_ERR(mac)) - ether_addr_copy(qca->net_dev->dev_addr, mac); - - if (!is_valid_ether_addr(qca->net_dev->dev_addr)) { + ret = of_get_mac_address(serdev->dev.of_node, qca->net_dev->dev_addr); + if (ret) { eth_hw_addr_random(qca->net_dev); dev_info(&serdev->dev, "Using random MAC address: %pM\n", qca->net_dev->dev_addr); diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index 1409ae986aa2..8c84c40ab9a0 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -109,11 +109,13 @@ static void ravb_set_buffer_align(struct sk_buff *skb) * Ethernet AVB device doesn't have ROM for MAC address. * This function gets the MAC address that was used by a bootloader. 
*/ -static void ravb_read_mac_address(struct net_device *ndev, const u8 *mac) +static void ravb_read_mac_address(struct device_node *np, + struct net_device *ndev) { - if (!IS_ERR(mac)) { - ether_addr_copy(ndev->dev_addr, mac); - } else { + int ret; + + ret = of_get_mac_address(np, ndev->dev_addr); + if (ret) { u32 mahr = ravb_read(ndev, MAHR); u32 malr = ravb_read(ndev, MALR); @@ -2207,7 +2209,7 @@ static int ravb_probe(struct platform_device *pdev) priv->msg_enable = RAVB_DEF_MSG_ENABLE; /* Read and set MAC address */ - ravb_read_mac_address(ndev, of_get_mac_address(np)); + ravb_read_mac_address(np, ndev); if (!is_valid_ether_addr(ndev->dev_addr)) { dev_warn(&pdev->dev, "no valid MAC address supplied, using a random one\n"); diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index ebedb1a11132..c5b154868c1f 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3170,7 +3170,6 @@ static struct sh_eth_plat_data *sh_eth_parse_dt(struct device *dev) struct device_node *np = dev->of_node; struct sh_eth_plat_data *pdata; phy_interface_t interface; - const char *mac_addr; int ret; pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); @@ -3182,9 +3181,7 @@ static struct sh_eth_plat_data *sh_eth_parse_dt(struct device *dev) return NULL; pdata->phy_interface = interface; - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) - ether_addr_copy(pdata->mac_addr, mac_addr); + of_get_mac_address(np, pdata->mac_addr); pdata->no_ether_link = of_property_read_bool(np, "renesas,no-ether-link"); diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c index 33f79402850d..4639ed9438a3 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c @@ -25,8 +25,7 @@ #ifdef CONFIG_OF static int sxgbe_probe_config_dt(struct platform_device *pdev, - struct sxgbe_plat_data *plat, 
- const char **mac) + struct sxgbe_plat_data *plat) { struct device_node *np = pdev->dev.of_node; struct sxgbe_dma_cfg *dma_cfg; @@ -35,7 +34,6 @@ static int sxgbe_probe_config_dt(struct platform_device *pdev, if (!np) return -ENODEV; - *mac = of_get_mac_address(np); err = of_get_phy_mode(np, &plat->interface); if (err && err != -ENODEV) return err; @@ -63,8 +61,7 @@ static int sxgbe_probe_config_dt(struct platform_device *pdev, } #else static int sxgbe_probe_config_dt(struct platform_device *pdev, - struct sxgbe_plat_data *plat, - const char **mac) + struct sxgbe_plat_data *plat) { return -ENOSYS; } @@ -85,7 +82,6 @@ static int sxgbe_platform_probe(struct platform_device *pdev) void __iomem *addr; struct sxgbe_priv_data *priv = NULL; struct sxgbe_plat_data *plat_dat = NULL; - const char *mac = NULL; struct net_device *ndev = platform_get_drvdata(pdev); struct device_node *node = dev->of_node; @@ -101,7 +97,7 @@ static int sxgbe_platform_probe(struct platform_device *pdev) if (!plat_dat) return -ENOMEM; - ret = sxgbe_probe_config_dt(pdev, plat_dat, &mac); + ret = sxgbe_probe_config_dt(pdev, plat_dat); if (ret) { pr_err("%s: main dt probe failed\n", __func__); return ret; @@ -122,8 +118,7 @@ static int sxgbe_platform_probe(struct platform_device *pdev) } /* Get MAC address if available (DT) */ - if (!IS_ERR_OR_NULL(mac)) - ether_addr_copy(priv->dev->dev_addr, mac); + of_get_mac_address(node, priv->dev->dev_addr); /* Get the TX/RX IRQ numbers */ for (i = 0, chan = 1; i < SXGBE_TX_QUEUES; i++) { diff --git a/drivers/net/ethernet/socionext/sni_ave.c b/drivers/net/ethernet/socionext/sni_ave.c index 501b9c7aba56..fcbb4bb31408 100644 --- a/drivers/net/ethernet/socionext/sni_ave.c +++ b/drivers/net/ethernet/socionext/sni_ave.c @@ -1559,7 +1559,6 @@ static int ave_probe(struct platform_device *pdev) struct ave_private *priv; struct net_device *ndev; struct device_node *np; - const void *mac_addr; void __iomem *base; const char *name; int i, irq, ret; @@ -1600,12 +1599,9 @@ 
static int ave_probe(struct platform_device *pdev) ndev->max_mtu = AVE_MAX_ETHFRAME - (ETH_HLEN + ETH_FCS_LEN); - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) - ether_addr_copy(ndev->dev_addr, mac_addr); - - /* if the mac address is invalid, use random mac address */ - if (!is_valid_ether_addr(ndev->dev_addr)) { + ret = of_get_mac_address(np, ndev->dev_addr); + if (ret) { + /* if the mac address is invalid, use random mac address */ eth_hw_addr_random(ndev); dev_warn(dev, "Using random MAC address: %pM\n", ndev->dev_addr); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c index 08c76636c164..dfbaea06d108 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c @@ -115,7 +115,7 @@ static int anarion_dwmac_probe(struct platform_device *pdev) if (IS_ERR(gmac)) return PTR_ERR(gmac); - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index 27254b27d7ed..bc91fd867dcd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -438,7 +438,7 @@ static int dwc_eth_dwmac_probe(struct platform_device *pdev) if (IS_ERR(stmmac_res.addr)) return PTR_ERR(stmmac_res.addr); - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c index fad503820e04..fbfda55b4c52 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-generic.c @@ -27,7 +27,7 
@@ static int dwmac_generic_probe(struct platform_device *pdev) return ret; if (pdev->dev.of_node) { - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) { dev_err(&pdev->dev, "dt configuration failed\n"); return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index c1a361305a5a..84651207a1de 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -231,7 +231,7 @@ static int imx_dwmac_probe(struct platform_device *pdev) if (!dwmac) return -ENOMEM; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c index 6c19fcc76c6f..06d287f104be 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c @@ -85,7 +85,7 @@ static int intel_eth_plat_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) { dev_err(&pdev->dev, "dt configuration failed\n"); return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c index 749585fe6fc9..28dd0ed85a82 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c @@ -255,7 +255,7 @@ static int ipq806x_gmac_probe(struct platform_device *pdev) if (val) return val; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return 
PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c index 3d3f43d91b98..9d77c647badd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c @@ -37,7 +37,7 @@ static int lpc18xx_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index 9e4b83832938..58c0feaa8131 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -407,7 +407,7 @@ static int mediatek_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c index bbc16b5a410a..16fb66a0ca72 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson.c @@ -52,7 +52,7 @@ static int meson6_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c index 848e5c37746b..c7a6588d9398 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-meson8b.c @@ -398,7 +398,7 @@ static int meson8b_dwmac_probe(struct platform_device *pdev) 
if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c index 8551ea878ba5..adfeb8d3293d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-oxnas.c @@ -118,7 +118,7 @@ static int oxnas_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index a674b7d6b49a..84382fc5cc4d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -461,7 +461,7 @@ static int qcom_ethqos_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) { dev_err(&pdev->dev, "dt configuration failed\n"); return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c index 6ef30252bfe0..8d28a536e1bb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c @@ -1396,7 +1396,7 @@ static int rk_gmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 
70d41783329d..85208128f135 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -398,7 +398,7 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index e1b63df6f96f..710d7435733e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -325,7 +325,7 @@ static int sti_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c index 5d4df4c5254e..2b38a499a404 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c @@ -371,7 +371,7 @@ static int stm32_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 19e7ec30af4c..4422baeed3d8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1221,7 +1221,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) if (ret) return -EINVAL; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return 
PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c index 0e1ca2cba3c7..527077c98ebc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c @@ -108,7 +108,7 @@ static int sun7i_gmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index d23be45a64e5..d046e33b8a29 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -208,7 +208,7 @@ static int visconti_eth_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - plat_dat = stmmac_probe_config_dt(pdev, &stmmac_res.mac); + plat_dat = stmmac_probe_config_dt(pdev, stmmac_res.mac); if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index c49debb62b05..8b4ff9c189a1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -26,7 +26,7 @@ struct stmmac_resources { void __iomem *addr; - const char *mac; + u8 mac[ETH_ALEN]; int wol_irq; int lpi_irq; int irq; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 77285646c5fc..328aeb2cd276 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -6016,7 +6016,7 @@ int stmmac_dvr_probe(struct device *device, for (i = 0; i < MTL_MAX_TX_QUEUES; i++) priv->tx_irq[i] = res->tx_irq[i]; - if (!IS_ERR_OR_NULL(res->mac)) + if (!is_zero_ether_addr(res->mac)) memcpy(priv->dev->dev_addr, 
res->mac, ETH_ALEN); dev_set_drvdata(device, priv->dev); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 5a1e018884e6..1e17a23d9118 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -394,7 +394,7 @@ static int stmmac_of_get_mac_mode(struct device_node *np) * set some private fields that will be used by the main at runtime. */ struct plat_stmmacenet_data * -stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) +stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) { struct device_node *np = pdev->dev.of_node; struct plat_stmmacenet_data *plat; @@ -406,12 +406,12 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) if (!plat) return ERR_PTR(-ENOMEM); - *mac = of_get_mac_address(np); - if (IS_ERR(*mac)) { - if (PTR_ERR(*mac) == -EPROBE_DEFER) - return ERR_CAST(*mac); + rc = of_get_mac_address(np, mac); + if (rc) { + if (rc == -EPROBE_DEFER) + return ERR_PTR(rc); - *mac = NULL; + eth_zero_addr(mac); } plat->phy_interface = device_get_phy_mode(&pdev->dev); @@ -627,7 +627,7 @@ void stmmac_remove_config_dt(struct platform_device *pdev, } #else struct plat_stmmacenet_data * -stmmac_probe_config_dt(struct platform_device *pdev, const char **mac) +stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) { return ERR_PTR(-EINVAL); } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h index 3a4663b7b460..3fff3f59d73d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h @@ -12,7 +12,7 @@ #include "stmmac.h" struct plat_stmmacenet_data * -stmmac_probe_config_dt(struct platform_device *pdev, const char **mac); +stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac); void stmmac_remove_config_dt(struct platform_device 
*pdev, struct plat_stmmacenet_data *plat); diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 638d7b03be4b..6a67b026df0b 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1824,7 +1824,6 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common) for_each_child_of_node(node, port_np) { struct am65_cpsw_port *port; - const void *mac_addr; u32 port_id; /* it is not a slave port node, continue */ @@ -1903,15 +1902,15 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common) return ret; } - mac_addr = of_get_mac_address(port_np); - if (!IS_ERR(mac_addr)) { - ether_addr_copy(port->slave.mac_addr, mac_addr); - } else if (am65_cpsw_am654_get_efuse_macid(port_np, - port->port_id, - port->slave.mac_addr) || - !is_valid_ether_addr(port->slave.mac_addr)) { - random_ether_addr(port->slave.mac_addr); - dev_err(dev, "Use random MAC address\n"); + ret = of_get_mac_address(port_np, port->slave.mac_addr); + if (ret) { + am65_cpsw_am654_get_efuse_macid(port_np, + port->port_id, + port->slave.mac_addr); + if (!is_valid_ether_addr(port->slave.mac_addr)) { + random_ether_addr(port->slave.mac_addr); + dev_err(dev, "Use random MAC address\n"); + } } } of_node_put(node); diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 074702af3dc6..c0cd7de88316 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -1296,7 +1296,6 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, for_each_available_child_of_node(node, slave_node) { struct cpsw_slave_data *slave_data = data->slave_data + i; - const void *mac_addr = NULL; int lenp; const __be32 *parp; @@ -1368,10 +1367,8 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, } no_phy_slave: - mac_addr = of_get_mac_address(slave_node); - if (!IS_ERR(mac_addr)) { - ether_addr_copy(slave_data->mac_addr, mac_addr); - } else { + ret = 
of_get_mac_address(slave_node, slave_data->mac_addr); + if (ret) { ret = ti_cm_get_macid(&pdev->dev, i, slave_data->mac_addr); if (ret) diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 0751f77de2c7..69b7a4e0220a 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -1257,7 +1257,6 @@ static int cpsw_probe_dt(struct cpsw_common *cpsw) for_each_child_of_node(tmp_node, port_np) { struct cpsw_slave_data *slave_data; - const void *mac_addr; u32 port_id; ret = of_property_read_u32(port_np, "reg", &port_id); @@ -1316,10 +1315,8 @@ static int cpsw_probe_dt(struct cpsw_common *cpsw) goto err_node_put; } - mac_addr = of_get_mac_address(port_np); - if (!IS_ERR(mac_addr)) { - ether_addr_copy(slave_data->mac_addr, mac_addr); - } else { + ret = of_get_mac_address(port_np, slave_data->mac_addr); + if (ret) { ret = ti_cm_get_macid(dev, port_id - 1, slave_data->mac_addr); if (ret) diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index c7031e1960d4..14e7da7d302f 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -1687,7 +1687,6 @@ davinci_emac_of_get_pdata(struct platform_device *pdev, struct emac_priv *priv) const struct of_device_id *match; const struct emac_platform_data *auxdata; struct emac_platform_data *pdata = NULL; - const u8 *mac_addr; if (!IS_ENABLED(CONFIG_OF) || !pdev->dev.of_node) return dev_get_platdata(&pdev->dev); @@ -1699,11 +1698,8 @@ davinci_emac_of_get_pdata(struct platform_device *pdev, struct emac_priv *priv) np = pdev->dev.of_node; pdata->version = EMAC_VERSION_2; - if (!is_valid_ether_addr(pdata->mac_addr)) { - mac_addr = of_get_mac_address(np); - if (!IS_ERR(mac_addr)) - ether_addr_copy(pdata->mac_addr, mac_addr); - } + if (!is_valid_ether_addr(pdata->mac_addr)) + of_get_mac_address(np, pdata->mac_addr); of_property_read_u32(np, "ti,davinci-ctrl-reg-offset", &pdata->ctrl_reg_offset); 
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index d7a144b4a09f..9030e619e543 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -1966,7 +1966,6 @@ static int netcp_create_interface(struct netcp_device *netcp_device, struct resource res; void __iomem *efuse = NULL; u32 efuse_mac = 0; - const void *mac_addr; u8 efuse_mac_addr[6]; u32 temp[2]; int ret = 0; @@ -2036,10 +2035,8 @@ static int netcp_create_interface(struct netcp_device *netcp_device, devm_iounmap(dev, efuse); devm_release_mem_region(dev, res.start, size); } else { - mac_addr = of_get_mac_address(node_interface); - if (!IS_ERR(mac_addr)) - ether_addr_copy(ndev->dev_addr, mac_addr); - else + ret = of_get_mac_address(node_interface, ndev->dev_addr); + if (ret) eth_random_addr(ndev->dev_addr); } diff --git a/drivers/net/ethernet/wiznet/w5100-spi.c b/drivers/net/ethernet/wiznet/w5100-spi.c index 2b4126d2427d..2b84848dc26a 100644 --- a/drivers/net/ethernet/wiznet/w5100-spi.c +++ b/drivers/net/ethernet/wiznet/w5100-spi.c @@ -423,8 +423,14 @@ static int w5100_spi_probe(struct spi_device *spi) const struct of_device_id *of_id; const struct w5100_ops *ops; kernel_ulong_t driver_data; + const void *mac = NULL; + u8 tmpmac[ETH_ALEN]; int priv_size; - const void *mac = of_get_mac_address(spi->dev.of_node); + int ret; + + ret = of_get_mac_address(spi->dev.of_node, tmpmac); + if (!ret) + mac = tmpmac; if (spi->dev.of_node) { of_id = of_match_device(w5100_of_match, &spi->dev); diff --git a/drivers/net/ethernet/wiznet/w5100.c b/drivers/net/ethernet/wiznet/w5100.c index c0d181a7f83a..ec5db481c9cd 100644 --- a/drivers/net/ethernet/wiznet/w5100.c +++ b/drivers/net/ethernet/wiznet/w5100.c @@ -1157,7 +1157,7 @@ int w5100_probe(struct device *dev, const struct w5100_ops *ops, INIT_WORK(&priv->setrx_work, w5100_setrx_work); INIT_WORK(&priv->restart_work, w5100_restart_work); - if (!IS_ERR_OR_NULL(mac_addr)) + if (mac_addr) 
memcpy(ndev->dev_addr, mac_addr, ETH_ALEN); else eth_hw_addr_random(ndev); diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 030185301014..a1f5f07f4ca9 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -438,7 +438,7 @@ static void temac_do_set_mac_address(struct net_device *ndev) static int temac_init_mac_address(struct net_device *ndev, const void *address) { - ether_addr_copy(ndev->dev_addr, address); + memcpy(ndev->dev_addr, address, ETH_ALEN); if (!is_valid_ether_addr(ndev->dev_addr)) eth_hw_addr_random(ndev); temac_do_set_mac_address(ndev); @@ -1351,7 +1351,7 @@ static int temac_probe(struct platform_device *pdev) struct device_node *temac_np = dev_of_node(&pdev->dev), *dma_np; struct temac_local *lp; struct net_device *ndev; - const void *addr; + u8 addr[ETH_ALEN]; __be32 *p; bool little_endian; int rc = 0; @@ -1542,8 +1542,8 @@ static int temac_probe(struct platform_device *pdev) if (temac_np) { /* Retrieve the MAC address */ - addr = of_get_mac_address(temac_np); - if (IS_ERR(addr)) { + rc = of_get_mac_address(temac_np, addr); + if (rc) { dev_err(&pdev->dev, "could not find MAC address\n"); return -ENODEV; } diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index feb1aa4ec927..b508c9453f40 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1835,8 +1835,8 @@ static int axienet_probe(struct platform_device *pdev) struct device_node *np; struct axienet_local *lp; struct net_device *ndev; - const void *mac_addr; struct resource *ethres; + u8 mac_addr[ETH_ALEN]; int addr_width = 32; u32 value; @@ -2062,13 +2062,14 @@ static int axienet_probe(struct platform_device *pdev) dev_info(&pdev->dev, "Ethernet core IRQ not defined\n"); /* Retrieve the MAC address */ - mac_addr = of_get_mac_address(pdev->dev.of_node); - 
if (IS_ERR(mac_addr)) { - dev_warn(&pdev->dev, "could not find MAC address property: %ld\n", - PTR_ERR(mac_addr)); - mac_addr = NULL; + ret = of_get_mac_address(pdev->dev.of_node, mac_addr); + if (!ret) { + axienet_set_mac_address(ndev, mac_addr); + } else { + dev_warn(&pdev->dev, "could not find MAC address property: %d\n", + ret); + axienet_set_mac_address(ndev, NULL); } - axienet_set_mac_address(ndev, mac_addr); lp->coalesce_count_rx = XAXIDMA_DFT_RX_THRESHOLD; lp->coalesce_count_tx = XAXIDMA_DFT_TX_THRESHOLD; diff --git a/drivers/net/ethernet/xilinx/xilinx_emaclite.c b/drivers/net/ethernet/xilinx/xilinx_emaclite.c index 007840d4a807..d9d58a7dabee 100644 --- a/drivers/net/ethernet/xilinx/xilinx_emaclite.c +++ b/drivers/net/ethernet/xilinx/xilinx_emaclite.c @@ -1115,7 +1115,6 @@ static int xemaclite_of_probe(struct platform_device *ofdev) struct net_device *ndev = NULL; struct net_local *lp = NULL; struct device *dev = &ofdev->dev; - const void *mac_address; int rc = 0; @@ -1157,12 +1156,9 @@ static int xemaclite_of_probe(struct platform_device *ofdev) lp->next_rx_buf_to_use = 0x0; lp->tx_ping_pong = get_bool(ofdev, "xlnx,tx-ping-pong"); lp->rx_ping_pong = get_bool(ofdev, "xlnx,rx-ping-pong"); - mac_address = of_get_mac_address(ofdev->dev.of_node); - if (!IS_ERR(mac_address)) { - /* Set the MAC address. 
*/ - ether_addr_copy(ndev->dev_addr, mac_address); - } else { + rc = of_get_mac_address(ofdev->dev.of_node, ndev->dev_addr); + if (rc) { dev_warn(dev, "No MAC address found, using random\n"); eth_hw_addr_random(ndev); } diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index 01f9c26f9bf3..e9a36dd7144f 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -617,7 +617,6 @@ static int ath9k_of_init(struct ath_softc *sc) struct ath_hw *ah = sc->sc_ah; struct ath_common *common = ath9k_hw_common(ah); enum ath_bus_type bus_type = common->bus_ops->ath_bus_type; - const char *mac; char eeprom_name[100]; int ret; @@ -640,9 +639,7 @@ static int ath9k_of_init(struct ath_softc *sc) ah->ah_flags |= AH_NO_EEP_SWAP; } - mac = of_get_mac_address(np); - if (!IS_ERR(mac)) - ether_addr_copy(common->macaddr, mac); + of_get_mac_address(np, common->macaddr); return 0; } diff --git a/drivers/net/wireless/mediatek/mt76/eeprom.c b/drivers/net/wireless/mediatek/mt76/eeprom.c index 665b54c5c8ae..6d895738222a 100644 --- a/drivers/net/wireless/mediatek/mt76/eeprom.c +++ b/drivers/net/wireless/mediatek/mt76/eeprom.c @@ -91,16 +91,9 @@ void mt76_eeprom_override(struct mt76_phy *phy) { struct mt76_dev *dev = phy->dev; - -#ifdef CONFIG_OF struct device_node *np = dev->dev->of_node; - const u8 *mac = NULL; - if (np) - mac = of_get_mac_address(np); - if (!IS_ERR_OR_NULL(mac)) - ether_addr_copy(phy->macaddr, mac); -#endif + of_get_mac_address(np, phy->macaddr); if (!is_valid_ether_addr(phy->macaddr)) { eth_random_addr(phy->macaddr); diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c index 61a4f1ad31e2..e95c101c2711 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c @@ -989,11 +989,7 @@ static void rt2x00lib_rate(struct ieee80211_rate *entry, void rt2x00lib_set_mac_address(struct rt2x00_dev 
*rt2x00dev, u8 *eeprom_mac_addr) { - const char *mac_addr; - - mac_addr = of_get_mac_address(rt2x00dev->dev->of_node); - if (!IS_ERR(mac_addr)) - ether_addr_copy(eeprom_mac_addr, mac_addr); + of_get_mac_address(rt2x00dev->dev->of_node, eeprom_mac_addr); if (!is_valid_ether_addr(eeprom_mac_addr)) { eth_random_addr(eeprom_mac_addr); diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c index bc0a27de69d4..cb77b774bf76 100644 --- a/drivers/of/of_net.c +++ b/drivers/of/of_net.c @@ -45,42 +45,35 @@ int of_get_phy_mode(struct device_node *np, phy_interface_t *interface) } EXPORT_SYMBOL_GPL(of_get_phy_mode); -static const void *of_get_mac_addr(struct device_node *np, const char *name) +static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr) { struct property *pp = of_find_property(np, name, NULL); - if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) - return pp->value; - return NULL; + if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) { + memcpy(addr, pp->value, ETH_ALEN); + return 0; + } + return -ENODEV; } -static const void *of_get_mac_addr_nvmem(struct device_node *np) +static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) { - int ret; - const void *mac; - u8 nvmem_mac[ETH_ALEN]; struct platform_device *pdev = of_find_device_by_node(np); + int ret; if (!pdev) - return ERR_PTR(-ENODEV); + return -ENODEV; - ret = nvmem_get_mac_address(&pdev->dev, &nvmem_mac); - if (ret) { - put_device(&pdev->dev); - return ERR_PTR(ret); - } - - mac = devm_kmemdup(&pdev->dev, nvmem_mac, ETH_ALEN, GFP_KERNEL); + ret = nvmem_get_mac_address(&pdev->dev, addr); put_device(&pdev->dev); - if (!mac) - return ERR_PTR(-ENOMEM); - return mac; + return ret; } /** * of_get_mac_address() * @np: Caller's Device Node + * @addr: Pointer to a six-byte array for the result * * Search the device tree for the best MAC address to use. 
'mac-address' is * checked first, because that is supposed to contain to "most recent" MAC @@ -101,24 +94,27 @@ static const void *of_get_mac_addr_nvmem(struct device_node *np) * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists * but is all zeros. * - * Return: Will be a valid pointer on success and ERR_PTR in case of error. + * Return: 0 on success and errno in case of error. */ -const void *of_get_mac_address(struct device_node *np) +int of_get_mac_address(struct device_node *np, u8 *addr) { - const void *addr; + int ret; + + if (!np) + return -ENODEV; - addr = of_get_mac_addr(np, "mac-address"); - if (addr) - return addr; + ret = of_get_mac_addr(np, "mac-address", addr); + if (!ret) + return 0; - addr = of_get_mac_addr(np, "local-mac-address"); - if (addr) - return addr; + ret = of_get_mac_addr(np, "local-mac-address", addr); + if (!ret) + return 0; - addr = of_get_mac_addr(np, "address"); - if (addr) - return addr; + ret = of_get_mac_addr(np, "address", addr); + if (!ret) + return 0; - return of_get_mac_addr_nvmem(np); + return of_get_mac_addr_nvmem(np, addr); } EXPORT_SYMBOL(of_get_mac_address); diff --git a/drivers/staging/octeon/ethernet.c b/drivers/staging/octeon/ethernet.c index 5dea6e96ec90..da7c2cd8ebb8 100644 --- a/drivers/staging/octeon/ethernet.c +++ b/drivers/staging/octeon/ethernet.c @@ -407,14 +407,10 @@ static int cvm_oct_common_set_mac_address(struct net_device *dev, void *addr) int cvm_oct_common_init(struct net_device *dev) { struct octeon_ethernet *priv = netdev_priv(dev); - const u8 *mac = NULL; + int ret; - if (priv->of_node) - mac = of_get_mac_address(priv->of_node); - - if (!IS_ERR_OR_NULL(mac)) - ether_addr_copy(dev->dev_addr, mac); - else + ret = of_get_mac_address(priv->of_node, dev->dev_addr); + if (ret) eth_hw_addr_random(dev); /* diff --git a/drivers/staging/wfx/main.c b/drivers/staging/wfx/main.c index e7bc1988124a..4b9fdf99981b 100644 --- a/drivers/staging/wfx/main.c +++ b/drivers/staging/wfx/main.c @@ 
-334,7 +334,6 @@ int wfx_probe(struct wfx_dev *wdev) { int i; int err; - const void *macaddr; struct gpio_desc *gpio_saved; // During first part of boot, gpio_wakeup cannot yet been used. So @@ -423,9 +422,9 @@ int wfx_probe(struct wfx_dev *wdev) for (i = 0; i < ARRAY_SIZE(wdev->addresses); i++) { eth_zero_addr(wdev->addresses[i].addr); - macaddr = of_get_mac_address(wdev->dev->of_node); - if (!IS_ERR_OR_NULL(macaddr)) { - ether_addr_copy(wdev->addresses[i].addr, macaddr); + err = of_get_mac_address(wdev->dev->of_node, + wdev->addresses[i].addr); + if (!err) { wdev->addresses[i].addr[ETH_ALEN - 1] += i; } else { ether_addr_copy(wdev->addresses[i].addr, diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 71bbfcf3adcd..daef3b0d9270 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -13,7 +13,7 @@ struct net_device; extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface); -extern const void *of_get_mac_address(struct device_node *np); +extern int of_get_mac_address(struct device_node *np, u8 *mac); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else static inline int of_get_phy_mode(struct device_node *np, @@ -22,9 +22,9 @@ static inline int of_get_phy_mode(struct device_node *np, return -ENODEV; } -static inline const void *of_get_mac_address(struct device_node *np) +static inline int of_get_mac_address(struct device_node *np, u8 *mac) { - return ERR_PTR(-ENODEV); + return -ENODEV; } static inline struct net_device *of_find_net_device_by_node(struct device_node *np) diff --git a/include/net/dsa.h b/include/net/dsa.h index 57b2c49f72f4..1259b0f40684 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -260,7 +260,7 @@ struct dsa_port { unsigned int index; const char *name; struct dsa_port *cpu_dp; - const char *mac; + u8 mac[ETH_ALEN]; struct device_node *dn; unsigned int ageing_time; bool vlan_filtering; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 
3c3e56a1f34d..d7c22e3a1fbf 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -392,7 +392,7 @@ static int dsa_port_setup(struct dsa_port *dp) break; case DSA_PORT_TYPE_USER: - dp->mac = of_get_mac_address(dp->dn); + of_get_mac_address(dp->dn, dp->mac); err = dsa_slave_create(dp); if (err) break; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 995e0e16f295..9300cb66e500 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1896,7 +1896,7 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->hw_features |= NETIF_F_HW_TC; slave_dev->features |= NETIF_F_LLTX; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; - if (!IS_ERR_OR_NULL(port->mac)) + if (!is_zero_ether_addr(port->mac)) ether_addr_copy(slave_dev->dev_addr, port->mac); else eth_hw_addr_inherit(slave_dev, master); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 933b427122be..9cce612e8976 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -511,13 +511,14 @@ unsigned char * __weak arch_get_platform_mac_address(void) int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) { - const unsigned char *addr = NULL; + unsigned char *addr; + int ret; - if (dev->of_node) - addr = of_get_mac_address(dev->of_node); - if (IS_ERR_OR_NULL(addr)) - addr = arch_get_platform_mac_address(); + ret = of_get_mac_address(dev->of_node, mac_addr); + if (!ret) + return 0; + addr = arch_get_platform_mac_address(); if (!addr) return -ENODEV; -- cgit v1.2.3 From 441e8c66b23e027c00ccebd70df9fd933918eefe Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 13 Apr 2021 11:16:06 +0200 Subject: bpf: Return target info when a tracing bpf_link is queried MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is currently no way to discover the target of a tracing program attachment after the fact. Add this information to bpf_link_info and return it when querying the bpf_link fd. 
Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210413091607.58945-1-toke@redhat.com --- include/linux/bpf_verifier.h | 9 +++++++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 3 +++ tools/include/uapi/linux/bpf.h | 2 ++ 4 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 51c2ffa3d901..6023a1367853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -487,6 +487,15 @@ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } +/* unpack the IDs from the key as constructed above */ +static inline void bpf_trampoline_unpack_key(u64 key, u32 *obj_id, u32 *btf_id) +{ + if (obj_id) + *obj_id = key >> 32; + if (btf_id) + *btf_id = key & 0x7FFFFFFF; +} + int bpf_check_attach_target(struct bpf_verifier_log *log, const struct bpf_prog *prog, const struct bpf_prog *tgt_prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85c924bc21b1..df164a44bb41 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5416,6 +5416,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6428634da57e..fd495190115e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2551,6 +2551,9 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, container_of(link, struct bpf_tracing_link, link); info->tracing.attach_type = tr_link->attach_type; + bpf_trampoline_unpack_key(tr_link->trampoline->key, + &info->tracing.target_obj_id, + &info->tracing.target_btf_id); return 0; } diff --git 
a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85c924bc21b1..df164a44bb41 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5416,6 +5416,8 @@ struct bpf_link_info { } raw_tracepoint; struct { __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ } tracing; struct { __u64 cgroup_id; -- cgit v1.2.3 From 6308a5f06be08f3ea1f1a895a9ef54c7b65c4c35 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 2 Mar 2021 21:27:47 +0200 Subject: net/mlx5: E-Switch, Make vport number u16 Vport number is 16-bit field in hardware. Make it u16. Move location of vport in the structure so that it reduces a hole in the structure. Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 3 ++- include/linux/mlx5/eswitch.h | 3 +-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index c7a73dbd64b4..a4b9f78bf4d6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -152,7 +152,6 @@ enum mlx5_eswitch_vport_event { struct mlx5_vport { struct mlx5_core_dev *dev; - int vport; struct hlist_head uc_list[MLX5_L2_ADDR_HASH_SIZE]; struct hlist_head mc_list[MLX5_L2_ADDR_HASH_SIZE]; struct mlx5_flow_handle *promisc_rule; @@ -174,6 +173,7 @@ struct mlx5_vport { u32 max_rate; } qos; + u16 vport; bool enabled; enum mlx5_eswitch_vport_event enabled_events; struct devlink_port *dl_port; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 1f58e84bdfc6..bbb707117296 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -986,12 +986,13 @@ static void mlx5_eswitch_del_send_to_vport_meta_rules(struct mlx5_eswitch *esw) static int mlx5_eswitch_add_send_to_vport_meta_rules(struct mlx5_eswitch *esw) { - int num_vfs, vport_num, rule_idx = 0, err = 0; struct mlx5_flow_destination dest = {}; struct mlx5_flow_act flow_act = {0}; + int num_vfs, rule_idx = 0, err = 0; struct mlx5_flow_handle *flow_rule; struct mlx5_flow_handle **flows; struct mlx5_flow_spec *spec; + u16 vport_num; num_vfs = esw->esw_funcs.num_vfs; flows = kvzalloc(num_vfs * sizeof(*flows), GFP_KERNEL); diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 429a710c5a99..9cf1da2883c6 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -152,8 +152,7 @@ mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw) }; static inline u32 -mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, - int vport_num) +mlx5_eswitch_get_vport_metadata_for_match(struct mlx5_eswitch *esw, u16 vport_num) { return 0; }; -- cgit v1.2.3 From f4da56529da602010979e8497d1f02eaf5df8883 Mon Sep 17 00:00:00 2001 From: Tan Tee Min Date: Wed, 14 Apr 2021 08:16:17 +0800 Subject: net: stmmac: Add support for external trigger timestamping The Synopsys MAC controller supports an auxiliary snapshot feature that allows the user to store a snapshot of the system time based on an external event. This patch adds support for the above-mentioned feature. Users will be able to trigger capturing of the time snapshot from user space using an application such as testptp or any other application that uses the PTP_EXTTS_REQUEST ioctl request. Cc: Richard Cochran Signed-off-by: Tan Tee Min Co-developed-by: Wong Vee Khee Signed-off-by: Wong Vee Khee Acked-by: Richard Cochran Signed-off-by: David S.
Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 10 ++++++ drivers/net/ethernet/stmicro/stmmac/hwif.h | 5 +++ drivers/net/ethernet/stmicro/stmmac/stmmac.h | 3 ++ .../net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 39 +++++++++++++++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 40 +++++++++++++++++++++- drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h | 1 + include/linux/stmmac.h | 2 ++ 8 files changed, 101 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 60566598d644..ec140fc4a0f5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -296,6 +296,13 @@ static int intel_crosststamp(ktime_t *device, intel_priv = priv->plat->bsp_priv; + /* Both internal crosstimestamping and external triggered event + * timestamping cannot be run concurrently. 
+ */ + if (priv->plat->ext_snapshot_en) + return -EBUSY; + + mutex_lock(&priv->aux_ts_lock); /* Enable Internal snapshot trigger */ acr_value = readl(ptpaddr + PTP_ACR); acr_value &= ~PTP_ACR_MASK; @@ -321,6 +328,8 @@ static int intel_crosststamp(ktime_t *device, acr_value = readl(ptpaddr + PTP_ACR); acr_value |= PTP_ACR_ATSFC; writel(acr_value, ptpaddr + PTP_ACR); + /* Release the mutex */ + mutex_unlock(&priv->aux_ts_lock); /* Trigger Internal snapshot signal * Create a rising edge by just toggle the GPO1 to low @@ -520,6 +529,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->mdio_bus_data->phy_mask |= 1 << INTEL_MGBE_XPCS_ADDR; plat->int_snapshot_num = AUX_SNAPSHOT1; + plat->ext_snapshot_num = AUX_SNAPSHOT0; plat->has_crossts = true; plat->crosststamp = intel_crosststamp; diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 2b5022ef1e52..2cc91759b91f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -504,6 +504,8 @@ struct stmmac_ops { #define stmmac_fpe_irq_status(__priv, __args...) \ stmmac_do_callback(__priv, mac, fpe_irq_status, __args) +struct stmmac_priv; + /* PTP and HW Timer helpers */ struct stmmac_hwtimestamp { void (*config_hw_tstamping) (void __iomem *ioaddr, u32 data); @@ -515,6 +517,7 @@ struct stmmac_hwtimestamp { int add_sub, int gmac4); void (*get_systime) (void __iomem *ioaddr, u64 *systime); void (*get_ptptime)(void __iomem *ioaddr, u64 *ptp_time); + void (*timestamp_interrupt)(struct stmmac_priv *priv); }; #define stmmac_config_hw_tstamping(__priv, __args...) \ @@ -531,6 +534,8 @@ struct stmmac_hwtimestamp { stmmac_do_void_callback(__priv, ptp, get_systime, __args) #define stmmac_get_ptptime(__priv, __args...) \ stmmac_do_void_callback(__priv, ptp, get_ptptime, __args) +#define stmmac_timestamp_interrupt(__priv, __args...) 
\ + stmmac_do_void_callback(__priv, ptp, timestamp_interrupt, __args) /* Helpers to manage the descriptors for chain and ring modes */ struct stmmac_mode_ops { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index b8a42260066d..b6cd43eda7ac 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -250,6 +250,9 @@ struct stmmac_priv { int use_riwt; int irq_wake; spinlock_t ptp_lock; + /* Protects auxiliary snapshot registers from concurrent access. */ + struct mutex aux_ts_lock; + void __iomem *mmcaddr; void __iomem *ptpaddr; unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 113c51bcc0b5..074e2cdfb0fa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -12,8 +12,11 @@ #include #include #include +#include #include "common.h" #include "stmmac_ptp.h" +#include "dwmac4.h" +#include "stmmac.h" static void config_hw_tstamping(void __iomem *ioaddr, u32 data) { @@ -163,6 +166,41 @@ static void get_ptptime(void __iomem *ptpaddr, u64 *ptp_time) *ptp_time = ns; } +static void timestamp_interrupt(struct stmmac_priv *priv) +{ + u32 num_snapshot, ts_status, tsync_int; + struct ptp_clock_event event; + unsigned long flags; + u64 ptp_time; + int i; + + tsync_int = readl(priv->ioaddr + GMAC_INT_STATUS) & GMAC_INT_TSIE; + + if (!tsync_int) + return; + + /* Read timestamp status to clear interrupt from either external + * timestamp or start/end of PPS. 
+ */ + ts_status = readl(priv->ioaddr + GMAC_TIMESTAMP_STATUS); + + if (!priv->plat->ext_snapshot_en) + return; + + num_snapshot = (ts_status & GMAC_TIMESTAMP_ATSNS_MASK) >> + GMAC_TIMESTAMP_ATSNS_SHIFT; + + for (i = 0; i < num_snapshot; i++) { + spin_lock_irqsave(&priv->ptp_lock, flags); + get_ptptime(priv->ptpaddr, &ptp_time); + spin_unlock_irqrestore(&priv->ptp_lock, flags); + event.type = PTP_CLOCK_EXTTS; + event.index = 0; + event.timestamp = ptp_time; + ptp_clock_event(priv->ptp_clock, &event); + } +} + const struct stmmac_hwtimestamp stmmac_ptp = { .config_hw_tstamping = config_hw_tstamping, .init_systime = init_systime, @@ -171,4 +209,5 @@ const struct stmmac_hwtimestamp stmmac_ptp = { .adjust_systime = adjust_systime, .get_systime = get_systime, .get_ptptime = get_ptptime, + .timestamp_interrupt = timestamp_interrupt, }; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index e3e22200a4fd..3a5ca5833ce1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5687,6 +5687,8 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv) else netif_carrier_off(priv->dev); } + + stmmac_timestamp_interrupt(priv, priv); } } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index b164ae22e35f..4e86cdf2bc9f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -135,7 +135,10 @@ static int stmmac_enable(struct ptp_clock_info *ptp, { struct stmmac_priv *priv = container_of(ptp, struct stmmac_priv, ptp_clock_ops); + void __iomem *ptpaddr = priv->ptpaddr; + void __iomem *ioaddr = priv->hw->pcsr; struct stmmac_pps_cfg *cfg; + u32 intr_value, acr_value; int ret = -EOPNOTSUPP; unsigned long flags; @@ -159,6 +162,37 @@ static int stmmac_enable(struct ptp_clock_info *ptp, priv->systime_flags); 
spin_unlock_irqrestore(&priv->ptp_lock, flags); break; + case PTP_CLK_REQ_EXTTS: + priv->plat->ext_snapshot_en = on; + mutex_lock(&priv->aux_ts_lock); + acr_value = readl(ptpaddr + PTP_ACR); + acr_value &= ~PTP_ACR_MASK; + if (on) { + /* Enable External snapshot trigger */ + acr_value |= priv->plat->ext_snapshot_num; + acr_value |= PTP_ACR_ATSFC; + netdev_dbg(priv->dev, "Auxiliary Snapshot %d enabled.\n", + priv->plat->ext_snapshot_num >> + PTP_ACR_ATSEN_SHIFT); + /* Enable Timestamp Interrupt */ + intr_value = readl(ioaddr + GMAC_INT_EN); + intr_value |= GMAC_INT_TSIE; + writel(intr_value, ioaddr + GMAC_INT_EN); + + } else { + netdev_dbg(priv->dev, "Auxiliary Snapshot %d disabled.\n", + priv->plat->ext_snapshot_num >> + PTP_ACR_ATSEN_SHIFT); + /* Disable Timestamp Interrupt */ + intr_value = readl(ioaddr + GMAC_INT_EN); + intr_value &= ~GMAC_INT_TSIE; + writel(intr_value, ioaddr + GMAC_INT_EN); + } + writel(acr_value, ptpaddr + PTP_ACR); + mutex_unlock(&priv->aux_ts_lock); + ret = 0; + break; + default: break; } @@ -202,7 +236,7 @@ static struct ptp_clock_info stmmac_ptp_clock_ops = { .name = "stmmac ptp", .max_adj = 62500000, .n_alarm = 0, - .n_ext_ts = 0, + .n_ext_ts = 0, /* will be overwritten in stmmac_ptp_register */ .n_per_out = 0, /* will be overwritten in stmmac_ptp_register */ .n_pins = 0, .pps = 0, @@ -237,8 +271,10 @@ void stmmac_ptp_register(struct stmmac_priv *priv) stmmac_ptp_clock_ops.max_adj = priv->plat->ptp_max_adj; stmmac_ptp_clock_ops.n_per_out = priv->dma_cap.pps_out_num; + stmmac_ptp_clock_ops.n_ext_ts = priv->dma_cap.aux_snapshot_n; spin_lock_init(&priv->ptp_lock); + mutex_init(&priv->aux_ts_lock); priv->ptp_clock_ops = stmmac_ptp_clock_ops; priv->ptp_clock = ptp_clock_register(&priv->ptp_clock_ops, @@ -264,4 +300,6 @@ void stmmac_ptp_unregister(struct stmmac_priv *priv) pr_debug("Removed PTP HW clock successfully on %s\n", priv->dev->name); } + + mutex_destroy(&priv->aux_ts_lock); } diff --git 
a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h index f88727ce4d30..53172a439810 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.h @@ -73,6 +73,7 @@ #define PTP_ACR_ATSEN1 BIT(5) /* Auxiliary Snapshot 1 Enable */ #define PTP_ACR_ATSEN2 BIT(6) /* Auxiliary Snapshot 2 Enable */ #define PTP_ACR_ATSEN3 BIT(7) /* Auxiliary Snapshot 3 Enable */ +#define PTP_ACR_ATSEN_SHIFT 5 /* Auxiliary Snapshot shift */ #define PTP_ACR_MASK GENMASK(7, 4) /* Aux Snapshot Mask */ #define PMC_ART_VALUE0 0x01 /* PMC_ART[15:0] timer value */ #define PMC_ART_VALUE1 0x02 /* PMC_ART[31:16] timer value */ diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index e338ef7abc00..97edb31d6310 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -238,6 +238,8 @@ struct plat_stmmacenet_data { struct pci_dev *pdev; bool has_crossts; int int_snapshot_num; + int ext_snapshot_num; + bool ext_snapshot_en; bool multi_msi_en; int msi_mac_vec; int msi_wol_vec; -- cgit v1.2.3 From c5797f8a64158f724238d13fa5a4b351b03fe42d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Apr 2021 15:53:13 -0700 Subject: ethtool: move ethtool_stats_init We'll need it for FEC stats as well. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/linux/ethtool.h | 6 ++++++ net/ethtool/pause.c | 6 ------ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 9f6f323af59a..069100b252bd 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -244,6 +244,12 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_STAT_NOT_SET (~0ULL) +static inline void ethtool_stats_init(u64 *stats, unsigned int n) +{ + while (n--) + stats[n] = ETHTOOL_STAT_NOT_SET; +} + /** * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames * @tx_pause_frames: transmitted pause frame count. Reported to user space diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 09998dc5c185..f1967c121278 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -21,12 +21,6 @@ const struct nla_policy ethnl_pause_get_policy[] = { NLA_POLICY_NESTED(ethnl_header_policy_stats), }; -static void ethtool_stats_init(u64 *stats, unsigned int n) -{ - while (n--) - stats[n] = ETHTOOL_STAT_NOT_SET; -} - static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) -- cgit v1.2.3 From be85dbfeb37c8c4d4344da2ee594d78034b82489 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Apr 2021 15:53:15 -0700 Subject: ethtool: add FEC statistics Similarly to pause statistics add stats for FEC. The IEEE standard mandates two sets of counters: - 30.5.1.1.17 aFECCorrectedBlocks - 30.5.1.1.18 aFECUncorrectableBlocks where block is a block of bits FEC operates on. Each of these counters is defined per lane (PCS instance). Multiple vendors provide the number of corrected _bits_ rather than/as well as blocks. This set adds the 2 standard-based block counters and an extra one for corrected bits. Counters are exposed to user space via netlink in new attributes.
Each attribute carries an array of u64s, first element is the total count, and the following ones are a per-lane break down. Much like with pause stats the operation will not fail when driver does not implement the get_fec_stats callback (nor can the driver fail the operation by returning an error). If stats can't be reported the relevant attributes will be empty. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 21 ++++++++ Documentation/networking/statistics.rst | 2 + include/linux/ethtool.h | 40 +++++++++++++++ include/uapi/linux/ethtool_netlink.h | 14 ++++++ net/ethtool/fec.c | 73 +++++++++++++++++++++++++++- 5 files changed, 149 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index bbecffc7b11a..f8219e2f489e 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1302,6 +1302,7 @@ Kernel response contents: ``ETHTOOL_A_FEC_MODES`` bitset configured modes ``ETHTOOL_A_FEC_AUTO`` bool FEC mode auto selection ``ETHTOOL_A_FEC_ACTIVE`` u32 index of active FEC mode + ``ETHTOOL_A_FEC_STATS`` nested FEC statistics ===================================== ====== ========================== ``ETHTOOL_A_FEC_ACTIVE`` is the bit index of the FEC link mode currently @@ -1315,6 +1316,26 @@ This is equivalent to the ``ETHTOOL_FEC_AUTO`` bit of the ioctl interface. ``ETHTOOL_A_FEC_MODES`` carry the current FEC configuration using link mode bits (rather than old ``ETHTOOL_FEC_*`` bits). +``ETHTOOL_A_FEC_STATS`` are reported if ``ETHTOOL_FLAG_STATS`` was set in +``ETHTOOL_A_HEADER_FLAGS``. +Each attribute carries an array of 64bit statistics. First entry in the array +contains the total number of events on the port, while the following entries +are counters corresponding to lanes/PCS instances. 
The number of entries in +the array will be: + ++--------------+---------------------------------------------+ +| `0` | device does not support FEC statistics | ++--------------+---------------------------------------------+ +| `1` | device does not support per-lane break down | ++--------------+---------------------------------------------+ +| `1 + #lanes` | device has full support for FEC stats | ++--------------+---------------------------------------------+ + +Drivers fill in the statistics in the following structure: + +.. kernel-doc:: include/linux/ethtool.h + :identifiers: ethtool_fec_stats + FEC_SET ======= diff --git a/Documentation/networking/statistics.rst b/Documentation/networking/statistics.rst index 234abedc29b2..b748fe44ee02 100644 --- a/Documentation/networking/statistics.rst +++ b/Documentation/networking/statistics.rst @@ -130,6 +130,7 @@ the `ETHTOOL_FLAG_STATS` flag in `ETHTOOL_A_HEADER_FLAGS`. Currently statistics are supported in the following commands: - `ETHTOOL_MSG_PAUSE_GET` + - `ETHTOOL_MSG_FEC_GET` debugfs ------- @@ -176,3 +177,4 @@ translated to netlink attributes when dumped. Drivers must not overwrite the statistics they don't report with 0. - ethtool_pause_stats() +- ethtool_fec_stats() diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 069100b252bd..112a85b57f1f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -269,6 +269,39 @@ struct ethtool_pause_stats { u64 rx_pause_frames; }; +#define ETHTOOL_MAX_LANES 8 + +/** + * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC + * @corrected_blocks: number of received blocks corrected by FEC + * Reported to user space as %ETHTOOL_A_FEC_STAT_CORRECTED. + * + * Equivalent to `30.5.1.1.17 aFECCorrectedBlocks` from the standard. + * + * @uncorrectable_blocks: number of received blocks FEC was not able to correct + * Reported to user space as %ETHTOOL_A_FEC_STAT_UNCORR. + * + * Equivalent to `30.5.1.1.18 aFECUncorrectableBlocks` from the standard. 
+ * + * @corrected_bits: number of bits corrected by FEC + * Similar to @corrected_blocks but counts individual bit changes, + * not entire FEC data blocks. This is a non-standard statistic. + * Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS. + * + * @lane: per-lane/PCS-instance counts as defined by the standard + * @total: error counts for the entire port, for drivers incapable of reporting + * per-lane stats + * + * Drivers should fill in either only total or per-lane statistics, core + * will take care of adding lane values up to produce the total. + */ +struct ethtool_fec_stats { + struct ethtool_fec_stat { + u64 total; + u64 lanes[ETHTOOL_MAX_LANES]; + } corrected_blocks, uncorrectable_blocks, corrected_bits; +}; + #define ETH_MODULE_EEPROM_PAGE_LEN 128 #define ETH_MODULE_MAX_I2C_ADDRESS 0x7f @@ -439,6 +472,11 @@ struct ethtool_module_eeprom { * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), * any change to them will be overwritten by kernel. Returns a negative * error code or zero. + * @get_fec_stats: Report FEC statistics. + * Core will sum up per-lane stats to get the total. + * Drivers must not zero statistics which they don't report. The stats + * structure is initialized to ETHTOOL_STAT_NOT_SET indicating driver does + * not report statistics. * @get_fecparam: Get the network device Forward Error Correction parameters. * @set_fecparam: Set the network device Forward Error Correction parameters. * @get_ethtool_phy_stats: Return extended statistics about the PHY device. 
@@ -544,6 +582,8 @@ struct ethtool_ops { struct ethtool_link_ksettings *); int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); + void (*get_fec_stats)(struct net_device *dev, + struct ethtool_fec_stats *fec_stats); int (*get_fecparam)(struct net_device *, struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 9612dcd48a6a..3a2b31ccbc5b 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -643,11 +643,25 @@ enum { ETHTOOL_A_FEC_MODES, /* bitset */ ETHTOOL_A_FEC_AUTO, /* u8 */ ETHTOOL_A_FEC_ACTIVE, /* u32 */ + ETHTOOL_A_FEC_STATS, /* nest - _A_FEC_STAT */ __ETHTOOL_A_FEC_CNT, ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) }; +enum { + ETHTOOL_A_FEC_STAT_UNSPEC, + ETHTOOL_A_FEC_STAT_PAD, + + ETHTOOL_A_FEC_STAT_CORRECTED, /* array, u64 */ + ETHTOOL_A_FEC_STAT_UNCORR, /* array, u64 */ + ETHTOOL_A_FEC_STAT_CORR_BITS, /* array, u64 */ + + /* add new constants above here */ + __ETHTOOL_A_FEC_STAT_CNT, + ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) +}; + /* MODULE EEPROM */ enum { diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index 3e7d091ee7aa..8738dafd5417 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -13,6 +13,10 @@ struct fec_reply_data { __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes); u32 active_fec; u8 fec_auto; + struct fec_stat_grp { + u64 stats[1 + ETHTOOL_MAX_LANES]; + u8 cnt; + } corr, uncorr, corr_bits; }; #define FEC_REPDATA(__reply_base) \ @@ -21,7 +25,7 @@ struct fec_reply_data { #define ETHTOOL_FEC_MASK ((ETHTOOL_FEC_LLRS << 1) - 1) const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = { - [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static void @@ -64,6 +68,28 @@ ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec, return 0; 
} +static void +fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats) +{ + int i; + + if (stats->lanes[0] == ETHTOOL_STAT_NOT_SET) { + grp->stats[0] = stats->total; + grp->cnt = stats->total != ETHTOOL_STAT_NOT_SET; + return; + } + + grp->cnt = 1; + grp->stats[0] = 0; + for (i = 0; i < ETHTOOL_MAX_LANES; i++) { + if (stats->lanes[i] == ETHTOOL_STAT_NOT_SET) + break; + + grp->stats[0] += stats->lanes[i]; + grp->stats[grp->cnt++] = stats->lanes[i]; + } +} + static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -82,6 +108,17 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base, ret = dev->ethtool_ops->get_fecparam(dev, &fec); if (ret) goto out_complete; + if (req_base->flags & ETHTOOL_FLAG_STATS && + dev->ethtool_ops->get_fec_stats) { + struct ethtool_fec_stats stats; + + ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats); + + fec_stats_recalc(&data->corr, &stats.corrected_blocks); + fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); + fec_stats_recalc(&data->corr_bits, &stats.corrected_bits); + } WARN_ON_ONCE(fec.reserved); @@ -120,9 +157,40 @@ static int fec_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ + if (req_base->flags & ETHTOOL_FLAG_STATS) + len += 3 * nla_total_size_64bit(sizeof(u64) * + (1 + ETHTOOL_MAX_LANES)); + return len; } +static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STATS); + if (!nest) + return -EMSGSIZE; + + if (nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORRECTED, + sizeof(u64) * data->corr.cnt, + data->corr.stats, ETHTOOL_A_FEC_STAT_PAD) || + nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_UNCORR, + sizeof(u64) * data->uncorr.cnt, + data->uncorr.stats, ETHTOOL_A_FEC_STAT_PAD) || + 
nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORR_BITS, + sizeof(u64) * data->corr_bits.cnt, + data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fec_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) @@ -143,6 +211,9 @@ static int fec_fill_reply(struct sk_buff *skb, nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec))) return -EMSGSIZE; + if (req_base->flags & ETHTOOL_FLAG_STATS && fec_put_stats(skb, data)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3 From 36830159acbeb9896d7684b5f52db7b22efa197f Mon Sep 17 00:00:00 2001 From: Moshe Tal Date: Mon, 15 Feb 2021 16:13:02 +0200 Subject: net/mlx5: Add register layout to support extended link state Add needed structure layouts and defines for pddr register (Port Diagnostics Database Register) and the troublshooting page. This will be used to get extended link state from the monitor opcode bits. 
Signed-off-by: Moshe Tal Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 50 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2da953ad02ed..4e531c2aab52 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -127,6 +127,7 @@ enum { MLX5_REG_PELC = 0x500e, MLX5_REG_PVLC = 0x500f, MLX5_REG_PCMR = 0x5041, + MLX5_REG_PDDR = 0x5031, MLX5_REG_PMLP = 0x5002, MLX5_REG_PPLM = 0x5023, MLX5_REG_PCAM = 0x507f, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1599deee0456..f2c51d6833c6 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9956,6 +9956,53 @@ struct mlx5_ifc_mirc_reg_bits { u8 reserved_at_20[0x20]; }; +struct mlx5_ifc_pddr_monitor_opcode_bits { + u8 reserved_at_0[0x10]; + u8 monitor_opcode[0x10]; +}; + +union mlx5_ifc_pddr_troubleshooting_page_status_opcode_auto_bits { + struct mlx5_ifc_pddr_monitor_opcode_bits pddr_monitor_opcode; + u8 reserved_at_0[0x20]; +}; + +enum { + /* Monitor opcodes */ + MLX5_PDDR_REG_TRBLSH_GROUP_OPCODE_MONITOR = 0x0, +}; + +struct mlx5_ifc_pddr_troubleshooting_page_bits { + u8 reserved_at_0[0x10]; + u8 group_opcode[0x10]; + + union mlx5_ifc_pddr_troubleshooting_page_status_opcode_auto_bits status_opcode; + + u8 reserved_at_40[0x20]; + + u8 status_message[59][0x20]; +}; + +union mlx5_ifc_pddr_reg_page_data_auto_bits { + struct mlx5_ifc_pddr_troubleshooting_page_bits pddr_troubleshooting_page; + u8 reserved_at_0[0x7c0]; +}; + +enum { + MLX5_PDDR_REG_PAGE_SELECT_TROUBLESHOOTING_INFO_PAGE = 0x1, +}; + +struct mlx5_ifc_pddr_reg_bits { + u8 reserved_at_0[0x8]; + u8 local_port[0x8]; + u8 pnat[0x2]; + u8 reserved_at_12[0xe]; + + u8 reserved_at_20[0x18]; + u8 page_select[0x8]; + + union mlx5_ifc_pddr_reg_page_data_auto_bits page_data; +}; + union 
mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_bufferx_reg_bits bufferx_reg; struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; @@ -9970,6 +10017,9 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_pamp_reg_bits pamp_reg; struct mlx5_ifc_paos_reg_bits paos_reg; struct mlx5_ifc_pcap_reg_bits pcap_reg; + struct mlx5_ifc_pddr_monitor_opcode_bits pddr_monitor_opcode; + struct mlx5_ifc_pddr_reg_bits pddr_reg; + struct mlx5_ifc_pddr_troubleshooting_page_bits pddr_troubleshooting_page; struct mlx5_ifc_peir_reg_bits peir_reg; struct mlx5_ifc_pelc_reg_bits pelc_reg; struct mlx5_ifc_pfcc_reg_bits pfcc_reg; -- cgit v1.2.3 From 2c4eca3ef7161f6632959c00c8eae182f4398901 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 14 Apr 2021 19:52:56 +0300 Subject: net: bridge: switchdev: include local flag in FDB notifications As explained in bugfix commit 6ab4c3117aec ("net: bridge: don't notify switchdev for local FDB addresses") as well as in this discussion: https://lore.kernel.org/netdev/20210117193009.io3nungdwuzmo5f7@skbuf/ the switchdev notifiers for FDB entries managed to have a zero-day bug, which was that drivers would not know what to do with local FDB entries, because they were not told that they are local. The bug fix was to simply not notify them of those addresses. Let us now add the 'is_local' bit to bridge FDB entries, and make all drivers ignore these entries by their own choice. Co-developed-by: Tobias Waldekranz Signed-off-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Reviewed-by: Grygorii Strashko Reviewed-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 4 ++-- drivers/net/ethernet/marvell/prestera/prestera_switchdev.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 5 +++-- drivers/net/ethernet/rocker/rocker_main.c | 4 ++-- drivers/net/ethernet/ti/am65-cpsw-switchdev.c | 4 ++-- drivers/net/ethernet/ti/cpsw_switchdev.c | 4 ++-- include/net/switchdev.h | 1 + net/bridge/br_switchdev.c | 3 +-- net/dsa/slave.c | 2 +- 9 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index 5250d51d783c..05de37c3b64c 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -2098,7 +2098,7 @@ static void dpaa2_switch_event_work(struct work_struct *work) switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: - if (!fdb_info->added_by_user) + if (!fdb_info->added_by_user || fdb_info->is_local) break; if (is_unicast_ether_addr(fdb_info->addr)) err = dpaa2_switch_port_fdb_add_uc(netdev_priv(dev), @@ -2113,7 +2113,7 @@ static void dpaa2_switch_event_work(struct work_struct *work) &fdb_info->info, NULL); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: - if (!fdb_info->added_by_user) + if (!fdb_info->added_by_user || fdb_info->is_local) break; if (is_unicast_ether_addr(fdb_info->addr)) dpaa2_switch_port_fdb_del_uc(netdev_priv(dev), fdb_info->addr); diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index 49e052273f30..cb564890a3dc 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -798,7 +798,7 @@ static void prestera_fdb_event_work(struct work_struct *work) switch (swdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: fdb_info = &swdev_work->fdb_info; - if (!fdb_info->added_by_user) 
+ if (!fdb_info->added_by_user || fdb_info->is_local) break; err = prestera_port_fdb_set(port, fdb_info, true); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index c1f05c17557d..eeccd586e781 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -2916,7 +2916,8 @@ mlxsw_sp_switchdev_bridge_nve_fdb_event(struct mlxsw_sp_switchdev_event_work * return; if (switchdev_work->event == SWITCHDEV_FDB_ADD_TO_DEVICE && - !switchdev_work->fdb_info.added_by_user) + (!switchdev_work->fdb_info.added_by_user || + switchdev_work->fdb_info.is_local)) return; if (!netif_running(dev)) @@ -2971,7 +2972,7 @@ static void mlxsw_sp_switchdev_bridge_fdb_event_work(struct work_struct *work) switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; - if (!fdb_info->added_by_user) + if (!fdb_info->added_by_user || fdb_info->is_local) break; err = mlxsw_sp_port_fdb_set(mlxsw_sp_port, fdb_info, true); if (err) diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 3473d296b2e2..a46633606cae 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2736,7 +2736,7 @@ static void rocker_switchdev_event_work(struct work_struct *work) switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; - if (!fdb_info->added_by_user) + if (!fdb_info->added_by_user || fdb_info->is_local) break; err = rocker_world_port_fdb_add(rocker_port, fdb_info); if (err) { @@ -2747,7 +2747,7 @@ static void rocker_switchdev_event_work(struct work_struct *work) break; case SWITCHDEV_FDB_DEL_TO_DEVICE: fdb_info = &switchdev_work->fdb_info; - if (!fdb_info->added_by_user) + if (!fdb_info->added_by_user || fdb_info->is_local) break; err = rocker_world_port_fdb_del(rocker_port, 
fdb_info); if (err) diff --git a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c index d93ffd8a08b0..23cfb91e9c4d 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c +++ b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c @@ -385,7 +385,7 @@ static void am65_cpsw_switchdev_event_work(struct work_struct *work) fdb->addr, fdb->vid, fdb->added_by_user, fdb->offloaded, port_id); - if (!fdb->added_by_user) + if (!fdb->added_by_user || fdb->is_local) break; if (memcmp(port->slave.mac_addr, (u8 *)fdb->addr, ETH_ALEN) == 0) port_id = HOST_PORT_NUM; @@ -401,7 +401,7 @@ static void am65_cpsw_switchdev_event_work(struct work_struct *work) fdb->addr, fdb->vid, fdb->added_by_user, fdb->offloaded, port_id); - if (!fdb->added_by_user) + if (!fdb->added_by_user || fdb->is_local) break; if (memcmp(port->slave.mac_addr, (u8 *)fdb->addr, ETH_ALEN) == 0) port_id = HOST_PORT_NUM; diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.c b/drivers/net/ethernet/ti/cpsw_switchdev.c index a72bb570756f..05a64fb7a04f 100644 --- a/drivers/net/ethernet/ti/cpsw_switchdev.c +++ b/drivers/net/ethernet/ti/cpsw_switchdev.c @@ -395,7 +395,7 @@ static void cpsw_switchdev_event_work(struct work_struct *work) fdb->addr, fdb->vid, fdb->added_by_user, fdb->offloaded, port); - if (!fdb->added_by_user) + if (!fdb->added_by_user || fdb->is_local) break; if (memcmp(priv->mac_addr, (u8 *)fdb->addr, ETH_ALEN) == 0) port = HOST_PORT_NUM; @@ -411,7 +411,7 @@ static void cpsw_switchdev_event_work(struct work_struct *work) fdb->addr, fdb->vid, fdb->added_by_user, fdb->offloaded, port); - if (!fdb->added_by_user) + if (!fdb->added_by_user || fdb->is_local) break; if (memcmp(priv->mac_addr, (u8 *)fdb->addr, ETH_ALEN) == 0) port = HOST_PORT_NUM; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 8c3218177136..f1a5a9a3634d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -209,6 +209,7 @@ struct switchdev_notifier_fdb_info { const 
unsigned char *addr; u16 vid; u8 added_by_user:1, + is_local:1, offloaded:1; }; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index c390f84adea2..a5e601e41cb9 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -114,13 +114,12 @@ br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type) .addr = fdb->key.addr.addr, .vid = fdb->key.vlan_id, .added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags), + .is_local = test_bit(BR_FDB_LOCAL, &fdb->flags), .offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags), }; if (!fdb->dst) return; - if (test_bit(BR_FDB_LOCAL, &fdb->flags)) - return; switch (type) { case RTM_DELNEIGH: diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 9300cb66e500..3ae67202fda2 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2329,7 +2329,7 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, fdb_info = ptr; if (dsa_slave_dev_check(dev)) { - if (!fdb_info->added_by_user) + if (!fdb_info->added_by_user || fdb_info->is_local) return NOTIFY_OK; dp = dsa_slave_to_port(dev); -- cgit v1.2.3 From 9a44c1cc63887627284ae232a9626a9f1cd066fc Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Fri, 16 Apr 2021 10:36:33 +0200 Subject: net: Add a WWAN subsystem This change introduces initial support for a WWAN framework. Given the complexity and heterogeneity of existing WWAN hardwares and interfaces, there is no strict definition of what a WWAN device is and how it should be represented. It's often a collection of multiple devices that perform the global WWAN feature (netdev, tty, chardev, etc). One usual way to expose modem controls and configuration is via high level protocols such as the well known AT command protocol, MBIM or QMI. The USB modems started to expose them as character devices, and user daemons such as ModemManager learnt to use them. This initial version adds the concept of WWAN port, which is a logical pipe to a modem control protocol. 
The protocols are exposed raw to the user via character devices, allowing straightforward support in existing tools (ModemManager, ofono...). The WWAN core takes care of the generic part, including character device management, and relies on port driver operations to receive/submit protocol data. Since the different devices exposing protocols for the same WWAN hardware do not necessarily know about each other (e.g. two different USB interfaces, PCI/MHI channel devices...) and can be created/removed in different orders, the WWAN core ensures that all WWAN ports contributing to the 'whole' WWAN feature are grouped under the same virtual WWAN device, relying on the provided parent device (e.g. mhi controller, USB device). It's a 'trick' I copied from Johannes's earlier WWAN subsystem proposal. This initial version is purposely minimalist: it essentially moves the generic part of the previously proposed mhi_wwan_ctrl driver inside a common WWAN framework, but the implementation is open and flexible enough to allow extension for further drivers. Signed-off-by: Loic Poulain Signed-off-by: David S.
Miller --- drivers/net/Kconfig | 2 + drivers/net/Makefile | 1 + drivers/net/wwan/Kconfig | 23 ++ drivers/net/wwan/Makefile | 7 + drivers/net/wwan/wwan_core.c | 552 +++++++++++++++++++++++++++++++++++++++++++ include/linux/wwan.h | 111 +++++++++ 6 files changed, 696 insertions(+) create mode 100644 drivers/net/wwan/Kconfig create mode 100644 drivers/net/wwan/Makefile create mode 100644 drivers/net/wwan/wwan_core.c create mode 100644 include/linux/wwan.h (limited to 'include') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 5895905b6aa1..74dc8e249faa 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -502,6 +502,8 @@ source "drivers/net/wan/Kconfig" source "drivers/net/ieee802154/Kconfig" +source "drivers/net/wwan/Kconfig" + config XEN_NETDEV_FRONTEND tristate "Xen network device frontend driver" depends on XEN diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 040e20b81317..7ffd2d03efaf 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_SUNGEM_PHY) += sungem_phy.o obj-$(CONFIG_WAN) += wan/ obj-$(CONFIG_WLAN) += wireless/ obj-$(CONFIG_IEEE802154) += ieee802154/ +obj-$(CONFIG_WWAN) += wwan/ obj-$(CONFIG_VMXNET3) += vmxnet3/ obj-$(CONFIG_XEN_NETDEV_FRONTEND) += xen-netfront.o diff --git a/drivers/net/wwan/Kconfig b/drivers/net/wwan/Kconfig new file mode 100644 index 000000000000..fc3f3a1c80ee --- /dev/null +++ b/drivers/net/wwan/Kconfig @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Wireless WAN device configuration +# + +menuconfig WWAN + bool "Wireless WAN" + help + This section contains Wireless WAN configuration for WWAN framework + and drivers. + +if WWAN + +config WWAN_CORE + tristate "WWAN Driver Core" + help + Say Y here if you want to use the WWAN driver core. This driver + provides a common framework for WWAN drivers. + + To compile this driver as a module, choose M here: the module will be + called wwan. 
+ +endif # WWAN diff --git a/drivers/net/wwan/Makefile b/drivers/net/wwan/Makefile new file mode 100644 index 000000000000..934590b9e47d --- /dev/null +++ b/drivers/net/wwan/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux WWAN device drivers. +# + +obj-$(CONFIG_WWAN_CORE) += wwan.o +wwan-objs += wwan_core.o diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c new file mode 100644 index 000000000000..b618b7937846 --- /dev/null +++ b/drivers/net/wwan/wwan_core.c @@ -0,0 +1,552 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021, Linaro Ltd */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define WWAN_MAX_MINORS 256 /* 256 minors allowed with register_chrdev() */ + +static DEFINE_MUTEX(wwan_register_lock); /* WWAN device create|remove lock */ +static DEFINE_IDA(minors); /* minors for WWAN port chardevs */ +static DEFINE_IDA(wwan_dev_ids); /* for unique WWAN device IDs */ +static struct class *wwan_class; +static int wwan_major; + +#define to_wwan_dev(d) container_of(d, struct wwan_device, dev) +#define to_wwan_port(d) container_of(d, struct wwan_port, dev) + +/* WWAN port flags */ +#define WWAN_PORT_TX_OFF BIT(0) + +/** + * struct wwan_device - The structure that defines a WWAN device + * + * @id: WWAN device unique ID. + * @dev: Underlying device. + * @port_id: Current available port ID to pick. 
+ */ +struct wwan_device { + unsigned int id; + struct device dev; + atomic_t port_id; +}; + +/** + * struct wwan_port - The structure that defines a WWAN port + * @type: Port type + * @start_count: Port start counter + * @flags: Store port state and capabilities + * @ops: Pointer to WWAN port operations + * @ops_lock: Protect port ops + * @dev: Underlying device + * @rxq: Buffer inbound queue + * @waitqueue: The waitqueue for port fops (read/write/poll) + */ +struct wwan_port { + enum wwan_port_type type; + unsigned int start_count; + unsigned long flags; + const struct wwan_port_ops *ops; + struct mutex ops_lock; /* Serialize ops + protect against removal */ + struct device dev; + struct sk_buff_head rxq; + wait_queue_head_t waitqueue; +}; + +static void wwan_dev_destroy(struct device *dev) +{ + struct wwan_device *wwandev = to_wwan_dev(dev); + + ida_free(&wwan_dev_ids, wwandev->id); + kfree(wwandev); +} + +static const struct device_type wwan_dev_type = { + .name = "wwan_dev", + .release = wwan_dev_destroy, +}; + +static int wwan_dev_parent_match(struct device *dev, const void *parent) +{ + return (dev->type == &wwan_dev_type && dev->parent == parent); +} + +static struct wwan_device *wwan_dev_get_by_parent(struct device *parent) +{ + struct device *dev; + + dev = class_find_device(wwan_class, NULL, parent, wwan_dev_parent_match); + if (!dev) + return ERR_PTR(-ENODEV); + + return to_wwan_dev(dev); +} + +/* This function allocates and registers a new WWAN device OR if a WWAN device + * already exist for the given parent, it gets a reference and return it. + * This function is not exported (for now), it is called indirectly via + * wwan_create_port(). 
+ */ +static struct wwan_device *wwan_create_dev(struct device *parent) +{ + struct wwan_device *wwandev; + int err, id; + + /* The 'find-alloc-register' operation must be protected against + * concurrent execution, a WWAN device is possibly shared between + * multiple callers or concurrently unregistered from wwan_remove_dev(). + */ + mutex_lock(&wwan_register_lock); + + /* If wwandev already exists, return it */ + wwandev = wwan_dev_get_by_parent(parent); + if (!IS_ERR(wwandev)) + goto done_unlock; + + id = ida_alloc(&wwan_dev_ids, GFP_KERNEL); + if (id < 0) + goto done_unlock; + + wwandev = kzalloc(sizeof(*wwandev), GFP_KERNEL); + if (!wwandev) { + ida_free(&wwan_dev_ids, id); + goto done_unlock; + } + + wwandev->dev.parent = parent; + wwandev->dev.class = wwan_class; + wwandev->dev.type = &wwan_dev_type; + wwandev->id = id; + dev_set_name(&wwandev->dev, "wwan%d", wwandev->id); + + err = device_register(&wwandev->dev); + if (err) { + put_device(&wwandev->dev); + wwandev = NULL; + } + +done_unlock: + mutex_unlock(&wwan_register_lock); + + return wwandev; +} + +static int is_wwan_child(struct device *dev, void *data) +{ + return dev->class == wwan_class; +} + +static void wwan_remove_dev(struct wwan_device *wwandev) +{ + int ret; + + /* Prevent concurrent picking from wwan_create_dev */ + mutex_lock(&wwan_register_lock); + + /* WWAN device is created and registered (get+add) along with its first + * child port, and subsequent port registrations only grab a reference + * (get). The WWAN device must then be unregistered (del+put) along with + * its latest port, and reference simply dropped (put) otherwise. 
+ */ + ret = device_for_each_child(&wwandev->dev, NULL, is_wwan_child); + if (!ret) + device_unregister(&wwandev->dev); + else + put_device(&wwandev->dev); + + mutex_unlock(&wwan_register_lock); +} + +/* ------- WWAN port management ------- */ + +static void wwan_port_destroy(struct device *dev) +{ + struct wwan_port *port = to_wwan_port(dev); + + ida_free(&minors, MINOR(port->dev.devt)); + skb_queue_purge(&port->rxq); + mutex_destroy(&port->ops_lock); + kfree(port); +} + +static const struct device_type wwan_port_dev_type = { + .name = "wwan_port", + .release = wwan_port_destroy, +}; + +static int wwan_port_minor_match(struct device *dev, const void *minor) +{ + return (dev->type == &wwan_port_dev_type && + MINOR(dev->devt) == *(unsigned int *)minor); +} + +static struct wwan_port *wwan_port_get_by_minor(unsigned int minor) +{ + struct device *dev; + + dev = class_find_device(wwan_class, NULL, &minor, wwan_port_minor_match); + if (!dev) + return ERR_PTR(-ENODEV); + + return to_wwan_port(dev); +} + +/* Keep aligned with wwan_port_type enum */ +static const char * const wwan_port_type_str[] = { + "AT", + "MBIM", + "QMI", + "QCDM", + "FIREHOSE" +}; + +struct wwan_port *wwan_create_port(struct device *parent, + enum wwan_port_type type, + const struct wwan_port_ops *ops, + void *drvdata) +{ + struct wwan_device *wwandev; + struct wwan_port *port; + int minor, err = -ENOMEM; + + if (type >= WWAN_PORT_MAX || !ops) + return ERR_PTR(-EINVAL); + + /* A port is always a child of a WWAN device, retrieve (allocate or + * pick) the WWAN device based on the provided parent device. 
+ */ + wwandev = wwan_create_dev(parent); + if (IS_ERR(wwandev)) + return ERR_CAST(wwandev); + + /* A port is exposed as character device, get a minor */ + minor = ida_alloc_range(&minors, 0, WWAN_MAX_MINORS - 1, GFP_KERNEL); + if (minor < 0) + goto error_wwandev_remove; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) { + ida_free(&minors, minor); + goto error_wwandev_remove; + } + + port->type = type; + port->ops = ops; + mutex_init(&port->ops_lock); + skb_queue_head_init(&port->rxq); + init_waitqueue_head(&port->waitqueue); + + port->dev.parent = &wwandev->dev; + port->dev.class = wwan_class; + port->dev.type = &wwan_port_dev_type; + port->dev.devt = MKDEV(wwan_major, minor); + dev_set_drvdata(&port->dev, drvdata); + + /* create unique name based on wwan device id, port index and type */ + dev_set_name(&port->dev, "wwan%up%u%s", wwandev->id, + atomic_inc_return(&wwandev->port_id), + wwan_port_type_str[port->type]); + + err = device_register(&port->dev); + if (err) + goto error_put_device; + + return port; + +error_put_device: + put_device(&port->dev); +error_wwandev_remove: + wwan_remove_dev(wwandev); + + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(wwan_create_port); + +void wwan_remove_port(struct wwan_port *port) +{ + struct wwan_device *wwandev = to_wwan_dev(port->dev.parent); + + mutex_lock(&port->ops_lock); + if (port->start_count) + port->ops->stop(port); + port->ops = NULL; /* Prevent any new port operations (e.g. 
from fops) */ + mutex_unlock(&port->ops_lock); + + wake_up_interruptible(&port->waitqueue); + + skb_queue_purge(&port->rxq); + dev_set_drvdata(&port->dev, NULL); + device_unregister(&port->dev); + + /* Release related wwan device */ + wwan_remove_dev(wwandev); +} +EXPORT_SYMBOL_GPL(wwan_remove_port); + +void wwan_port_rx(struct wwan_port *port, struct sk_buff *skb) +{ + skb_queue_tail(&port->rxq, skb); + wake_up_interruptible(&port->waitqueue); +} +EXPORT_SYMBOL_GPL(wwan_port_rx); + +void wwan_port_txon(struct wwan_port *port) +{ + clear_bit(WWAN_PORT_TX_OFF, &port->flags); + wake_up_interruptible(&port->waitqueue); +} +EXPORT_SYMBOL_GPL(wwan_port_txon); + +void wwan_port_txoff(struct wwan_port *port) +{ + set_bit(WWAN_PORT_TX_OFF, &port->flags); +} +EXPORT_SYMBOL_GPL(wwan_port_txoff); + +void *wwan_port_get_drvdata(struct wwan_port *port) +{ + return dev_get_drvdata(&port->dev); +} +EXPORT_SYMBOL_GPL(wwan_port_get_drvdata); + +static int wwan_port_op_start(struct wwan_port *port) +{ + int ret = 0; + + mutex_lock(&port->ops_lock); + if (!port->ops) { /* Port got unplugged */ + ret = -ENODEV; + goto out_unlock; + } + + /* If port is already started, don't start again */ + if (!port->start_count) + ret = port->ops->start(port); + + if (!ret) + port->start_count++; + +out_unlock: + mutex_unlock(&port->ops_lock); + + return ret; +} + +static void wwan_port_op_stop(struct wwan_port *port) +{ + mutex_lock(&port->ops_lock); + port->start_count--; + if (port->ops && !port->start_count) + port->ops->stop(port); + mutex_unlock(&port->ops_lock); +} + +static int wwan_port_op_tx(struct wwan_port *port, struct sk_buff *skb) +{ + int ret; + + mutex_lock(&port->ops_lock); + if (!port->ops) { /* Port got unplugged */ + ret = -ENODEV; + goto out_unlock; + } + + ret = port->ops->tx(port, skb); + +out_unlock: + mutex_unlock(&port->ops_lock); + + return ret; +} + +static bool is_read_blocked(struct wwan_port *port) +{ + return skb_queue_empty(&port->rxq) && port->ops; +} + +static 
bool is_write_blocked(struct wwan_port *port) +{ + return test_bit(WWAN_PORT_TX_OFF, &port->flags) && port->ops; +} + +static int wwan_wait_rx(struct wwan_port *port, bool nonblock) +{ + if (!is_read_blocked(port)) + return 0; + + if (nonblock) + return -EAGAIN; + + if (wait_event_interruptible(port->waitqueue, !is_read_blocked(port))) + return -ERESTARTSYS; + + return 0; +} + +static int wwan_wait_tx(struct wwan_port *port, bool nonblock) +{ + if (!is_write_blocked(port)) + return 0; + + if (nonblock) + return -EAGAIN; + + if (wait_event_interruptible(port->waitqueue, !is_write_blocked(port))) + return -ERESTARTSYS; + + return 0; +} + +static int wwan_port_fops_open(struct inode *inode, struct file *file) +{ + struct wwan_port *port; + int err = 0; + + port = wwan_port_get_by_minor(iminor(inode)); + if (IS_ERR(port)) + return PTR_ERR(port); + + file->private_data = port; + stream_open(inode, file); + + err = wwan_port_op_start(port); + if (err) + put_device(&port->dev); + + return err; +} + +static int wwan_port_fops_release(struct inode *inode, struct file *filp) +{ + struct wwan_port *port = filp->private_data; + + wwan_port_op_stop(port); + put_device(&port->dev); + + return 0; +} + +static ssize_t wwan_port_fops_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct wwan_port *port = filp->private_data; + struct sk_buff *skb; + size_t copied; + int ret; + + ret = wwan_wait_rx(port, !!(filp->f_flags & O_NONBLOCK)); + if (ret) + return ret; + + skb = skb_dequeue(&port->rxq); + if (!skb) + return -EIO; + + copied = min_t(size_t, count, skb->len); + if (copy_to_user(buf, skb->data, copied)) { + kfree_skb(skb); + return -EFAULT; + } + skb_pull(skb, copied); + + /* skb is not fully consumed, keep it in the queue */ + if (skb->len) + skb_queue_head(&port->rxq, skb); + else + consume_skb(skb); + + return copied; +} + +static ssize_t wwan_port_fops_write(struct file *filp, const char __user *buf, + size_t count, loff_t *offp) +{ + struct 
wwan_port *port = filp->private_data; + struct sk_buff *skb; + int ret; + + ret = wwan_wait_tx(port, !!(filp->f_flags & O_NONBLOCK)); + if (ret) + return ret; + + skb = alloc_skb(count, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + if (copy_from_user(skb_put(skb, count), buf, count)) { + kfree_skb(skb); + return -EFAULT; + } + + ret = wwan_port_op_tx(port, skb); + if (ret) { + kfree_skb(skb); + return ret; + } + + return count; +} + +static __poll_t wwan_port_fops_poll(struct file *filp, poll_table *wait) +{ + struct wwan_port *port = filp->private_data; + __poll_t mask = 0; + + poll_wait(filp, &port->waitqueue, wait); + + if (!is_write_blocked(port)) + mask |= EPOLLOUT | EPOLLWRNORM; + if (!is_read_blocked(port)) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static const struct file_operations wwan_port_fops = { + .owner = THIS_MODULE, + .open = wwan_port_fops_open, + .release = wwan_port_fops_release, + .read = wwan_port_fops_read, + .write = wwan_port_fops_write, + .poll = wwan_port_fops_poll, + .llseek = noop_llseek, +}; + +static int __init wwan_init(void) +{ + wwan_class = class_create(THIS_MODULE, "wwan"); + if (IS_ERR(wwan_class)) + return PTR_ERR(wwan_class); + + /* chrdev used for wwan ports */ + wwan_major = register_chrdev(0, "wwan_port", &wwan_port_fops); + if (wwan_major < 0) { + class_destroy(wwan_class); + return wwan_major; + } + + return 0; +} + +static void __exit wwan_exit(void) +{ + unregister_chrdev(wwan_major, "wwan_port"); + class_destroy(wwan_class); +} + +module_init(wwan_init); +module_exit(wwan_exit); + +MODULE_AUTHOR("Loic Poulain "); +MODULE_DESCRIPTION("WWAN core"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/wwan.h b/include/linux/wwan.h new file mode 100644 index 000000000000..aa05a253dcf9 --- /dev/null +++ b/include/linux/wwan.h @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2021, Linaro Ltd */ + +#ifndef __WWAN_H +#define __WWAN_H + +#include +#include +#include + +/** + * 
enum wwan_port_type - WWAN port types + * @WWAN_PORT_AT: AT commands + * @WWAN_PORT_MBIM: Mobile Broadband Interface Model control + * @WWAN_PORT_QMI: Qcom modem/MSM interface for modem control + * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface + * @WWAN_PORT_FIREHOSE: XML based command protocol + * @WWAN_PORT_MAX: Number of supported port types + */ +enum wwan_port_type { + WWAN_PORT_AT, + WWAN_PORT_MBIM, + WWAN_PORT_QMI, + WWAN_PORT_QCDM, + WWAN_PORT_FIREHOSE, + WWAN_PORT_MAX, +}; + +struct wwan_port; + +/** struct wwan_port_ops - The WWAN port operations + * @start: The routine for starting the WWAN port device. + * @stop: The routine for stopping the WWAN port device. + * @tx: The routine that sends WWAN port protocol data to the device. + * + * The wwan_port_ops structure contains a list of low-level operations + * that control a WWAN port device. All functions are mandatory. + */ +struct wwan_port_ops { + int (*start)(struct wwan_port *port); + void (*stop)(struct wwan_port *port); + int (*tx)(struct wwan_port *port, struct sk_buff *skb); +}; + +/** + * wwan_create_port - Add a new WWAN port + * @parent: Device to use as parent and shared by all WWAN ports + * @type: WWAN port type + * @ops: WWAN port operations + * @drvdata: Pointer to caller driver data + * + * Allocate and register a new WWAN port. The port will be automatically exposed + * to user as a character device and attached to the right virtual WWAN device, + * based on the parent pointer. The parent pointer is the device shared by all + * components of a same WWAN modem (e.g. USB dev, PCI dev, MHI controller...). + * + * drvdata will be placed in the WWAN port device driver data and can be + * retrieved with wwan_port_get_drvdata(). + * + * This function must be balanced with a call to wwan_remove_port(). 
+ * + * Returns a valid pointer to wwan_port on success or ERR_PTR on failure + */ +struct wwan_port *wwan_create_port(struct device *parent, + enum wwan_port_type type, + const struct wwan_port_ops *ops, + void *drvdata); + +/** + * wwan_remove_port - Remove a WWAN port + * @port: WWAN port to remove + * + * Remove a previously created port. + */ +void wwan_remove_port(struct wwan_port *port); + +/** + * wwan_port_rx - Receive data from the WWAN port + * @port: WWAN port for which data is received + * @skb: Pointer to the rx buffer + * + * A port driver calls this function upon data reception (MBIM, AT...). + */ +void wwan_port_rx(struct wwan_port *port, struct sk_buff *skb); + +/** + * wwan_port_txoff - Stop TX on WWAN port + * @port: WWAN port for which TX must be stopped + * + * Used for TX flow control, a port driver calls this function to indicate TX + * is temporarily unavailable (e.g. due to ring buffer fullness). + */ +void wwan_port_txoff(struct wwan_port *port); + + +/** + * wwan_port_txon - Restart TX on WWAN port + * @port: WWAN port for which TX must be restarted + * + * Used for TX flow control, a port driver calls this function to indicate TX + * is available again. + */ +void wwan_port_txon(struct wwan_port *port); + +/** + * wwan_port_get_drvdata - Retrieve driver data from a WWAN port + * @port: Related WWAN port + */ +void *wwan_port_get_drvdata(struct wwan_port *port); + +#endif /* __WWAN_H */ -- cgit v1.2.3 From f09ea6fb12723d6726293d68de00b6307368bd76 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Apr 2021 12:27:39 -0700 Subject: ethtool: add a new command for reading standard stats Add an interface for reading standard stats, including stats which don't have a corresponding control interface. Start with IEEE 802.3 PHY stats. There seems to be only one stat to expose there. Define API to not require user space changes when new stats or groups are added. Groups are based on bitset, stats have a string set associated. 
v1: wrap stats in a nest Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ethtool.h | 10 ++ include/uapi/linux/ethtool.h | 4 + include/uapi/linux/ethtool_netlink.h | 47 ++++++++ net/ethtool/Makefile | 2 +- net/ethtool/netlink.c | 10 ++ net/ethtool/netlink.h | 5 + net/ethtool/stats.c | 200 +++++++++++++++++++++++++++++++++++ net/ethtool/strset.c | 10 ++ 8 files changed, 287 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/stats.c (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 112a85b57f1f..2d5455eedbf4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -250,6 +250,13 @@ static inline void ethtool_stats_init(u64 *stats, unsigned int n) stats[n] = ETHTOOL_STAT_NOT_SET; } +/* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed + * via a more targeted API. + */ +struct ethtool_eth_phy_stats { + u64 SymbolErrorDuringCarrier; +}; + /** * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames * @tx_pause_frames: transmitted pause frame count. Reported to user space @@ -487,6 +494,7 @@ struct ethtool_module_eeprom { * @get_module_eeprom_by_page: Get a region of plug-in module EEPROM data from * specified page. Returns a negative error code or the amount of bytes * read. + * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. 
Callers must @@ -597,6 +605,8 @@ struct ethtool_ops { int (*get_module_eeprom_by_page)(struct net_device *dev, const struct ethtool_module_eeprom *page, struct netlink_ext_ack *extack); + void (*get_eth_phy_stats)(struct net_device *dev, + struct ethtool_eth_phy_stats *phy_stats); }; int ethtool_check_ops(const struct ethtool_ops *ops); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f91e079e3108..190ae6e03918 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -669,6 +669,8 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_TS_TX_TYPES: timestamping Tx types * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types + * @ETH_SS_STATS_STD: standardized stats + * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -689,6 +691,8 @@ enum ethtool_stringset { ETH_SS_TS_TX_TYPES, ETH_SS_TS_RX_FILTERS, ETH_SS_UDP_TUNNEL_TYPES, + ETH_SS_STATS_STD, + ETH_SS_STATS_ETH_PHY, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 3a2b31ccbc5b..a54cfe625f34 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -45,6 +45,7 @@ enum { ETHTOOL_MSG_FEC_GET, ETHTOOL_MSG_FEC_SET, ETHTOOL_MSG_MODULE_EEPROM_GET, + ETHTOOL_MSG_STATS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -86,6 +87,7 @@ enum { ETHTOOL_MSG_FEC_GET_REPLY, ETHTOOL_MSG_FEC_NTF, ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, + ETHTOOL_MSG_STATS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -679,6 +681,51 @@ enum { ETHTOOL_A_MODULE_EEPROM_MAX = (__ETHTOOL_A_MODULE_EEPROM_CNT - 1) }; +/* STATS */ + +enum { + ETHTOOL_A_STATS_UNSPEC, + ETHTOOL_A_STATS_PAD, + ETHTOOL_A_STATS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_STATS_GROUPS, /* bitset */ + + ETHTOOL_A_STATS_GRP, /* nest - _A_STATS_GRP_* */ + 
+ /* add new constants above here */ + __ETHTOOL_A_STATS_CNT, + ETHTOOL_A_STATS_MAX = (__ETHTOOL_A_STATS_CNT - 1) +}; + +enum { + ETHTOOL_STATS_ETH_PHY, + + /* add new constants above here */ + __ETHTOOL_STATS_CNT +}; + +enum { + ETHTOOL_A_STATS_GRP_UNSPEC, + ETHTOOL_A_STATS_GRP_PAD, + + ETHTOOL_A_STATS_GRP_ID, /* u32 */ + ETHTOOL_A_STATS_GRP_SS_ID, /* u32 */ + + ETHTOOL_A_STATS_GRP_STAT, /* nest */ + + /* add new constants above here */ + __ETHTOOL_A_STATS_GRP_CNT, + ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_CNT - 1) +}; + +enum { + /* 30.3.2.1.5 aSymbolErrorDuringCarrier */ + ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, + + /* add new constants above here */ + __ETHTOOL_A_STATS_ETH_PHY_CNT, + ETHTOOL_A_STATS_ETH_PHY_MAX = (__ETHTOOL_A_STATS_ETH_PHY_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 83842685fd8c..723c9a8a8cdf 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o + tunnels.o fec.o eeprom.o stats.o diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 5f5d7c4b3d4a..290012d0d11d 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -247,6 +247,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_FEC_GET] = &ethnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = &ethnl_module_eeprom_request_ops, + [ETHTOOL_MSG_STATS_GET] = &ethnl_stats_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -942,6 +943,15 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_module_eeprom_get_policy, .maxattr = 
ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_STATS_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_stats_get_policy, + .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 4305ac971bb0..9d88983b6597 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -346,6 +346,7 @@ extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; +extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -380,6 +381,7 @@ extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INF extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_DATA + 1]; +extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -399,4 +401,7 @@ int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); +extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; +extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; + 
#endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c new file mode 100644 index 000000000000..fd8f47178c06 --- /dev/null +++ b/net/ethtool/stats.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct stats_req_info { + struct ethnl_req_info base; + DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT); +}; + +#define STATS_REQINFO(__req_base) \ + container_of(__req_base, struct stats_req_info, base) + +struct stats_reply_data { + struct ethnl_reply_data base; + struct ethtool_eth_phy_stats phy_stats; +}; + +#define STATS_REPDATA(__reply_base) \ + container_of(__reply_base, struct stats_reply_data, base) + +const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_STATS_ETH_PHY] = "eth-phy", +}; + +const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR] = "SymbolErrorDuringCarrier", +}; + +const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { + [ETHTOOL_A_STATS_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_STATS_GROUPS] = { .type = NLA_NESTED }, +}; + +static int stats_parse_request(struct ethnl_req_info *req_base, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct stats_req_info *req_info = STATS_REQINFO(req_base); + bool mod = false; + int err; + + err = ethnl_update_bitset(req_info->stat_mask, __ETHTOOL_STATS_CNT, + tb[ETHTOOL_A_STATS_GROUPS], stats_std_names, + extack, &mod); + if (err) + return err; + + if (!mod) { + NL_SET_ERR_MSG(extack, "no stats requested"); + return -EINVAL; + } + + return 0; +} + +static int stats_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + const struct stats_req_info *req_info = STATS_REQINFO(req_base); + struct stats_reply_data *data = STATS_REPDATA(reply_base); + struct net_device *dev = 
reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + memset(&data->phy_stats, 0xff, sizeof(data->phy_stats)); + + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && + dev->ethtool_ops->get_eth_phy_stats) + dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); + + ethnl_ops_complete(dev); + return 0; +} + +static int stats_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct stats_req_info *req_info = STATS_REQINFO(req_base); + unsigned int n_grps = 0, n_stats = 0; + int len = 0; + + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64); + n_grps++; + } + + len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ + nla_total_size(4) + /* _A_STATS_GRP_ID */ + nla_total_size(4)); /* _A_STATS_GRP_SS_ID */ + len += n_stats * (nla_total_size(0) + /* _A_STATS_GRP_STAT */ + nla_total_size_64bit(sizeof(u64))); + + return len; +} + +static int stat_put(struct sk_buff *skb, u16 attrtype, u64 val) +{ + struct nlattr *nest; + int ret; + + if (val == ETHTOOL_STAT_NOT_SET) + return 0; + + /* We want to start stats attr types from 0, so we don't have a type + * for pad inside ETHTOOL_A_STATS_GRP_STAT. Pad things on the outside + * of ETHTOOL_A_STATS_GRP_STAT. Since we're one nest away from the + * actual attr we're 4B off - nla_need_padding_for_64bit() & co. + * can't be used. 
+ */ +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + if (!IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) + if (!nla_reserve(skb, ETHTOOL_A_STATS_GRP_PAD, 0)) + return -EMSGSIZE; +#endif + + nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP_STAT); + if (!nest) + return -EMSGSIZE; + + ret = nla_put_u64_64bit(skb, attrtype, val, -1 /* not used */); + if (ret) { + nla_nest_cancel(skb, nest); + return ret; + } + + nla_nest_end(skb, nest); + return 0; +} + +static int stats_put_phy_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR, + data->phy_stats.SymbolErrorDuringCarrier)) + return -EMSGSIZE; + return 0; +} + +static int stats_put_stats(struct sk_buff *skb, + const struct stats_reply_data *data, + u32 id, u32 ss_id, + int (*cb)(struct sk_buff *skb, + const struct stats_reply_data *data)) +{ + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_ID, id) || + nla_put_u32(skb, ETHTOOL_A_STATS_GRP_SS_ID, ss_id)) + goto err_cancel; + + if (cb(skb, data)) + goto err_cancel; + + nla_nest_end(skb, nest); + return 0; + +err_cancel: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int stats_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct stats_req_info *req_info = STATS_REQINFO(req_base); + const struct stats_reply_data *data = STATS_REPDATA(reply_base); + int ret = 0; + + if (!ret && test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY, + ETH_SS_STATS_ETH_PHY, + stats_put_phy_stats); + + return ret; +} + +const struct ethnl_request_ops ethnl_stats_request_ops = { + .request_cmd = ETHTOOL_MSG_STATS_GET, + .reply_cmd = ETHTOOL_MSG_STATS_GET_REPLY, + .hdr_attr = ETHTOOL_A_STATS_HEADER, + .req_info_size = sizeof(struct stats_req_info), + 
.reply_data_size = sizeof(struct stats_reply_data), + + .parse_request = stats_parse_request, + .prepare_data = stats_prepare_data, + .reply_size = stats_reply_size, + .fill_reply = stats_fill_reply, +}; diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index c3a5489964cd..5f3c73587ff4 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -80,6 +80,16 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, .strings = udp_tunnel_type_names, }, + [ETH_SS_STATS_STD] = { + .per_dev = false, + .count = __ETHTOOL_STATS_CNT, + .strings = stats_std_names, + }, + [ETH_SS_STATS_ETH_PHY] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_ETH_PHY_CNT, + .strings = stats_eth_phy_names, + }, }; struct strset_req_info { -- cgit v1.2.3 From ca2244547ec7505d1cf61d43f5e76e3ffd99cf77 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Apr 2021 12:27:40 -0700 Subject: ethtool: add interface to read standard MAC stats Most of the MAC statistics are included in struct rtnl_link_stats64, but some fields are aggregated. Besides it's good to expose these clearly hardware stats separately. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ethtool.h | 31 +++++++++++++ include/uapi/linux/ethtool.h | 2 + include/uapi/linux/ethtool_netlink.h | 53 +++++++++++++++++++++ net/ethtool/netlink.h | 1 + net/ethtool/stats.c | 90 ++++++++++++++++++++++++++++++++++++ net/ethtool/strset.c | 5 ++ 6 files changed, 182 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 2d5455eedbf4..3c689a13e679 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -250,6 +250,34 @@ static inline void ethtool_stats_init(u64 *stats, unsigned int n) stats[n] = ETHTOOL_STAT_NOT_SET; } +/* Basic IEEE 802.3 MAC statistics (30.3.1.1.*), not otherwise exposed + * via a more targeted API. 
+ */ +struct ethtool_eth_mac_stats { + u64 FramesTransmittedOK; + u64 SingleCollisionFrames; + u64 MultipleCollisionFrames; + u64 FramesReceivedOK; + u64 FrameCheckSequenceErrors; + u64 AlignmentErrors; + u64 OctetsTransmittedOK; + u64 FramesWithDeferredXmissions; + u64 LateCollisions; + u64 FramesAbortedDueToXSColls; + u64 FramesLostDueToIntMACXmitError; + u64 CarrierSenseErrors; + u64 OctetsReceivedOK; + u64 FramesLostDueToIntMACRcvError; + u64 MulticastFramesXmittedOK; + u64 BroadcastFramesXmittedOK; + u64 FramesWithExcessiveDeferral; + u64 MulticastFramesReceivedOK; + u64 BroadcastFramesReceivedOK; + u64 InRangeLengthErrors; + u64 OutOfRangeLengthField; + u64 FrameTooLongErrors; +}; + /* Basic IEEE 802.3 PHY statistics (30.3.2.1.*), not otherwise exposed * via a more targeted API. */ @@ -495,6 +523,7 @@ struct ethtool_module_eeprom { * specified page. Returns a negative error code or the amount of bytes * read. * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. + * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. 
Callers must @@ -607,6 +636,8 @@ struct ethtool_ops { struct netlink_ext_ack *extack); void (*get_eth_phy_stats)(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats); + void (*get_eth_mac_stats)(struct net_device *dev, + struct ethtool_eth_mac_stats *mac_stats); }; int ethtool_check_ops(const struct ethtool_ops *ops); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 190ae6e03918..c227376d811a 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -671,6 +671,7 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types * @ETH_SS_STATS_STD: standardized stats * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics + * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -693,6 +694,7 @@ enum ethtool_stringset { ETH_SS_UDP_TUNNEL_TYPES, ETH_SS_STATS_STD, ETH_SS_STATS_ETH_PHY, + ETH_SS_STATS_ETH_MAC, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index a54cfe625f34..f0fbe8f4eb1b 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -698,6 +698,7 @@ enum { enum { ETHTOOL_STATS_ETH_PHY, + ETHTOOL_STATS_ETH_MAC, /* add new constants above here */ __ETHTOOL_STATS_CNT @@ -726,6 +727,58 @@ enum { ETHTOOL_A_STATS_ETH_PHY_MAX = (__ETHTOOL_A_STATS_ETH_PHY_CNT - 1) }; +enum { + /* 30.3.1.1.2 aFramesTransmittedOK */ + ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT, + /* 30.3.1.1.3 aSingleCollisionFrames */ + ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL, + /* 30.3.1.1.4 aMultipleCollisionFrames */ + ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL, + /* 30.3.1.1.5 aFramesReceivedOK */ + ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT, + /* 30.3.1.1.6 aFrameCheckSequenceErrors */ + ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR, + /* 30.3.1.1.7 aAlignmentErrors */ + ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR, + /* 30.3.1.1.8 aOctetsTransmittedOK */ + 
ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES, + /* 30.3.1.1.9 aFramesWithDeferredXmissions */ + ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER, + /* 30.3.1.1.10 aLateCollisions */ + ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL, + /* 30.3.1.1.11 aFramesAbortedDueToXSColls */ + ETHTOOL_A_STATS_ETH_MAC_11_XS_COL, + /* 30.3.1.1.12 aFramesLostDueToIntMACXmitError */ + ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR, + /* 30.3.1.1.13 aCarrierSenseErrors */ + ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR, + /* 30.3.1.1.14 aOctetsReceivedOK */ + ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES, + /* 30.3.1.1.15 aFramesLostDueToIntMACRcvError */ + ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR, + + /* 30.3.1.1.18 aMulticastFramesXmittedOK */ + ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST, + /* 30.3.1.1.19 aBroadcastFramesXmittedOK */ + ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST, + /* 30.3.1.1.20 aFramesWithExcessiveDeferral */ + ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER, + /* 30.3.1.1.21 aMulticastFramesReceivedOK */ + ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST, + /* 30.3.1.1.22 aBroadcastFramesReceivedOK */ + ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST, + /* 30.3.1.1.23 aInRangeLengthErrors */ + ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR, + /* 30.3.1.1.24 aOutOfRangeLengthField */ + ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN, + /* 30.3.1.1.25 aFrameTooLongErrors */ + ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR, + + /* add new constants above here */ + __ETHTOOL_A_STATS_ETH_MAC_CNT, + ETHTOOL_A_STATS_ETH_MAC_MAX = (__ETHTOOL_A_STATS_ETH_MAC_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 9d88983b6597..c70bac5329af 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -403,5 +403,6 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; +extern const char 
stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index fd8f47178c06..e80175872226 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -15,6 +15,7 @@ struct stats_req_info { struct stats_reply_data { struct ethnl_reply_data base; struct ethtool_eth_phy_stats phy_stats; + struct ethtool_eth_mac_stats mac_stats; }; #define STATS_REPDATA(__reply_base) \ @@ -22,12 +23,38 @@ struct stats_reply_data { const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_PHY] = "eth-phy", + [ETHTOOL_STATS_ETH_MAC] = "eth-mac", }; const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR] = "SymbolErrorDuringCarrier", }; +const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT] = "FramesTransmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL] = "SingleCollisionFrames", + [ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL] = "MultipleCollisionFrames", + [ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT] = "FramesReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR] = "FrameCheckSequenceErrors", + [ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR] = "AlignmentErrors", + [ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES] = "OctetsTransmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER] = "FramesWithDeferredXmissions", + [ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL] = "LateCollisions", + [ETHTOOL_A_STATS_ETH_MAC_11_XS_COL] = "FramesAbortedDueToXSColls", + [ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR] = "FramesLostDueToIntMACXmitError", + [ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR] = "CarrierSenseErrors", + [ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES] = "OctetsReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR] = "FramesLostDueToIntMACRcvError", + [ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST] = "MulticastFramesXmittedOK", + [ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST] = "BroadcastFramesXmittedOK", + 
[ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER] = "FramesWithExcessiveDeferral", + [ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST] = "MulticastFramesReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST] = "BroadcastFramesReceivedOK", + [ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR] = "InRangeLengthErrors", + [ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN] = "OutOfRangeLengthField", + [ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR] = "FrameTooLongErrors", +}; + const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -70,10 +97,14 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, return ret; memset(&data->phy_stats, 0xff, sizeof(data->phy_stats)); + memset(&data->mac_stats, 0xff, sizeof(data->mac_stats)); if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); + if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) && + dev->ethtool_ops->get_eth_mac_stats) + dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats); ethnl_ops_complete(dev); return 0; @@ -90,6 +121,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base, n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64); n_grps++; } + if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64); + n_grps++; + } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ @@ -143,6 +178,57 @@ static int stats_put_phy_stats(struct sk_buff *skb, return 0; } +static int stats_put_mac_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT, + data->mac_stats.FramesTransmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL, + data->mac_stats.SingleCollisionFrames) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL, + 
data->mac_stats.MultipleCollisionFrames) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT, + data->mac_stats.FramesReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR, + data->mac_stats.FrameCheckSequenceErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR, + data->mac_stats.AlignmentErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES, + data->mac_stats.OctetsTransmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER, + data->mac_stats.FramesWithDeferredXmissions) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL, + data->mac_stats.LateCollisions) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_11_XS_COL, + data->mac_stats.FramesAbortedDueToXSColls) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR, + data->mac_stats.FramesLostDueToIntMACXmitError) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR, + data->mac_stats.CarrierSenseErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES, + data->mac_stats.OctetsReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR, + data->mac_stats.FramesLostDueToIntMACRcvError) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST, + data->mac_stats.MulticastFramesXmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST, + data->mac_stats.BroadcastFramesXmittedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER, + data->mac_stats.FramesWithExcessiveDeferral) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST, + data->mac_stats.MulticastFramesReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST, + data->mac_stats.BroadcastFramesReceivedOK) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR, + data->mac_stats.InRangeLengthErrors) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN, + data->mac_stats.OutOfRangeLengthField) || + stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR, + data->mac_stats.FrameTooLongErrors)) + return -EMSGSIZE; + return 0; +} + static int stats_put_stats(struct sk_buff *skb, const struct 
stats_reply_data *data, u32 id, u32 ss_id, @@ -182,6 +268,10 @@ static int stats_fill_reply(struct sk_buff *skb, ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY, ETH_SS_STATS_ETH_PHY, stats_put_phy_stats); + if (!ret && test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC, + ETH_SS_STATS_ETH_MAC, + stats_put_mac_stats); return ret; } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 5f3c73587ff4..a8aac7bcfcc9 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -90,6 +90,11 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_A_STATS_ETH_PHY_CNT, .strings = stats_eth_phy_names, }, + [ETH_SS_STATS_ETH_MAC] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_ETH_MAC_CNT, + .strings = stats_eth_mac_names, + }, }; struct strset_req_info { -- cgit v1.2.3 From bfad2b979ddcc330c08bb071eb3c3f7b3411a681 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Apr 2021 12:27:41 -0700 Subject: ethtool: add interface to read standard MAC Ctrl stats A number of devices maintain the standard-based MAC control counters for control frames. Add an API for those. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ethtool.h | 12 ++++++++++++ include/uapi/linux/ethtool.h | 2 ++ include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++ net/ethtool/netlink.h | 1 + net/ethtool/stats.c | 33 +++++++++++++++++++++++++++++++++ net/ethtool/strset.c | 5 +++++ 6 files changed, 67 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 3c689a13e679..1ca6b836f9fe 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -285,6 +285,15 @@ struct ethtool_eth_phy_stats { u64 SymbolErrorDuringCarrier; }; +/* Basic IEEE 802.3 MAC Ctrl statistics (30.3.3.*), not otherwise exposed + * via a more targeted API.
+ */ +struct ethtool_eth_ctrl_stats { + u64 MACControlFramesTransmitted; + u64 MACControlFramesReceived; + u64 UnsupportedOpcodesReceived; +}; + /** * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames * @tx_pause_frames: transmitted pause frame count. Reported to user space @@ -524,6 +533,7 @@ struct ethtool_module_eeprom { * read. * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics. + * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -638,6 +648,8 @@ struct ethtool_ops { struct ethtool_eth_phy_stats *phy_stats); void (*get_eth_mac_stats)(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats); + void (*get_eth_ctrl_stats)(struct net_device *dev, + struct ethtool_eth_ctrl_stats *ctrl_stats); }; int ethtool_check_ops(const struct ethtool_ops *ops); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index c227376d811a..9cb8df89d4f2 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -672,6 +672,7 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_STATS_STD: standardized stats * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics + * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -695,6 +696,7 @@ enum ethtool_stringset { ETH_SS_STATS_STD, ETH_SS_STATS_ETH_PHY, ETH_SS_STATS_ETH_MAC, + ETH_SS_STATS_ETH_CTRL, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index f0fbe8f4eb1b..2ea5f049df6a 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -699,6 +699,7 @@ enum { enum { 
ETHTOOL_STATS_ETH_PHY, ETHTOOL_STATS_ETH_MAC, + ETHTOOL_STATS_ETH_CTRL, /* add new constants above here */ __ETHTOOL_STATS_CNT @@ -779,6 +780,19 @@ enum { ETHTOOL_A_STATS_ETH_MAC_MAX = (__ETHTOOL_A_STATS_ETH_MAC_CNT - 1) }; +enum { + /* 30.3.3.3 aMACControlFramesTransmitted */ + ETHTOOL_A_STATS_ETH_CTRL_3_TX, + /* 30.3.3.4 aMACControlFramesReceived */ + ETHTOOL_A_STATS_ETH_CTRL_4_RX, + /* 30.3.3.5 aUnsupportedOpcodesReceived */ + ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP, + + /* add new constants above here */ + __ETHTOOL_A_STATS_ETH_CTRL_CNT, + ETHTOOL_A_STATS_ETH_CTRL_MAX = (__ETHTOOL_A_STATS_ETH_CTRL_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index c70bac5329af..febfa61e52e2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -404,5 +404,6 @@ int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; +extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index e80175872226..f4fded66731c 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -16,6 +16,7 @@ struct stats_reply_data { struct ethnl_reply_data base; struct ethtool_eth_phy_stats phy_stats; struct ethtool_eth_mac_stats mac_stats; + struct ethtool_eth_ctrl_stats ctrl_stats; }; #define STATS_REPDATA(__reply_base) \ @@ -24,6 +25,7 @@ struct stats_reply_data { const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_PHY] = "eth-phy", [ETHTOOL_STATS_ETH_MAC] = "eth-mac", + [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl", }; const char 
stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { @@ -55,6 +57,12 @@ const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = [ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR] = "FrameTooLongErrors", }; +const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_ETH_CTRL_3_TX] = "MACControlFramesTransmitted", + [ETHTOOL_A_STATS_ETH_CTRL_4_RX] = "MACControlFramesReceived", + [ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP] = "UnsupportedOpcodesReceived", +}; + const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -98,6 +106,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, memset(&data->phy_stats, 0xff, sizeof(data->phy_stats)); memset(&data->mac_stats, 0xff, sizeof(data->mac_stats)); + memset(&data->ctrl_stats, 0xff, sizeof(data->mac_stats)); if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) @@ -105,6 +114,9 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) && dev->ethtool_ops->get_eth_mac_stats) dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats); + if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) && + dev->ethtool_ops->get_eth_ctrl_stats) + dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats); ethnl_ops_complete(dev); return 0; @@ -125,6 +137,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base, n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64); n_grps++; } + if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64); + n_grps++; + } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ @@ -229,6 +245,19 @@ static int stats_put_mac_stats(struct sk_buff *skb, return 0; } +static int 
stats_put_ctrl_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_3_TX, + data->ctrl_stats.MACControlFramesTransmitted) || + stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_4_RX, + data->ctrl_stats.MACControlFramesReceived) || + stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP, + data->ctrl_stats.UnsupportedOpcodesReceived)) + return -EMSGSIZE; + return 0; +} + static int stats_put_stats(struct sk_buff *skb, const struct stats_reply_data *data, u32 id, u32 ss_id, @@ -272,6 +301,10 @@ static int stats_fill_reply(struct sk_buff *skb, ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC, ETH_SS_STATS_ETH_MAC, stats_put_mac_stats); + if (!ret && test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL, + ETH_SS_STATS_ETH_CTRL, + stats_put_ctrl_stats); return ret; } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index a8aac7bcfcc9..a33c603a7a02 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -95,6 +95,11 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_A_STATS_ETH_MAC_CNT, .strings = stats_eth_mac_names, }, + [ETH_SS_STATS_ETH_CTRL] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_ETH_CTRL_CNT, + .strings = stats_eth_ctrl_names, + }, }; struct strset_req_info { -- cgit v1.2.3 From a8b06e9d40d8b18c41c8ce060e8dc004fa59e708 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 16 Apr 2021 12:27:42 -0700 Subject: ethtool: add interface to read RMON stats Most devices maintain RMON (RFC 2819) stats - particularly the "histogram" of packets received by size. Unlike other RFCs which duplicate IEEE stats, the short/oversized frame counters in RMON don't seem to match IEEE stats 1-to-1 either, so expose those, too. Do not expose basic packet, CRC errors etc - those are already otherwise covered. 
Because standard defines packet ranges only up to 1518, and everything above that should theoretically be "oversized" - devices often create their own ranges. Going beyond what the RFC defines - expose the "histogram" in the Tx direction (assume for now that the ranges will be the same). Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ethtool.h | 43 ++++++++++++++++++ include/uapi/linux/ethtool.h | 2 + include/uapi/linux/ethtool_netlink.h | 23 ++++++++++ net/ethtool/netlink.h | 1 + net/ethtool/stats.c | 87 ++++++++++++++++++++++++++++++++++++ net/ethtool/strset.c | 5 +++ 6 files changed, 161 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 1ca6b836f9fe..e030f7510cd3 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -346,6 +346,44 @@ struct ethtool_fec_stats { } corrected_blocks, uncorrectable_blocks, corrected_bits; }; +/** + * struct ethtool_rmon_hist_range - byte range for histogram statistics + * @low: low bound of the bucket (inclusive) + * @high: high bound of the bucket (inclusive) + */ +struct ethtool_rmon_hist_range { + u16 low; + u16 high; +}; + +#define ETHTOOL_RMON_HIST_MAX 10 + +/** + * struct ethtool_rmon_stats - selected RMON (RFC 2819) statistics + * @undersize_pkts: Equivalent to `etherStatsUndersizePkts` from the RFC. + * @oversize_pkts: Equivalent to `etherStatsOversizePkts` from the RFC. + * @fragments: Equivalent to `etherStatsFragments` from the RFC. + * @jabbers: Equivalent to `etherStatsJabbers` from the RFC. + * @hist: Packet counter for packet length buckets (e.g. + * `etherStatsPkts128to255Octets` from the RFC). + * @hist_tx: Tx counters in similar form to @hist, not defined in the RFC. + * + * Selection of RMON (RFC 2819) statistics which are not exposed via different + * APIs, primarily the packet-length-based counters. 
+ * Unfortunately different designs choose different buckets beyond + * the 1024B mark (jumbo frame territory), so the definition of the bucket + * ranges is left to the driver. + */ +struct ethtool_rmon_stats { + u64 undersize_pkts; + u64 oversize_pkts; + u64 fragments; + u64 jabbers; + + u64 hist[ETHTOOL_RMON_HIST_MAX]; + u64 hist_tx[ETHTOOL_RMON_HIST_MAX]; +}; + #define ETH_MODULE_EEPROM_PAGE_LEN 128 #define ETH_MODULE_MAX_I2C_ADDRESS 0x7f @@ -534,6 +572,8 @@ struct ethtool_module_eeprom { * @get_eth_phy_stats: Query some of the IEEE 802.3 PHY statistics. * @get_eth_mac_stats: Query some of the IEEE 802.3 MAC statistics. * @get_eth_ctrl_stats: Query some of the IEEE 802.3 MAC Ctrl statistics. + * @get_rmon_stats: Query some of the RMON (RFC 2819) statistics. + * Set %ranges to a pointer to zero-terminated array of byte ranges. * * All operations are optional (i.e. the function pointer may be set * to %NULL) and callers must take this into account. Callers must @@ -650,6 +690,9 @@ struct ethtool_ops { struct ethtool_eth_mac_stats *mac_stats); void (*get_eth_ctrl_stats)(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats); + void (*get_rmon_stats)(struct net_device *dev, + struct ethtool_rmon_stats *rmon_stats, + const struct ethtool_rmon_hist_range **ranges); }; int ethtool_check_ops(const struct ethtool_ops *ops); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9cb8df89d4f2..cfef6b08169a 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -673,6 +673,7 @@ enum ethtool_link_ext_substate_cable_issue { * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics + * @ETH_SS_STATS_RMON: names of RMON statistics * * @ETH_SS_COUNT: number of defined string sets */ @@ -697,6 +698,7 @@ enum ethtool_stringset { ETH_SS_STATS_ETH_PHY, ETH_SS_STATS_ETH_MAC,
ETH_SS_STATS_ETH_CTRL, + ETH_SS_STATS_RMON, /* add new constants above here */ ETH_SS_COUNT diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 2ea5f049df6a..825cfda1c5d5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -700,6 +700,7 @@ enum { ETHTOOL_STATS_ETH_PHY, ETHTOOL_STATS_ETH_MAC, ETHTOOL_STATS_ETH_CTRL, + ETHTOOL_STATS_RMON, /* add new constants above here */ __ETHTOOL_STATS_CNT @@ -714,6 +715,13 @@ enum { ETHTOOL_A_STATS_GRP_STAT, /* nest */ + ETHTOOL_A_STATS_GRP_HIST_RX, /* nest */ + ETHTOOL_A_STATS_GRP_HIST_TX, /* nest */ + + ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, /* u32 */ + ETHTOOL_A_STATS_GRP_HIST_BKT_HI, /* u32 */ + ETHTOOL_A_STATS_GRP_HIST_VAL, /* u64 */ + /* add new constants above here */ __ETHTOOL_A_STATS_GRP_CNT, ETHTOOL_A_STATS_GRP_MAX = (__ETHTOOL_A_STATS_CNT - 1) @@ -793,6 +801,21 @@ enum { ETHTOOL_A_STATS_ETH_CTRL_MAX = (__ETHTOOL_A_STATS_ETH_CTRL_CNT - 1) }; +enum { + /* etherStatsUndersizePkts */ + ETHTOOL_A_STATS_RMON_UNDERSIZE, + /* etherStatsOversizePkts */ + ETHTOOL_A_STATS_RMON_OVERSIZE, + /* etherStatsFragments */ + ETHTOOL_A_STATS_RMON_FRAG, + /* etherStatsJabbers */ + ETHTOOL_A_STATS_RMON_JABBER, + + /* add new constants above here */ + __ETHTOOL_A_STATS_RMON_CNT, + ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index febfa61e52e2..bed3afdf3656 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -405,5 +405,6 @@ extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; +extern const char 
stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index f4fded66731c..acb2b080c358 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -17,6 +17,8 @@ struct stats_reply_data { struct ethtool_eth_phy_stats phy_stats; struct ethtool_eth_mac_stats mac_stats; struct ethtool_eth_ctrl_stats ctrl_stats; + struct ethtool_rmon_stats rmon_stats; + const struct ethtool_rmon_hist_range *rmon_ranges; }; #define STATS_REPDATA(__reply_base) \ @@ -26,6 +28,7 @@ const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_PHY] = "eth-phy", [ETHTOOL_STATS_ETH_MAC] = "eth-mac", [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl", + [ETHTOOL_STATS_RMON] = "rmon", }; const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { @@ -63,6 +66,13 @@ const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] [ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP] = "UnsupportedOpcodesReceived", }; +const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_RMON_UNDERSIZE] = "etherStatsUndersizePkts", + [ETHTOOL_A_STATS_RMON_OVERSIZE] = "etherStatsOversizePkts", + [ETHTOOL_A_STATS_RMON_FRAG] = "etherStatsFragments", + [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers", +}; + const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -107,6 +117,7 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, memset(&data->phy_stats, 0xff, sizeof(data->phy_stats)); memset(&data->mac_stats, 0xff, sizeof(data->mac_stats)); memset(&data->ctrl_stats, 0xff, sizeof(data->mac_stats)); + memset(&data->rmon_stats, 0xff, sizeof(data->rmon_stats)); if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) @@ -117,6 +128,10 @@ static int stats_prepare_data(const struct 
ethnl_req_info *req_base, if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) && dev->ethtool_ops->get_eth_ctrl_stats) dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats); + if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask) && + dev->ethtool_ops->get_rmon_stats) + dev->ethtool_ops->get_rmon_stats(dev, &data->rmon_stats, + &data->rmon_ranges); ethnl_ops_complete(dev); return 0; @@ -141,6 +156,16 @@ static int stats_reply_size(const struct ethnl_req_info *req_base, n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64); n_grps++; } + if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_rmon_stats) / sizeof(u64); + n_grps++; + /* Above includes the space for _A_STATS_GRP_HIST_VALs */ + + len += (nla_total_size(0) + /* _A_STATS_GRP_HIST */ + nla_total_size(4) + /* _A_STATS_GRP_HIST_BKT_LOW */ + nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */ + ETHTOOL_RMON_HIST_MAX * 2; + } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ @@ -258,6 +283,65 @@ static int stats_put_ctrl_stats(struct sk_buff *skb, return 0; } +static int stats_put_rmon_hist(struct sk_buff *skb, u32 attr, const u64 *hist, + const struct ethtool_rmon_hist_range *ranges) +{ + struct nlattr *nest; + int i; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_RMON_HIST_MAX; i++) { + if (!ranges[i].low && !ranges[i].high) + break; + if (hist[i] == ETHTOOL_STAT_NOT_SET) + continue; + + nest = nla_nest_start(skb, attr); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_HI, + ranges[i].high) || + nla_put_u64_64bit(skb, ETHTOOL_A_STATS_GRP_HIST_VAL, + hist[i], ETHTOOL_A_STATS_GRP_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + +static int stats_put_rmon_stats(struct sk_buff *skb, + 
const struct stats_reply_data *data) +{ + if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_RX, + data->rmon_stats.hist, data->rmon_ranges) || + stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_TX, + data->rmon_stats.hist_tx, data->rmon_ranges)) + return -EMSGSIZE; + + if (stat_put(skb, ETHTOOL_A_STATS_RMON_UNDERSIZE, + data->rmon_stats.undersize_pkts) || + stat_put(skb, ETHTOOL_A_STATS_RMON_OVERSIZE, + data->rmon_stats.oversize_pkts) || + stat_put(skb, ETHTOOL_A_STATS_RMON_FRAG, + data->rmon_stats.fragments) || + stat_put(skb, ETHTOOL_A_STATS_RMON_JABBER, + data->rmon_stats.jabbers)) + return -EMSGSIZE; + + return 0; +} + static int stats_put_stats(struct sk_buff *skb, const struct stats_reply_data *data, u32 id, u32 ss_id, @@ -305,6 +389,9 @@ static int stats_fill_reply(struct sk_buff *skb, ret = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL, ETH_SS_STATS_ETH_CTRL, stats_put_ctrl_stats); + if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON, + ETH_SS_STATS_RMON, stats_put_rmon_stats); return ret; } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index a33c603a7a02..b3029fff715d 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -100,6 +100,11 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_A_STATS_ETH_CTRL_CNT, .strings = stats_eth_ctrl_names, }, + [ETH_SS_STATS_RMON] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_RMON_CNT, + .strings = stats_rmon_names, + }, }; struct strset_req_info { -- cgit v1.2.3 From e10a9892097672b62be4ea265a9eb48f698ca3b8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 16 Apr 2021 15:38:04 -0700 Subject: mptcp: add tracepoint in mptcp_subflow_get_send This patch added a tracepoint in the packet scheduler function mptcp_subflow_get_send(). Suggested-by: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. 
Miller --- MAINTAINERS | 1 + include/trace/events/mptcp.h | 60 ++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.c | 8 +++--- 3 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 include/trace/events/mptcp.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 795b9941c151..0f82854cc430 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12546,6 +12546,7 @@ W: https://github.com/multipath-tcp/mptcp_net-next/wiki B: https://github.com/multipath-tcp/mptcp_net-next/issues F: Documentation/networking/mptcp-sysctl.rst F: include/net/mptcp.h +F: include/trace/events/mptcp.h F: include/uapi/linux/mptcp.h F: net/mptcp/ F: tools/testing/selftests/net/mptcp/ diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h new file mode 100644 index 000000000000..b1617a0162da --- /dev/null +++ b/include/trace/events/mptcp.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mptcp + +#if !defined(_TRACE_MPTCP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MPTCP_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(mptcp_subflow_get_send, + + TP_PROTO(struct mptcp_subflow_context *subflow), + + TP_ARGS(subflow), + + TP_STRUCT__entry( + __field(bool, active) + __field(bool, free) + __field(u32, snd_wnd) + __field(u32, pace) + __field(u8, backup) + __field(u64, ratio) + ), + + TP_fast_assign( + struct sock *ssk; + + __entry->active = mptcp_subflow_active(subflow); + __entry->backup = subflow->backup; + + if (subflow->tcp_sock && sk_fullsock(subflow->tcp_sock)) + __entry->free = sk_stream_memory_free(subflow->tcp_sock); + else + __entry->free = 0; + + ssk = mptcp_subflow_tcp_sock(subflow); + if (ssk && sk_fullsock(ssk)) { + __entry->snd_wnd = tcp_sk(ssk)->snd_wnd; + __entry->pace = ssk->sk_pacing_rate; + } else { + __entry->snd_wnd = 0; + __entry->pace = 0; + } + + if (ssk && sk_fullsock(ssk) && __entry->pace) + __entry->ratio = div_u64((u64)ssk->sk_wmem_queued << 32, __entry->pace); + else +
__entry->ratio = 0; + ), + + TP_printk("active=%d free=%d snd_wnd=%u pace=%u backup=%u ratio=%llu", + __entry->active, __entry->free, + __entry->snd_wnd, __entry->pace, + __entry->backup, __entry->ratio) +); + +#endif /* _TRACE_MPTCP_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5a05c6ca943c..e26ea143754d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -25,6 +25,9 @@ #include "protocol.h" #include "mib.h" +#define CREATE_TRACE_POINTS +#include <trace/events/mptcp.h> + #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct mptcp6_sock { struct mptcp_sock msk; @@ -1410,6 +1413,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) send_info[i].ratio = -1; } mptcp_for_each_subflow(msk, subflow) { + trace_mptcp_subflow_get_send(subflow); ssk = mptcp_subflow_tcp_sock(subflow); if (!mptcp_subflow_active(subflow)) continue; @@ -1430,10 +1434,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) } } - pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld", - msk, nr_active, send_info[0].ssk, send_info[0].ratio, - send_info[1].ssk, send_info[1].ratio); - /* pick the best backup if no other subflow is active */ if (!nr_active) send_info[0].ssk = send_info[1].ssk; -- cgit v1.2.3 From 0918e34b85c7e125f531caaf3d2918baf2b1a5f9 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 16 Apr 2021 15:38:05 -0700 Subject: mptcp: add tracepoint in get_mapping_status This patch added a tracepoint in the mapping status function get_mapping_status() to dump every mpext field. Suggested-by: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S.
Miller --- include/trace/events/mptcp.h | 52 ++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/subflow.c | 6 ++--- 2 files changed, 55 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h index b1617a0162da..ec20350d82eb 100644 --- a/include/trace/events/mptcp.h +++ b/include/trace/events/mptcp.h @@ -54,6 +54,58 @@ TRACE_EVENT(mptcp_subflow_get_send, __entry->backup, __entry->ratio) ); +DECLARE_EVENT_CLASS(mptcp_dump_mpext, + + TP_PROTO(struct mptcp_ext *mpext), + + TP_ARGS(mpext), + + TP_STRUCT__entry( + __field(u64, data_ack) + __field(u64, data_seq) + __field(u32, subflow_seq) + __field(u16, data_len) + __field(u8, use_map) + __field(u8, dsn64) + __field(u8, data_fin) + __field(u8, use_ack) + __field(u8, ack64) + __field(u8, mpc_map) + __field(u8, frozen) + __field(u8, reset_transient) + __field(u8, reset_reason) + ), + + TP_fast_assign( + __entry->data_ack = mpext->ack64 ? mpext->data_ack : mpext->data_ack32; + __entry->data_seq = mpext->data_seq; + __entry->subflow_seq = mpext->subflow_seq; + __entry->data_len = mpext->data_len; + __entry->use_map = mpext->use_map; + __entry->dsn64 = mpext->dsn64; + __entry->data_fin = mpext->data_fin; + __entry->use_ack = mpext->use_ack; + __entry->ack64 = mpext->ack64; + __entry->mpc_map = mpext->mpc_map; + __entry->frozen = mpext->frozen; + __entry->reset_transient = mpext->reset_transient; + __entry->reset_reason = mpext->reset_reason; + ), + + TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u", + __entry->data_ack, __entry->data_seq, + __entry->subflow_seq, __entry->data_len, + __entry->use_map, __entry->dsn64, + __entry->data_fin, __entry->use_ack, + __entry->ack64, __entry->mpc_map, + __entry->frozen, __entry->reset_transient, + __entry->reset_reason) +); + +DEFINE_EVENT(mptcp_dump_mpext, get_mapping_status, + 
TP_PROTO(struct mptcp_ext *mpext), + TP_ARGS(mpext)); + #endif /* _TRACE_MPTCP_H */ /* This part must be outside protection */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index c3da84576b3c..d8a2a55ae916 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -25,6 +25,8 @@ #include "protocol.h" #include "mib.h" +#include <trace/events/mptcp.h> + static void mptcp_subflow_ops_undo_override(struct sock *ssk); static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, @@ -862,9 +864,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk, goto validate_seq; } - pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d", - mpext->data_seq, mpext->dsn64, mpext->subflow_seq, - mpext->data_len, mpext->data_fin); + trace_get_mapping_status(mpext); data_len = mpext->data_len; if (data_len == 0) { -- cgit v1.2.3 From ed66bfb4ce34a94174bb755eeaca85d1661d36ad Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 16 Apr 2021 15:38:06 -0700 Subject: mptcp: add tracepoint in ack_update_msk This patch added a tracepoint in ack_update_msk() to track the incoming data_ack and window/snd_una updates. Suggested-by: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S.
Miller --- include/trace/events/mptcp.h | 32 ++++++++++++++++++++++++++++++++ net/mptcp/options.c | 6 ++++++ 2 files changed, 38 insertions(+) (limited to 'include') diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h index ec20350d82eb..b90bfe45d995 100644 --- a/include/trace/events/mptcp.h +++ b/include/trace/events/mptcp.h @@ -106,6 +106,38 @@ DEFINE_EVENT(mptcp_dump_mpext, get_mapping_status, TP_PROTO(struct mptcp_ext *mpext), TP_ARGS(mpext)); +TRACE_EVENT(ack_update_msk, + + TP_PROTO(u64 data_ack, u64 old_snd_una, + u64 new_snd_una, u64 new_wnd_end, + u64 msk_wnd_end), + + TP_ARGS(data_ack, old_snd_una, + new_snd_una, new_wnd_end, + msk_wnd_end), + + TP_STRUCT__entry( + __field(u64, data_ack) + __field(u64, old_snd_una) + __field(u64, new_snd_una) + __field(u64, new_wnd_end) + __field(u64, msk_wnd_end) + ), + + TP_fast_assign( + __entry->data_ack = data_ack; + __entry->old_snd_una = old_snd_una; + __entry->new_snd_una = new_snd_una; + __entry->new_wnd_end = new_wnd_end; + __entry->msk_wnd_end = msk_wnd_end; + ), + + TP_printk("data_ack=%llu old_snd_una=%llu new_snd_una=%llu new_wnd_end=%llu msk_wnd_end=%llu", + __entry->data_ack, __entry->old_snd_una, + __entry->new_snd_una, __entry->new_wnd_end, + __entry->msk_wnd_end) +); + #endif /* _TRACE_MPTCP_H */ /* This part must be outside protection */ diff --git a/net/mptcp/options.c b/net/mptcp/options.c index d51c3ad54d9a..99fc21406168 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -13,6 +13,8 @@ #include "protocol.h" #include "mib.h" +#include <trace/events/mptcp.h> + static bool mptcp_cap_flag_sha256(u8 flags) { return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; @@ -943,6 +945,10 @@ static void ack_update_msk(struct mptcp_sock *msk, __mptcp_data_acked(sk); } mptcp_data_unlock(sk); + + trace_ack_update_msk(mp_opt->data_ack, + old_snd_una, new_snd_una, + new_wnd_end, msk->wnd_end); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) -- cgit v1.2.3
From d96a838a7ce2772ed181f89becd79b72d267f93a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 16 Apr 2021 15:38:07 -0700 Subject: mptcp: add tracepoint in subflow_check_data_avail This patch added a tracepoint in subflow_check_data_avail() to show the mapping status. Suggested-by: Paolo Abeni Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/trace/events/mptcp.h | 29 +++++++++++++++++++++++++++++ net/mptcp/subflow.c | 4 +--- 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h index b90bfe45d995..775a46d0b0f0 100644 --- a/include/trace/events/mptcp.h +++ b/include/trace/events/mptcp.h @@ -7,6 +7,14 @@ #include <linux/tracepoint.h> +#define show_mapping_status(status) \ + __print_symbolic(status, \ + { 0, "MAPPING_OK" }, \ + { 1, "MAPPING_INVALID" }, \ + { 2, "MAPPING_EMPTY" }, \ + { 3, "MAPPING_DATA_FIN" }, \ + { 4, "MAPPING_DUMMY" }) + TRACE_EVENT(mptcp_subflow_get_send, TP_PROTO(struct mptcp_subflow_context *subflow), @@ -138,6 +146,27 @@ TRACE_EVENT(ack_update_msk, __entry->msk_wnd_end) ); +TRACE_EVENT(subflow_check_data_avail, + + TP_PROTO(__u8 status, struct sk_buff *skb), + + TP_ARGS(status, skb), + + TP_STRUCT__entry( + __field(u8, status) + __field(const void *, skb) + ), + + TP_fast_assign( + __entry->status = status; + __entry->skb = skb; + ), + + TP_printk("mapping_status=%s, skb=%p", + show_mapping_status(__entry->status), + __entry->skb) +); + #endif /* _TRACE_MPTCP_H */ /* This part must be outside protection */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index d8a2a55ae916..82e91b00ad39 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1002,8 +1002,6 @@ static bool subflow_check_data_avail(struct sock *ssk) struct mptcp_sock *msk; struct sk_buff *skb; - pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk, - subflow->data_avail, skb_peek(&ssk->sk_receive_queue)); if
(!skb_peek(&ssk->sk_receive_queue)) subflow->data_avail = 0; if (subflow->data_avail) @@ -1015,7 +1013,7 @@ static bool subflow_check_data_avail(struct sock *ssk) u64 old_ack; status = get_mapping_status(ssk, msk); - pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status); + trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue)); if (status == MAPPING_INVALID) { ssk->sk_err = EBADMSG; goto fatal; -- cgit v1.2.3 From 14c20643ef9457679cc6934d77adc24296505214 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 12 Apr 2021 14:11:39 +0200 Subject: netfilter: nft_payload: fix C-VLAN offload support - add another struct flow_dissector_key_vlan for C-VLAN - update layer 3 dependency to allow to match on IPv4/IPv6 Fixes: 89d8fd44abfb ("netfilter: nft_payload: add C-VLAN offload support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_offload.h | 1 + net/netfilter/nft_payload.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 1d34fe154fe0..b4d080061399 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -45,6 +45,7 @@ struct nft_flow_key { struct flow_dissector_key_ports tp; struct flow_dissector_key_ip ip; struct flow_dissector_key_vlan vlan; + struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_eth_addrs eth_addrs; struct flow_dissector_key_meta meta; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index cb1c8c231880..a990f37e0a60 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -241,7 +241,7 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, cvlan, vlan_tci, sizeof(__be16), reg); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + @@ -249,8 +249,9 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, vlan, + NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, cvlan, vlan_tpid, sizeof(__be16), reg); + nft_offload_set_dependency(ctx, NFT_OFFLOAD_DEP_NETWORK); break; default: return -EOPNOTSUPP; -- cgit v1.2.3 From ff4d90a89d3d4d9814e0a2696509a7d495be4163 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 12 Apr 2021 14:20:15 +0200 Subject: netfilter: nftables_offload: VLAN id needs host byteorder in flow dissector The flow dissector representation expects the VLAN id in host byteorder. Add the NFT_OFFLOAD_F_NETWORK2HOST flag to swap the bytes from nft_cmp. 
Fixes: a82055af5959 ("netfilter: nft_payload: add VLAN offload support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_offload.h | 11 ++++++++- net/netfilter/nft_cmp.c | 41 +++++++++++++++++++++++++++++-- net/netfilter/nft_payload.c | 10 +++++--- 3 files changed, 55 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index b4d080061399..434a6158852f 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -4,11 +4,16 @@ #include #include +enum nft_offload_reg_flags { + NFT_OFFLOAD_F_NETWORK2HOST = (1 << 0), +}; + struct nft_offload_reg { u32 key; u32 len; u32 base_offset; u32 offset; + u32 flags; struct nft_data data; struct nft_data mask; }; @@ -72,13 +77,17 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rul void nft_flow_rule_destroy(struct nft_flow_rule *flow); int nft_flow_rule_offload_commit(struct net *net); -#define NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ +#define NFT_OFFLOAD_MATCH_FLAGS(__key, __base, __field, __len, __reg, __flags) \ (__reg)->base_offset = \ offsetof(struct nft_flow_key, __base); \ (__reg)->offset = \ offsetof(struct nft_flow_key, __base.__field); \ (__reg)->len = __len; \ (__reg)->key = __key; \ + (__reg)->flags = __flags; + +#define NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ + NFT_OFFLOAD_MATCH_FLAGS(__key, __base, __field, __len, __reg, 0) #define NFT_OFFLOAD_MATCH_EXACT(__key, __base, __field, __len, __reg) \ NFT_OFFLOAD_MATCH(__key, __base, __field, __len, __reg) \ diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index eb6a43a180bb..47b6d05f1ae6 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -114,19 +114,56 @@ nla_put_failure: return -1; } +union nft_cmp_offload_data { + u16 val16; + u32 val32; + u64 val64; +}; + +static void nft_payload_n2h(union 
nft_cmp_offload_data *data, + const u8 *val, u32 len) +{ + switch (len) { + case 2: + data->val16 = ntohs(*((u16 *)val)); + break; + case 4: + data->val32 = ntohl(*((u32 *)val)); + break; + case 8: + data->val64 = be64_to_cpu(*((u64 *)val)); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + static int __nft_cmp_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_cmp_expr *priv) { struct nft_offload_reg *reg = &ctx->regs[priv->sreg]; + union nft_cmp_offload_data _data, _datamask; u8 *mask = (u8 *)&flow->match.mask; u8 *key = (u8 *)&flow->match.key; + u8 *data, *datamask; if (priv->op != NFT_CMP_EQ || priv->len > reg->len) return -EOPNOTSUPP; - memcpy(key + reg->offset, &priv->data, reg->len); - memcpy(mask + reg->offset, &reg->mask, reg->len); + if (reg->flags & NFT_OFFLOAD_F_NETWORK2HOST) { + nft_payload_n2h(&_data, (u8 *)&priv->data, reg->len); + nft_payload_n2h(&_datamask, (u8 *)&reg->mask, reg->len); + data = (u8 *)&_data; + datamask = (u8 *)&_datamask; + } else { + data = (u8 *)&priv->data; + datamask = (u8 *)&reg->mask; + } + + memcpy(key + reg->offset, data, reg->len); + memcpy(mask + reg->offset, datamask, reg->len); flow->match.dissector.used_keys |= BIT(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index a990f37e0a60..501c5b24cc39 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -226,8 +226,9 @@ static int nft_payload_offload_ll(struct nft_offload_ctx *ctx, if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_VLAN, vlan, - vlan_tci, sizeof(__be16), reg); + NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_VLAN, vlan, + vlan_tci, sizeof(__be16), reg, + NFT_OFFLOAD_F_NETWORK2HOST); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto): if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) @@ -241,8 +242,9 @@ static int 
nft_payload_offload_ll(struct nft_offload_ctx *ctx, if (!nft_payload_offload_mask(reg, priv->len, sizeof(__be16))) return -EOPNOTSUPP; - NFT_OFFLOAD_MATCH(FLOW_DISSECTOR_KEY_CVLAN, cvlan, - vlan_tci, sizeof(__be16), reg); + NFT_OFFLOAD_MATCH_FLAGS(FLOW_DISSECTOR_KEY_CVLAN, cvlan, + vlan_tci, sizeof(__be16), reg, + NFT_OFFLOAD_F_NETWORK2HOST); break; case offsetof(struct vlan_ethhdr, h_vlan_encapsulated_proto) + sizeof(struct vlan_hdr): -- cgit v1.2.3 From b72920f6e4a9d6607b723d69b7f412c829769c75 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 15 Apr 2021 20:10:18 +0200 Subject: netfilter: nftables: counter hardware offload support This patch adds the .offload_stats operation to synchronize hardware stats with the expression data. Update the counter expression to use this new interface. The hardware stats are retrieved from the netlink dump path via FLOW_CLS_STATS command to the driver. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/net/netfilter/nf_tables_offload.h | 1 + net/netfilter/nf_tables_api.c | 3 +++ net/netfilter/nf_tables_offload.c | 44 ++++++++++++++++++++++++++----- net/netfilter/nft_counter.c | 29 ++++++++++++++++++++ 5 files changed, 72 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f0f7a3c5da6a..4a75da2a2e1d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -867,6 +867,8 @@ struct nft_expr_ops { int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); + void (*offload_stats)(struct nft_expr *expr, + const struct flow_stats *stats); u32 offload_flags; const struct nft_expr_type *type; void *data; diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 434a6158852f..f9d95ff82df8 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ 
b/include/net/netfilter/nf_tables_offload.h @@ -74,6 +74,7 @@ void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, struct nft_rule; struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule); +int nft_flow_rule_stats(const struct nft_chain *chain, const struct nft_rule *rule); void nft_flow_rule_destroy(struct nft_flow_rule *flow); int nft_flow_rule_offload_commit(struct net *net); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1b881a84bd01..37e9accd9aeb 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2878,6 +2878,9 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; } + if (chain->flags & NFT_CHAIN_HW_OFFLOAD) + nft_flow_rule_stats(chain, rule); + list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS); if (list == NULL) goto nla_put_failure; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 1d428792018f..19215e81dd66 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -243,26 +243,56 @@ static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, cls_flow->rule = flow->rule; } -static int nft_flow_offload_rule(struct nft_chain *chain, - struct nft_rule *rule, - struct nft_flow_rule *flow, - enum flow_cls_command command) +static int nft_flow_offload_cmd(const struct nft_chain *chain, + const struct nft_rule *rule, + struct nft_flow_rule *flow, + enum flow_cls_command command, + struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; - struct flow_cls_offload cls_flow; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); - nft_flow_cls_offload_setup(&cls_flow, basechain, rule, flow, &extack, + nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); - return nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, + return 
nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } +static int nft_flow_offload_rule(const struct nft_chain *chain, + struct nft_rule *rule, + struct nft_flow_rule *flow, + enum flow_cls_command command) +{ + struct flow_cls_offload cls_flow; + + return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); +} + +int nft_flow_rule_stats(const struct nft_chain *chain, + const struct nft_rule *rule) +{ + struct flow_cls_offload cls_flow = {}; + struct nft_expr *expr, *next; + int err; + + err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, + &cls_flow); + if (err < 0) + return err; + + nft_rule_for_each_expr(expr, next, rule) { + if (expr->ops->offload_stats) + expr->ops->offload_stats(expr, &cls_flow.stats); + } + + return 0; +} + static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 85ed461ec24e..8edd3b3c173d 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -13,6 +13,7 @@ #include #include #include +#include struct nft_counter { s64 bytes; @@ -248,6 +249,32 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) return 0; } +static int nft_counter_offload(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow, + const struct nft_expr *expr) +{ + /* No specific offload action is needed, but report success. 
*/ + return 0; +} + +static void nft_counter_offload_stats(struct nft_expr *expr, + const struct flow_stats *stats) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter *this_cpu; + seqcount_t *myseq; + + preempt_disable(); + this_cpu = this_cpu_ptr(priv->counter); + myseq = this_cpu_ptr(&nft_counter_seq); + + write_seqcount_begin(myseq); + this_cpu->packets += stats->pkts; + this_cpu->bytes += stats->bytes; + write_seqcount_end(myseq); + preempt_enable(); +} + static struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, @@ -258,6 +285,8 @@ static const struct nft_expr_ops nft_counter_ops = { .destroy_clone = nft_counter_destroy, .dump = nft_counter_dump, .clone = nft_counter_clone, + .offload = nft_counter_offload, + .offload_stats = nft_counter_offload_stats, }; static struct nft_expr_type nft_counter_type __read_mostly = { -- cgit v1.2.3 From 810344ed07d9ea55e42e99d87034e234e7e6a4a5 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Tue, 13 Apr 2021 14:38:50 +0300 Subject: cfg80211: fix an htmldoc warning The htmldoc produces this warning which was introduced by the commit below. include/net/cfg80211.h:6643: warning: expecting prototype for wiphy_rfkill_set_hw_state(). 
Prototype was for wiphy_rfkill_set_hw_state_reason() instead Fixes: 6f779a66dc84 ("cfg80211: allow specifying a reason for hw_rfkill") Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20210413113850.59098-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3b296f2b7a2c..c6134220dd8f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6634,7 +6634,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, */ /** - * wiphy_rfkill_set_hw_state - notify cfg80211 about hw block state + * wiphy_rfkill_set_hw_state_reason - notify cfg80211 about hw block state * @wiphy: the wiphy * @blocked: block status * @reason: one of reasons in &enum rfkill_hard_block_reasons -- cgit v1.2.3 From 5d9c358d05f62aa01ff5d63dae70a897498b0bae Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 16 Apr 2021 23:01:42 -0700 Subject: cfg80211: fix a few kernel-doc warnings Fix multiple kernel-doc warnings in cfg80211.h. 
cfg80211.h:363: warning: missing initial short description on line: * struct ieee80211_sband_iftype_data cfg80211.h:6743: warning: missing initial short description on line: * cfg80211_vendor_cmd_get_sender Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20210417060142.1648-1-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c6134220dd8f..528bea585bee 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -360,7 +360,7 @@ struct ieee80211_sta_he_cap { }; /** - * struct ieee80211_sband_iftype_data + * struct ieee80211_sband_iftype_data - sband data per interface type * * This structure encapsulates sband data that is relevant for the * interface types defined in @types_mask. Each type in the @@ -6740,7 +6740,7 @@ cfg80211_vendor_cmd_alloc_reply_skb(struct wiphy *wiphy, int approxlen) int cfg80211_vendor_cmd_reply(struct sk_buff *skb); /** - * cfg80211_vendor_cmd_get_sender + * cfg80211_vendor_cmd_get_sender - get the current sender netlink ID * @wiphy: the wiphy * * Return the current netlink port ID in a vendor command handler. -- cgit v1.2.3 From 623b988f2dcbecf3e638ecfaec97cc56a95eaa6a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 18 Apr 2021 01:39:53 -0700 Subject: cfg80211: constify ieee80211_get_response_rate return It's not modified so make it const with the eventual goal of moving data to text for various static struct ieee80211_rate arrays. 
Signed-off-by: Joe Perches Link: https://lore.kernel.org/r/8b210b5f5972e39eded269b35a1297cf824c4181.camel@perches.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/b43/main.c | 2 +- drivers/net/wireless/broadcom/b43legacy/main.c | 2 +- include/net/cfg80211.h | 2 +- net/wireless/util.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c index 150a366e8f62..17bcec5f3ff7 100644 --- a/drivers/net/wireless/broadcom/b43/main.c +++ b/drivers/net/wireless/broadcom/b43/main.c @@ -4053,7 +4053,7 @@ static void b43_update_basic_rates(struct b43_wldev *dev, u32 brates) { struct ieee80211_supported_band *sband = dev->wl->hw->wiphy->bands[b43_current_band(dev->wl)]; - struct ieee80211_rate *rate; + const struct ieee80211_rate *rate; int i; u16 basic, direct, offset, basic_offset, rateptr; diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c index 7692a2618c97..f64ebff68308 100644 --- a/drivers/net/wireless/broadcom/b43legacy/main.c +++ b/drivers/net/wireless/broadcom/b43legacy/main.c @@ -2762,7 +2762,7 @@ static void b43legacy_update_basic_rates(struct b43legacy_wldev *dev, u32 brates { struct ieee80211_supported_band *sband = dev->wl->hw->wiphy->bands[NL80211_BAND_2GHZ]; - struct ieee80211_rate *rate; + const struct ieee80211_rate *rate; int i; u16 basic, direct, offset, basic_offset, rateptr; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 528bea585bee..73b17ea89248 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5607,7 +5607,7 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) * which is, for this function, given as a bitmap of indices of * rates in the band's bitrate table. 
*/ -struct ieee80211_rate * +const struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, u32 basic_rates, int bitrate); diff --git a/net/wireless/util.c b/net/wireless/util.c index 1bf0200f562a..382c5262d997 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -24,7 +24,7 @@ #include "rdev-ops.h" -struct ieee80211_rate * +const struct ieee80211_rate * ieee80211_get_response_rate(struct ieee80211_supported_band *sband, u32 basic_rates, int bitrate) { -- cgit v1.2.3 From efce5b50bad8b63d07719318c34a664ccdb56b70 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 9 Apr 2021 12:40:26 +0300 Subject: ieee80211: add the values of ranging parameters max LTF total field Add an enum with the values of the ranging parameters max LTF total field, as defined in IEEE802.11az_D2.6, Table 9-322h23fc. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210409123755.d2588ebb1974.I9424c8ade13c4c938cb9999d8ce99d0d4c1cc198@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 72ff75fb1971..25fc7bee868a 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3861,4 +3861,11 @@ struct ieee80211_neighbor_ap_info { u8 channel; } __packed; +enum ieee80211_range_params_max_total_ltf { + IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_4 = 0, + IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_8, + IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_16, + IEEE80211_RANGE_PARAMS_MAX_TOTAL_LTF_UNSPECIFIED, +}; + #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From 73807523f9a6612106582ab19217f280ed128f24 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 9 Apr 2021 12:40:25 +0300 Subject: nl80211/cfg80211: add a flag to negotiate for LMR feedback in NDP ranging Add a flag that indicates that the ISTA shall indicate support for LMR feedback in NDP ranging 
negotiation. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210409123755.eff546283504.I2606161e700ac24d94d0b50c8edcdedd4c0395c2@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 ++++- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 1 + net/wireless/pmsr.c | 12 +++++++++++- 4 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 73b17ea89248..5224f885a99a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3521,6 +3521,8 @@ struct cfg80211_pmsr_result { * @non_trigger_based: use non trigger based ranging for the measurement * If neither @trigger_based nor @non_trigger_based is set, * EDCA based ranging will be used. + * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either + * @trigger_based or @non_trigger_based is set. * * See also nl80211 for the respective attribute documentation. */ @@ -3532,7 +3534,8 @@ struct cfg80211_pmsr_ftm_request_peer { request_lci:1, request_civicloc:1, trigger_based:1, - non_trigger_based:1; + non_trigger_based:1, + lmr_feedback:1; u8 num_bursts_exp; u8 burst_duration; u8 ftms_per_burst; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 18dfe744bcb5..00f696d177e6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6896,6 +6896,9 @@ enum nl80211_peer_measurement_ftm_capa { * if neither %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED nor * %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set, EDCA based * ranging will be used. + * @NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK: negotiate for LMR feedback. Only + * valid if either %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED or + * %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set. 
* * @NUM_NL80211_PMSR_FTM_REQ_ATTR: internal * @NL80211_PMSR_FTM_REQ_ATTR_MAX: highest attribute number @@ -6914,6 +6917,7 @@ enum nl80211_peer_measurement_ftm_req { NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC, NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED, NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED, + NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK, /* keep last */ NUM_NL80211_PMSR_FTM_REQ_ATTR, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index adfa07c67b44..aad19348bb46 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -309,6 +309,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_REQ_ATTR_REQUEST_CIVICLOC] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, + [NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG }, }; static const struct nla_policy diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index a95c79d18349..6bdd96408022 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation */ #ifndef __PMSR_H #define __PMSR_H @@ -158,6 +158,16 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } + out->ftm.lmr_feedback = + !!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK]; + if (!out->ftm.trigger_based && !out->ftm.non_trigger_based && + out->ftm.lmr_feedback) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK], + "FTM: LMR feedback set for EDCA based ranging"); + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From f30386a85f695aced2fa5b124d65ce5a5f3dc3ac Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 9 Apr 2021 12:40:19 +0300 Subject: mac80211: make ieee80211_vif_to_wdev work when the vif isn't in the driver This will allow the low level driver to get the 
wdev during the add_interface flow. In order to do that, remove a few checks from there and do not return NULL for vifs that were not yet added to the driver. Note that all the current callers of this helper function assume that the vif already exists: - The callers from the drivers already have a vif pointer. Before this change, ieee80211_vif_to_wdev would return NULL in some cases, but those callers don't even check they get a non-NULL pointer from ieee80211_vif_to_wdev. - The callers from net/mac80211/cfg.c assume the vif is already added to the driver as well. So, this change has no impact on existing callers of this helper function. Signed-off-by: Emmanuel Grumbach Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210409123755.6078d3517095.I1907a45f267a62dab052bcc44428aa7a2005ffc9@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 +---- net/mac80211/util.c | 10 +--------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c21a0e27b35e..445b66c6eb7e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1768,10 +1768,7 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); * * This can be used by mac80211 drivers with direct cfg80211 APIs * (like the vendor commands) that needs to get the wdev for a vif. - * - * Note that this function may return %NULL if the given wdev isn't - * associated with a vif that the driver knows about (e.g. monitor - * or AP_VLAN interfaces.) + * This can also be useful to get the netdev associated to a vif. 
*/ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index c0fa526a45b4..0a0481f5af48 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -888,18 +888,10 @@ EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) { - struct ieee80211_sub_if_data *sdata; - if (!vif) return NULL; - sdata = vif_to_sdata(vif); - - if (!ieee80211_sdata_running(sdata) || - !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) - return NULL; - - return &sdata->wdev; + return &vif_to_sdata(vif)->wdev; } EXPORT_SYMBOL_GPL(ieee80211_vif_to_wdev); -- cgit v1.2.3 From f12ce9f607ffa5c617cd86cb7a7a0aaefe58f127 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Fri, 9 Apr 2021 12:40:15 +0300 Subject: nl80211: Add new RSNXE related nl80211 extended features Draft P802.11ax_D2.5 defines the following capabilities that can be negotiated using RSNXE capabilities: - Secure LTF measurement exchange protocol. - Secure RTT measurement exchange protocol. - Management frame protection for all management frames exchanged during the negotiation and range measurement procedure. Extend the nl80211 API to allow drivers to declare support for these new capabilities as part of extended feature. Signed-off-by: Ilan Peer Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210409123755.8280e31d8091.Ifcb29f84f432290338f80c8378aa5c9e0a390c93@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 00f696d177e6..f962c06e9818 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5940,6 +5940,16 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_BEACON_RATE_HE: Driver supports beacon rate * configuration (AP/mesh) with HE rates. 
* + * @NL80211_EXT_FEATURE_SECURE_LTF: Device supports secure LTF measurement + * exchange protocol. + * + * @NL80211_EXT_FEATURE_SECURE_RTT: Device supports secure RTT measurement + * exchange protocol. + * + * @NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE: Device supports management + * frame protection for all management frames exchanged during the + * negotiation and range measurement procedure. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6001,6 +6011,9 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_FILS_DISCOVERY, NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP, NL80211_EXT_FEATURE_BEACON_RATE_HE, + NL80211_EXT_FEATURE_SECURE_LTF, + NL80211_EXT_FEATURE_SECURE_RTT, + NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From b07dd26f07af294ceed9715fd11e312ff8de6138 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 14 Apr 2021 18:12:51 +0200 Subject: flow: remove spi key from flowi struct xfrm session decode ipv4 path (but not ipv6) sets this, but there are no consumers. Remove it. 
Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/flow.h | 3 --- net/xfrm/xfrm_policy.c | 39 --------------------------------------- 2 files changed, 42 deletions(-) (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index 39d0cedcddee..6f5e70240071 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -59,7 +59,6 @@ union flowi_uli { __le16 sport; } dnports; - __be32 spi; __be32 gre_key; struct { @@ -90,7 +89,6 @@ struct flowi4 { #define fl4_dport uli.ports.dport #define fl4_icmp_type uli.icmpt.type #define fl4_icmp_code uli.icmpt.code -#define fl4_ipsec_spi uli.spi #define fl4_mh_type uli.mht.type #define fl4_gre_key uli.gre_key } __attribute__((__aligned__(BITS_PER_LONG/8))); @@ -150,7 +148,6 @@ struct flowi6 { #define fl6_dport uli.ports.dport #define fl6_icmp_type uli.icmpt.type #define fl6_icmp_code uli.icmpt.code -#define fl6_ipsec_spi uli.spi #define fl6_mh_type uli.mht.type #define fl6_gre_key uli.gre_key __u32 mp_hash; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 156347fd7e2e..cc6e02eb76c2 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3326,39 +3326,6 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) fl4->fl4_icmp_code = icmp[1]; } break; - case IPPROTO_ESP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be32 *ehdr; - - xprth = skb_network_header(skb) + ihl * 4; - ehdr = (__be32 *)xprth; - - fl4->fl4_ipsec_spi = ehdr[0]; - } - break; - case IPPROTO_AH: - if (xprth + 8 < skb->data || - pskb_may_pull(skb, xprth + 8 - skb->data)) { - __be32 *ah_hdr; - - xprth = skb_network_header(skb) + ihl * 4; - ah_hdr = (__be32 *)xprth; - - fl4->fl4_ipsec_spi = ah_hdr[1]; - } - break; - case IPPROTO_COMP: - if (xprth + 4 < skb->data || - pskb_may_pull(skb, xprth + 4 - skb->data)) { - __be16 *ipcomp_hdr; - - xprth = skb_network_header(skb) + ihl * 4; - ipcomp_hdr = (__be16 *)xprth; - - fl4->fl4_ipsec_spi = 
htonl(ntohs(ipcomp_hdr[1])); - } - break; case IPPROTO_GRE: if (xprth + 12 < skb->data || pskb_may_pull(skb, xprth + 12 - skb->data)) { @@ -3377,7 +3344,6 @@ decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) } break; default: - fl4->fl4_ipsec_spi = 0; break; } } @@ -3470,12 +3436,7 @@ decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse) fl6->flowi6_proto = nexthdr; return; #endif - /* XXX Why are there these headers? */ - case IPPROTO_AH: - case IPPROTO_ESP: - case IPPROTO_COMP: default: - fl6->fl6_ipsec_spi = 0; fl6->flowi6_proto = nexthdr; return; } -- cgit v1.2.3 From 76cf42213307f0908e010ac4c2bdcb77113202dd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 9 Apr 2021 12:40:17 +0300 Subject: wireless: align some HE capabilities with the spec Some names were changed, align that with the spec as of 802.11ax-D6.1. Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210409123755.b1e5fbab0d8c.I3eb6076cb0714ec6aec6b8f9dee613ce4a05d825@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 10 +++++----- drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 6 +++--- drivers/net/wireless/mediatek/mt76/mt7915/init.c | 10 +++++----- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 4 ++-- drivers/net/wireless/mediatek/mt76/mt7921/main.c | 2 +- include/linux/ieee80211.h | 14 +++++++------- net/mac80211/debugfs_sta.c | 19 ++++++++++--------- 7 files changed, 33 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index faa2e678e63e..343768afedc4 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -3807,7 +3807,7 @@ ath11k_mac_filter_he_cap_mesh(struct ieee80211_he_cap_elem *he_cap_elem) IEEE80211_HE_MAC_CAP4_BQR; he_cap_elem->mac_cap_info[4] &= ~m; - m = IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECVITE_TRANSMISSION | + m = 
IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION | IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU | IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING | IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX; @@ -3817,7 +3817,7 @@ ath11k_mac_filter_he_cap_mesh(struct ieee80211_he_cap_elem *he_cap_elem) IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO; he_cap_elem->phy_cap_info[2] &= ~m; - m = IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA | + m = IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU | IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK | IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK; he_cap_elem->phy_cap_info[3] &= ~m; @@ -3829,13 +3829,13 @@ ath11k_mac_filter_he_cap_mesh(struct ieee80211_he_cap_elem *he_cap_elem) he_cap_elem->phy_cap_info[5] &= ~m; m = IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU | - IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB | + IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB | IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB | IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO; he_cap_elem->phy_cap_info[6] &= ~m; - m = IEEE80211_HE_PHY_CAP7_SRP_BASED_SR | - IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR | + m = IEEE80211_HE_PHY_CAP7_PSR_BASED_SR | + IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP | IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ | IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ; he_cap_elem->phy_cap_info[7] &= ~m; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index af684f80b0cc..632f20d4027d 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -583,11 +583,11 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = { IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 | IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2, .phy_cap_info[6] = - IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB | - IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB | + IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB | + 
IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB | IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT, .phy_cap_info[7] = - IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR | + IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP | IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI | IEEE80211_HE_PHY_CAP7_MAX_NC_1, .phy_cap_info[8] = diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c index ad4e5b95158b..d9a0587f234b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c @@ -370,8 +370,8 @@ mt7915_set_stream_he_txbf_caps(struct ieee80211_sta_he_cap *he_cap, IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK; elem->phy_cap_info[5] &= ~c; - c = IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB | - IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB; + c = IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB | + IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB; elem->phy_cap_info[6] &= ~c; elem->phy_cap_info[7] &= ~IEEE80211_HE_PHY_CAP7_MAX_NC_MASK; @@ -408,8 +408,8 @@ mt7915_set_stream_he_txbf_caps(struct ieee80211_sta_he_cap *he_cap, c = (nss - 1) | (max_t(int, le16_to_cpu(mcs->tx_mcs_160), 1) << 3); elem->phy_cap_info[5] |= c; - c = IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB | - IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB; + c = IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB | + IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB; elem->phy_cap_info[6] |= c; /* the maximum cap is 4 x 3, (Nr, Nc) = (3, 2) */ @@ -535,7 +535,7 @@ mt7915_init_he_caps(struct mt7915_phy *phy, enum nl80211_band band, IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE | IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT; he_cap_elem->phy_cap_info[7] |= - IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR | + IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP | IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI; he_cap_elem->phy_cap_info[8] |= IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G | diff 
--git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 195929242b72..97ef0265c516 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -1821,9 +1821,9 @@ mt7915_mcu_sta_bfer_he(struct ieee80211_sta *sta, struct ieee80211_vif *vif, bf->tx_mode = MT_PHY_TYPE_HE_SU; mt7915_mcu_sta_sounding_rate(bf); - bf->trigger_su = HE_PHY(CAP6_TRIG_SU_BEAMFORMER_FB, + bf->trigger_su = HE_PHY(CAP6_TRIG_SU_BEAMFORMING_FB, pe->phy_cap_info[6]); - bf->trigger_mu = HE_PHY(CAP6_TRIG_MU_BEAMFORMER_FB, + bf->trigger_mu = HE_PHY(CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB, pe->phy_cap_info[6]); bfer_nr = HE_PHY(CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK, ve->phy_cap_info[5]); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 729f6c42cdde..fdd93926b516 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -108,7 +108,7 @@ mt7921_init_he_caps(struct mt7921_phy *phy, enum nl80211_band band, IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE | IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT; he_cap_elem->phy_cap_info[7] |= - IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR | + IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP | IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI; he_cap_elem->phy_cap_info[8] |= IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G | diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 25fc7bee868a..687db25eb85f 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2020,7 +2020,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 #define IEEE80211_HE_MAC_CAP4_QTP 0x02 #define IEEE80211_HE_MAC_CAP4_BQR 0x04 -#define IEEE80211_HE_MAC_CAP4_SRP_RESP 0x08 +#define IEEE80211_HE_MAC_CAP4_PSR_RESP 0x08 #define 
IEEE80211_HE_MAC_CAP4_NDP_FB_REP 0x10 #define IEEE80211_HE_MAC_CAP4_OPS 0x20 #define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU 0x40 @@ -2031,7 +2031,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40 0x01 #define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41 0x02 -#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECVITE_TRANSMISSION 0x04 +#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION 0x04 #define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU 0x08 #define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX 0x10 #define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS 0x20 @@ -2089,7 +2089,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK 0x18 #define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1 0x00 #define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2 0x20 -#define IEEE80211_HE_PHY_CAP3_RX_HE_MU_PPDU_FROM_NON_AP_STA 0x40 +#define IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU 0x40 #define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER 0x80 #define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE 0x01 @@ -2136,15 +2136,15 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU 0x01 #define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU 0x02 -#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMER_FB 0x04 -#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMER_FB 0x08 +#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB 0x04 +#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB 0x08 #define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB 0x10 #define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE 0x20 #define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO 0x40 #define IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT 0x80 -#define IEEE80211_HE_PHY_CAP7_SRP_BASED_SR 0x01 -#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_AR 0x02 +#define IEEE80211_HE_PHY_CAP7_PSR_BASED_SR 0x01 +#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP 0x02 #define 
IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI 0x04 #define IEEE80211_HE_PHY_CAP7_MAX_NC_1 0x08 #define IEEE80211_HE_PHY_CAP7_MAX_NC_2 0x10 diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 5a27c61a7b38..d350224d45e8 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -732,15 +732,15 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, PFLAG(MAC, 4, BSRP_BQRP_A_MPDU_AGG, "BSRP-BQRP-A-MPDU-AGG"); PFLAG(MAC, 4, QTP, "QTP"); PFLAG(MAC, 4, BQR, "BQR"); - PFLAG(MAC, 4, SRP_RESP, "SRP-RESP"); + PFLAG(MAC, 4, PSR_RESP, "PSR-RESP"); PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP"); PFLAG(MAC, 4, OPS, "OPS"); PFLAG(MAC, 4, AMDSU_IN_AMPDU, "AMSDU-IN-AMPDU"); PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7); - PFLAG(MAC, 5, SUBCHAN_SELECVITE_TRANSMISSION, - "SUBCHAN-SELECVITE-TRANSMISSION"); + PFLAG(MAC, 5, SUBCHAN_SELECTIVE_TRANSMISSION, + "SUBCHAN-SELECTIVE-TRANSMISSION"); PFLAG(MAC, 5, UL_2x996_TONE_RU, "UL-2x996-TONE-RU"); PFLAG(MAC, 5, OM_CTRL_UL_MU_DATA_DIS_RX, "OM-CTRL-UL-MU-DATA-DIS-RX"); PFLAG(MAC, 5, HE_DYNAMIC_SM_PS, "HE-DYNAMIC-SM-PS"); @@ -832,8 +832,8 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, PFLAG(PHY, 3, DCM_MAX_RX_NSS_1, "DCM-MAX-RX-NSS-1"); PFLAG(PHY, 3, DCM_MAX_RX_NSS_2, "DCM-MAX-RX-NSS-2"); - PFLAG(PHY, 3, RX_HE_MU_PPDU_FROM_NON_AP_STA, - "RX-HE-MU-PPDU-FROM-NON-AP-STA"); + PFLAG(PHY, 3, RX_PARTIAL_BW_SU_IN_20MHZ_MU, + "RX-PARTIAL-BW-SU-IN-20MHZ-MU"); PFLAG(PHY, 3, SU_BEAMFORMER, "SU-BEAMFORMER"); PFLAG(PHY, 4, SU_BEAMFORMEE, "SU-BEAMFORMEE"); @@ -853,16 +853,17 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, PFLAG(PHY, 6, CODEBOOK_SIZE_42_SU, "CODEBOOK-SIZE-42-SU"); PFLAG(PHY, 6, CODEBOOK_SIZE_75_MU, "CODEBOOK-SIZE-75-MU"); - PFLAG(PHY, 6, TRIG_SU_BEAMFORMER_FB, "TRIG-SU-BEAMFORMER-FB"); - PFLAG(PHY, 6, TRIG_MU_BEAMFORMER_FB, "TRIG-MU-BEAMFORMER-FB"); + PFLAG(PHY, 6, TRIG_SU_BEAMFORMING_FB, 
"TRIG-SU-BEAMFORMING-FB"); + PFLAG(PHY, 6, TRIG_MU_BEAMFORMING_PARTIAL_BW_FB, + "MU-BEAMFORMING-PARTIAL-BW-FB"); PFLAG(PHY, 6, TRIG_CQI_FB, "TRIG-CQI-FB"); PFLAG(PHY, 6, PARTIAL_BW_EXT_RANGE, "PARTIAL-BW-EXT-RANGE"); PFLAG(PHY, 6, PARTIAL_BANDWIDTH_DL_MUMIMO, "PARTIAL-BANDWIDTH-DL-MUMIMO"); PFLAG(PHY, 6, PPE_THRESHOLD_PRESENT, "PPE-THRESHOLD-PRESENT"); - PFLAG(PHY, 7, SRP_BASED_SR, "SRP-BASED-SR"); - PFLAG(PHY, 7, POWER_BOOST_FACTOR_AR, "POWER-BOOST-FACTOR-AR"); + PFLAG(PHY, 7, PSR_BASED_SR, "PSR-BASED-SR"); + PFLAG(PHY, 7, POWER_BOOST_FACTOR_SUPP, "POWER-BOOST-FACTOR-SUPP"); PFLAG(PHY, 7, HE_SU_MU_PPDU_4XLTF_AND_08_US_GI, "HE-SU-MU-PPDU-4XLTF-AND-08-US-GI"); PFLAG_RANGE(PHY, 7, MAX_NC, 0, 1, 1, "MAX-NC-%d"); -- cgit v1.2.3 From 1f851b8dfd76a0e91560247802dd25a4754753c7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 9 Apr 2021 12:40:20 +0300 Subject: wireless: align HE capabilities A-MPDU Length Exponent Extension The A-MPDU length exponent extension is defined differently in 802.11ax D6.1, align with that. 
Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210409123755.c2a257d3e2df.I3455245d388c52c61dace7e7958dbed7e807cfb6@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 5 ++--- drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 4 ++-- drivers/net/wireless/mac80211_hwsim.c | 8 ++++---- drivers/net/wireless/mediatek/mt76/mt7915/init.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7921/main.c | 2 +- include/linux/ieee80211.h | 10 ++++------ net/mac80211/debugfs_sta.c | 16 ++++++++-------- 7 files changed, 22 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 343768afedc4..a44ca32e9f72 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1265,9 +1265,8 @@ static void ath11k_peer_assoc_h_he(struct ath11k *ar, * request, then use MAX_AMPDU_LEN_FACTOR as 16 to calculate max_ampdu * length. */ - ampdu_factor = (he_cap->he_cap_elem.mac_cap_info[3] & - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) >> - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_SHIFT; + ampdu_factor = u8_get_bits(he_cap->he_cap_elem.mac_cap_info[3], + IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK); if (ampdu_factor) { if (sta->vht_cap.vht_supported) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index 632f20d4027d..d2058cdcb0d8 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -550,7 +550,7 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = { IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2, + IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU | IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39, @@ -636,7 +636,7 @@ static 
const struct ieee80211_sband_iftype_data iwl_he_capa[] = { IEEE80211_HE_MAC_CAP2_BSR, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2, + IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, .mac_cap_info[5] = diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index d56d2095a0d4..9630324c535e 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2795,7 +2795,7 @@ static const struct ieee80211_sband_iftype_data he_capa_2ghz[] = { IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2, + IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | @@ -2839,7 +2839,7 @@ static const struct ieee80211_sband_iftype_data he_capa_2ghz[] = { IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2, + IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | @@ -2885,7 +2885,7 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz[] = { IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2, + IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | @@ -2933,7 +2933,7 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz[] = { IEEE80211_HE_MAC_CAP2_ACK_EN, .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2, + 
IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c index d9a0587f234b..82b9e15dc6e3 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c @@ -476,7 +476,7 @@ mt7915_init_he_caps(struct mt7915_phy *phy, enum nl80211_band band, IEEE80211_HE_MAC_CAP0_HTC_HE; he_cap_elem->mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED; + IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3; he_cap_elem->mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU; diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index fdd93926b516..23149fcdf413 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -65,7 +65,7 @@ mt7921_init_he_caps(struct mt7921_phy *phy, enum nl80211_band band, IEEE80211_HE_MAC_CAP0_HTC_HE; he_cap_elem->mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | - IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED; + IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3; he_cap_elem->mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU; diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 687db25eb85f..c74033aca726 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2006,17 +2006,15 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the * same field in the HE capabilities. 
*/ -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT 0x00 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1 0x08 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2 0x10 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED 0x18 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0 0x00 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1 0x08 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2 0x10 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3 0x18 #define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK 0x18 #define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG 0x20 #define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 #define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_SHIFT 3 - #define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 #define IEEE80211_HE_MAC_CAP4_QTP 0x02 #define IEEE80211_HE_MAC_CAP4_BQR 0x04 diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index d350224d45e8..25b3d4822aed 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -711,17 +711,17 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, PFLAG(MAC, 3, OFDMA_RA, "OFDMA-RA"); switch (cap[3] & IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK) { - case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_USE_VHT: - PRINT("MAX-AMPDU-LEN-EXP-USE-VHT"); + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0: + PRINT("MAX-AMPDU-LEN-EXP-USE-EXT-0"); break; - case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_1: - PRINT("MAX-AMPDU-LEN-EXP-VHT-1"); + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1: + PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-1"); break; - case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_VHT_2: - PRINT("MAX-AMPDU-LEN-EXP-VHT-2"); + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2: + PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-2"); break; - case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_RESERVED: - PRINT("MAX-AMPDU-LEN-EXP-RESERVED"); + case IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3: + 
PRINT("MAX-AMPDU-LEN-EXP-VHT-EXT-3"); break; } -- cgit v1.2.3 From 2f5164447cdab6419edddde3a214f93a53aa4e60 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 9 Apr 2021 12:40:24 +0300 Subject: wireless: fix spelling of A-MSDU in HE capabilities In the HE capabilities, spell A-MSDU correctly, not "A-MDSU". Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210409123755.9e6ff1af1181.If6868bc6902ccd9a95c74c78f716c4b41473ef14@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 4 ++-- drivers/net/wireless/mac80211_hwsim.c | 8 ++++---- drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7915/init.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7921/main.c | 2 +- include/linux/ieee80211.h | 2 +- net/mac80211/debugfs_sta.c | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index d2058cdcb0d8..4e7da2e32354 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -552,7 +552,7 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = { IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2, .mac_cap_info[4] = - IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU | + IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU | IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39, .mac_cap_info[5] = IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40 | @@ -638,7 +638,7 @@ static const struct ieee80211_sband_iftype_data iwl_he_capa[] = { IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2, .mac_cap_info[4] = - IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, + IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .mac_cap_info[5] = IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU, .phy_cap_info[0] = diff --git 
a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 9630324c535e..51ce767eaf88 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -2796,7 +2796,7 @@ static const struct ieee80211_sband_iftype_data he_capa_2ghz[] = { .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, - .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, + .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | @@ -2840,7 +2840,7 @@ static const struct ieee80211_sband_iftype_data he_capa_2ghz[] = { .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, - .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, + .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[1] = IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | @@ -2886,7 +2886,7 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz[] = { .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, - .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, + .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | @@ -2934,7 +2934,7 @@ static const struct ieee80211_sband_iftype_data he_capa_5ghz[] = { .mac_cap_info[3] = IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, - .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU, + .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, .phy_cap_info[0] = IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c 
b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c index 6cbccfb05f8b..e7c23f9f0ea5 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt76_connac_mcu.c @@ -496,7 +496,7 @@ mt76_connac_mcu_sta_he_tlv(struct sk_buff *skb, struct ieee80211_sta *sta) if (elem->mac_cap_info[3] & IEEE80211_HE_MAC_CAP3_OMI_CONTROL) cap |= STA_REC_HE_CAP_OM; - if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU) + if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU) cap |= STA_REC_HE_CAP_AMSDU_IN_AMPDU; if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_BQR) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/init.c b/drivers/net/wireless/mediatek/mt76/mt7915/init.c index 82b9e15dc6e3..152ac7192163 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/init.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/init.c @@ -478,7 +478,7 @@ mt7915_init_he_caps(struct mt7915_phy *phy, enum nl80211_band band, IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3; he_cap_elem->mac_cap_info[4] = - IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU; + IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU; if (band == NL80211_BAND_2GHZ) he_cap_elem->phy_cap_info[0] = diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 97ef0265c516..c44091754a98 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -1330,7 +1330,7 @@ mt7915_mcu_sta_he_tlv(struct sk_buff *skb, struct ieee80211_sta *sta) if (elem->mac_cap_info[3] & IEEE80211_HE_MAC_CAP3_OMI_CONTROL) cap |= STA_REC_HE_CAP_OM; - if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU) + if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU) cap |= STA_REC_HE_CAP_AMSDU_IN_AMPDU; if (elem->mac_cap_info[4] & IEEE80211_HE_MAC_CAP4_BQR) diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c 
b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 23149fcdf413..07141e98a077 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -67,7 +67,7 @@ mt7921_init_he_caps(struct mt7921_phy *phy, enum nl80211_band band, IEEE80211_HE_MAC_CAP3_OMI_CONTROL | IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3; he_cap_elem->mac_cap_info[4] = - IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU; + IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU; if (band == NL80211_BAND_2GHZ) he_cap_elem->phy_cap_info[0] = diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c74033aca726..2967437f1b11 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2021,7 +2021,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, #define IEEE80211_HE_MAC_CAP4_PSR_RESP 0x08 #define IEEE80211_HE_MAC_CAP4_NDP_FB_REP 0x10 #define IEEE80211_HE_MAC_CAP4_OPS 0x20 -#define IEEE80211_HE_MAC_CAP4_AMDSU_IN_AMPDU 0x40 +#define IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU 0x40 /* Multi TID agg TX is split between byte #4 and #5 * The value is a combination of B39,B40,B41 */ diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 25b3d4822aed..936c9dfa86c8 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -735,7 +735,7 @@ static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, PFLAG(MAC, 4, PSR_RESP, "PSR-RESP"); PFLAG(MAC, 4, NDP_FB_REP, "NDP-FB-REP"); PFLAG(MAC, 4, OPS, "OPS"); - PFLAG(MAC, 4, AMDSU_IN_AMPDU, "AMSDU-IN-AMPDU"); + PFLAG(MAC, 4, AMSDU_IN_AMPDU, "AMSDU-IN-AMPDU"); PRINT("MULTI-TID-AGG-TX-QOS-%d", ((cap[5] << 1) | (cap[4] >> 7)) & 0x7); -- cgit v1.2.3 From d9c9e4db186ab4d81f84e6f22b225d333b9424e3 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:38 +0200 Subject: bpf: Factorize bpf_trace_printk and bpf_seq_printf Two helpers (trace_printk and seq_printf) have very similar implementations of format string parsing and a third one is 
coming (snprintf). To avoid code duplication and make the code easier to maintain, this moves the operations associated with format string parsing (validation and argument sanitization) into one generic function. The implementation of the two existing helpers already drifted quite a bit so unifying them entailed a lot of changes: - bpf_trace_printk always expected fmt[fmt_size] to be the terminating NUL character, this is no longer true, the first 0 is terminating. - bpf_trace_printk now supports %% (which produces the percentage char). - bpf_trace_printk now skips width formatting fields. - bpf_trace_printk now supports the X modifier (capital hexadecimal). - bpf_trace_printk now supports %pK, %px, %pB, %pi4, %pI4, %pi6 and %pI6 - argument casting on 32 bit has been simplified into one macro and using an enum instead of obscure int increments. - bpf_seq_printf now uses bpf_trace_copy_string instead of strncpy_from_kernel_nofault and handles the %pks %pus specifiers. - bpf_seq_printf now prints longs correctly on 32 bit architectures. - both were changed to use a global per-cpu tmp buffer instead of one stack buffer for trace_printk and 6 small buffers for seq_printf. - to avoid per-cpu buffer usage conflict, these helpers disable preemption while the per-cpu buffer is in use. - both helpers now support the %ps and %pS specifiers to print symbols. The implementation is also moved from bpf_trace.c to helpers.c because the upcoming bpf_snprintf helper will be made available to all BPF programs and will need it. 
Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210419155243.1632274-2-revest@chromium.org --- include/linux/bpf.h | 20 +++ kernel/bpf/helpers.c | 256 ++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 371 +++++------------------------------------------ 3 files changed, 313 insertions(+), 334 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ff8cd68c01b3..77d1d8c65b81 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2077,4 +2077,24 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); +enum bpf_printf_mod_type { + BPF_PRINTF_INT, + BPF_PRINTF_LONG, + BPF_PRINTF_LONG_LONG, +}; + +/* Workaround for getting va_list handling working with different argument type + * combinations generically for 32 and 64 bit archs. + */ +#define BPF_CAST_FMT_ARG(arg_nb, args, mod) \ + (mod[arg_nb] == BPF_PRINTF_LONG_LONG || \ + (mod[arg_nb] == BPF_PRINTF_LONG && __BITS_PER_LONG == 64) \ + ? 
(u64)args[arg_nb] \ + : (u32)args[arg_nb]) + +int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u64 *final_args, enum bpf_printf_mod_type *mod, + u32 num_args); +void bpf_printf_cleanup(void); + #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f306611c4ddf..9ca57eb1fc0d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -669,6 +669,262 @@ const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; +static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, + size_t bufsz) +{ + void __user *user_ptr = (__force void __user *)unsafe_ptr; + + buf[0] = 0; + + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)unsafe_ptr < TASK_SIZE) + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + fallthrough; +#endif + case 'k': + return strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); + case 'u': + return strncpy_from_user_nofault(buf, user_ptr, bufsz); + } + + return -EINVAL; +} + +/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p + */ +#define MAX_PRINTF_BUF_LEN 512 + +struct bpf_printf_buf { + char tmp_buf[MAX_PRINTF_BUF_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf); +static DEFINE_PER_CPU(int, bpf_printf_buf_used); + +static int try_get_fmt_tmp_buf(char **tmp_buf) +{ + struct bpf_printf_buf *bufs; + int used; + + if (*tmp_buf) + return 0; + + preempt_disable(); + used = this_cpu_inc_return(bpf_printf_buf_used); + if (WARN_ON_ONCE(used > 1)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + return -EBUSY; + } + bufs = this_cpu_ptr(&bpf_printf_buf); + *tmp_buf = bufs->tmp_buf; + + return 0; +} + +void bpf_printf_cleanup(void) +{ + if (this_cpu_read(bpf_printf_buf_used)) { + this_cpu_dec(bpf_printf_buf_used); + preempt_enable(); + } +} + +/* + * bpf_parse_fmt_str - Generic pass on format strings for printf-like helpers + 
* + * Returns a negative value if fmt is an invalid format string or 0 otherwise. + * + * This can be used in two ways: + * - Format string verification only: when final_args and mod are NULL + * - Arguments preparation: in addition to the above verification, it writes in + * final_args a copy of raw_args where pointers from BPF have been sanitized + * into pointers safe to use by snprintf. This also writes in the mod array + * the size requirement of each argument, usable by BPF_CAST_FMT_ARG for ex. + * + * In argument preparation mode, if 0 is returned, safe temporary buffers are + * allocated and bpf_printf_cleanup should be called to free them after use. + */ +int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u64 *final_args, enum bpf_printf_mod_type *mod, + u32 num_args) +{ + char *unsafe_ptr = NULL, *tmp_buf = NULL, *fmt_end; + size_t tmp_buf_len = MAX_PRINTF_BUF_LEN; + int err, i, num_spec = 0, copy_size; + enum bpf_printf_mod_type cur_mod; + u64 cur_arg; + char fmt_ptype; + + if (!!final_args != !!mod) + return -EINVAL; + + fmt_end = strnchr(fmt, fmt_size, 0); + if (!fmt_end) + return -EINVAL; + fmt_size = fmt_end - fmt; + + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto cleanup; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (num_spec >= num_args) { + err = -EINVAL; + goto cleanup; + } + + /* The string is zero-terminated so if fmt[i] != 0, we can + * always access fmt[i + 1], in the worst case it will be a 0 + */ + i++; + + /* skip optional "[0 +-][num]" width formatting field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 'p') { + cur_mod = BPF_PRINTF_LONG; + + if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') && + fmt[i + 2] == 's') { + fmt_ptype = fmt[i + 
1]; + i += 2; + goto fmt_str; + } + + if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || + ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || + fmt[i + 1] == 'x' || fmt[i + 1] == 'B' || + fmt[i + 1] == 's' || fmt[i + 1] == 'S') { + /* just kernel pointers */ + if (final_args) + cur_arg = raw_args[num_spec]; + goto fmt_next; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || + (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { + err = -EINVAL; + goto cleanup; + } + + i += 2; + if (!final_args) + goto fmt_next; + + if (try_get_fmt_tmp_buf(&tmp_buf)) { + err = -EBUSY; + goto out; + } + + copy_size = (fmt[i + 2] == '4') ? 4 : 16; + if (tmp_buf_len < copy_size) { + err = -ENOSPC; + goto cleanup; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = copy_from_kernel_nofault(tmp_buf, unsafe_ptr, + copy_size); + if (err < 0) + memset(tmp_buf, 0, copy_size); + cur_arg = (u64)(long)tmp_buf; + tmp_buf += copy_size; + tmp_buf_len -= copy_size; + + goto fmt_next; + } else if (fmt[i] == 's') { + cur_mod = BPF_PRINTF_LONG; + fmt_ptype = fmt[i]; +fmt_str: + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) { + err = -EINVAL; + goto cleanup; + } + + if (!final_args) + goto fmt_next; + + if (try_get_fmt_tmp_buf(&tmp_buf)) { + err = -EBUSY; + goto out; + } + + if (!tmp_buf_len) { + err = -ENOSPC; + goto cleanup; + } + + unsafe_ptr = (char *)(long)raw_args[num_spec]; + err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, + fmt_ptype, tmp_buf_len); + if (err < 0) { + tmp_buf[0] = '\0'; + err = 1; + } + + cur_arg = (u64)(long)tmp_buf; + tmp_buf += err; + tmp_buf_len -= err; + + goto fmt_next; + } + + cur_mod = BPF_PRINTF_INT; + + if (fmt[i] == 'l') { + cur_mod = BPF_PRINTF_LONG; + i++; + } + if (fmt[i] == 'l') { + cur_mod = BPF_PRINTF_LONG_LONG; + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && + fmt[i] != 'x' && fmt[i] != 'X') { + err = -EINVAL; + goto cleanup; + } + + if (final_args) + cur_arg = 
raw_args[num_spec]; +fmt_next: + if (final_args) { + mod[num_spec] = cur_mod; + final_args[num_spec] = cur_arg; + } + num_spec++; + } + + err = 0; +cleanup: + if (err) + bpf_printf_cleanup(); +out: + return err; +} + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0d23755c2747..a13f8644b357 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -372,188 +372,38 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) return &bpf_probe_write_user_proto; } -static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, - size_t bufsz) -{ - void __user *user_ptr = (__force void __user *)unsafe_ptr; - - buf[0] = 0; - - switch (fmt_ptype) { - case 's': -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)unsafe_ptr < TASK_SIZE) { - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } - fallthrough; -#endif - case 'k': - strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz); - break; - case 'u': - strncpy_from_user_nofault(buf, user_ptr, bufsz); - break; - } -} - static DEFINE_RAW_SPINLOCK(trace_printk_lock); -#define BPF_TRACE_PRINTK_SIZE 1024 +#define MAX_TRACE_PRINTK_VARARGS 3 +#define BPF_TRACE_PRINTK_SIZE 1024 -static __printf(1, 0) int bpf_do_trace_printk(const char *fmt, ...) 
+BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, + u64, arg2, u64, arg3) { + u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; + enum bpf_printf_mod_type mod[MAX_TRACE_PRINTK_VARARGS]; static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; - va_list ap; int ret; - raw_spin_lock_irqsave(&trace_printk_lock, flags); - va_start(ap, fmt); - ret = vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); - /* vsnprintf() will not append null for zero-length strings */ + ret = bpf_printf_prepare(fmt, fmt_size, args, args, mod, + MAX_TRACE_PRINTK_VARARGS); + if (ret < 0) + return ret; + + ret = snprintf(buf, sizeof(buf), fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod)); + /* snprintf() will not append null for zero-length strings */ if (ret == 0) buf[0] = '\0'; + + raw_spin_lock_irqsave(&trace_printk_lock, flags); trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); - return ret; -} - -/* - * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pB %pks %pus %s - */ -BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, - u64, arg2, u64, arg3) -{ - int i, mod[3] = {}, fmt_cnt = 0; - char buf[64], fmt_ptype; - void *unsafe_ptr = NULL; - bool str_seen = false; + bpf_printf_cleanup(); - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - return -EINVAL; - - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) - return -EINVAL; - - if (fmt[i] != '%') - continue; - - if (fmt_cnt >= 3) - return -EINVAL; - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } else if (fmt[i] 
== 'p') { - mod[fmt_cnt]++; - if ((fmt[i + 1] == 'k' || - fmt[i + 1] == 'u') && - fmt[i + 2] == 's') { - fmt_ptype = fmt[i + 1]; - i += 2; - goto fmt_str; - } - - if (fmt[i + 1] == 'B') { - i++; - goto fmt_next; - } - - /* disallow any further format extensions */ - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - goto fmt_next; - } else if (fmt[i] == 's') { - mod[fmt_cnt]++; - fmt_ptype = fmt[i]; -fmt_str: - if (str_seen) - /* allow only one '%s' per fmt string */ - return -EINVAL; - str_seen = true; - - if (fmt[i + 1] != 0 && - !isspace(fmt[i + 1]) && - !ispunct(fmt[i + 1])) - return -EINVAL; - - switch (fmt_cnt) { - case 0: - unsafe_ptr = (void *)(long)arg1; - arg1 = (long)buf; - break; - case 1: - unsafe_ptr = (void *)(long)arg2; - arg2 = (long)buf; - break; - case 2: - unsafe_ptr = (void *)(long)arg3; - arg3 = (long)buf; - break; - } - - bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype, - sizeof(buf)); - goto fmt_next; - } - - if (fmt[i] == 'l') { - mod[fmt_cnt]++; - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x') - return -EINVAL; -fmt_next: - fmt_cnt++; - } - -/* Horrid workaround for getting va_list handling working with different - * argument type combinations generically for 32 and 64 bit archs. - */ -#define __BPF_TP_EMIT() __BPF_ARG3_TP() -#define __BPF_TP(...) \ - bpf_do_trace_printk(fmt, ##__VA_ARGS__) - -#define __BPF_ARG1_TP(...) \ - ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_TP(arg1, ##__VA_ARGS__) \ - : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ - : __BPF_TP((u32)arg1, ##__VA_ARGS__))) - -#define __BPF_ARG2_TP(...) \ - ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ - : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ - ? 
__BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ - : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) - -#define __BPF_ARG3_TP(...) \ - ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ - ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ - : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ - ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ - : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) - - return __BPF_TP_EMIT(); + return ret; } static const struct bpf_func_proto bpf_trace_printk_proto = { @@ -581,184 +431,37 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) } #define MAX_SEQ_PRINTF_VARARGS 12 -#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 -#define MAX_SEQ_PRINTF_STR_LEN 128 - -struct bpf_seq_printf_buf { - char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; -}; -static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); -static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, data, u32, data_len) { - int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; - int i, buf_used, copy_size, num_args; - u64 params[MAX_SEQ_PRINTF_VARARGS]; - struct bpf_seq_printf_buf *bufs; - const u64 *args = data; - - buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); - if (WARN_ON_ONCE(buf_used > 1)) { - err = -EBUSY; - goto out; - } - - bufs = this_cpu_ptr(&bpf_seq_printf_buf); - - /* - * bpf_check()->check_func_arg()->check_stack_boundary() - * guarantees that fmt points to bpf program stack, - * fmt_size bytes of it were initialized and fmt_size > 0 - */ - if (fmt[--fmt_size] != 0) - goto out; - - if (data_len & 7) - goto out; - - for (i = 0; i < fmt_size; i++) { - if (fmt[i] == '%') { - if (fmt[i + 1] == '%') - i++; - else if (!data || !data_len) - goto out; - } - } + enum bpf_printf_mod_type mod[MAX_SEQ_PRINTF_VARARGS]; + u64 args[MAX_SEQ_PRINTF_VARARGS]; + int err, num_args; + if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; 
num_args = data_len / 8; - /* check format string for allowed specifiers */ - for (i = 0; i < fmt_size; i++) { - /* only printable ascii for now. */ - if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { - err = -EINVAL; - goto out; - } - - if (fmt[i] != '%') - continue; - - if (fmt[i + 1] == '%') { - i++; - continue; - } - - if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { - err = -E2BIG; - goto out; - } - - if (fmt_cnt >= num_args) { - err = -EINVAL; - goto out; - } - - /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ - i++; - - /* skip optional "[0 +-][num]" width formating field */ - while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || - fmt[i] == ' ') - i++; - if (fmt[i] >= '1' && fmt[i] <= '9') { - i++; - while (fmt[i] >= '0' && fmt[i] <= '9') - i++; - } - - if (fmt[i] == 's') { - void *unsafe_ptr; - - /* try our best to copy */ - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - unsafe_ptr = (void *)(long)args[fmt_cnt]; - err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt], - unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN); - if (err < 0) - bufs->buf[memcpy_cnt][0] = '\0'; - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'p') { - if (fmt[i + 1] == 0 || - fmt[i + 1] == 'K' || - fmt[i + 1] == 'x' || - fmt[i + 1] == 'B') { - /* just kernel pointers */ - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - continue; - } - - /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ - if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { - err = -EINVAL; - goto out; - } - if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { - err = -EINVAL; - goto out; - } - - if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { - err = -E2BIG; - goto out; - } - - - copy_size = (fmt[i + 2] == '4') ? 
4 : 16; - - err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt], - (void *) (long) args[fmt_cnt], - copy_size); - if (err < 0) - memset(bufs->buf[memcpy_cnt], 0, copy_size); - params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; - - i += 2; - fmt_cnt++; - memcpy_cnt++; - continue; - } - - if (fmt[i] == 'l') { - i++; - if (fmt[i] == 'l') - i++; - } - - if (fmt[i] != 'i' && fmt[i] != 'd' && - fmt[i] != 'u' && fmt[i] != 'x' && - fmt[i] != 'X') { - err = -EINVAL; - goto out; - } - - params[fmt_cnt] = args[fmt_cnt]; - fmt_cnt++; - } + err = bpf_printf_prepare(fmt, fmt_size, data, args, mod, num_args); + if (err < 0) + return err; /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give * all of them to seq_printf(). */ - seq_printf(m, fmt, params[0], params[1], params[2], params[3], - params[4], params[5], params[6], params[7], params[8], - params[9], params[10], params[11]); + seq_printf(m, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); - err = seq_has_overflowed(m) ? -EOVERFLOW : 0; -out: - this_cpu_dec(bpf_seq_printf_buf_used); - return err; + bpf_printf_cleanup(); + + return seq_has_overflowed(m) ? -EOVERFLOW : 0; } BTF_ID_LIST_SINGLE(btf_seq_file_ids, struct, seq_file) -- cgit v1.2.3 From fff13c4bb646ef849fd74ced87eef54340d28c21 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:39 +0200 Subject: bpf: Add a ARG_PTR_TO_CONST_STR argument type This type provides the guarantee that an argument is going to be a const pointer to somewhere in a read-only map value. It also checks that this pointer is followed by a zero character before the end of the map value. 
Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210419155243.1632274-3-revest@chromium.org --- include/linux/bpf.h | 1 + kernel/bpf/verifier.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 77d1d8c65b81..c160526fc8bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -309,6 +309,7 @@ enum bpf_arg_type { ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ + ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ __BPF_ARG_TYPE_MAX, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 852541a435ef..5f46dd6f3383 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4787,6 +4787,7 @@ static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALU static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; +static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, @@ -4817,6 +4818,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, [ARG_PTR_TO_FUNC] = &func_ptr_types, [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, + [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, }; static int check_reg_type(struct bpf_verifier_env *env, u32 regno, @@ -5067,6 +5069,45 @@ skip_type_check: if (err) return err; err = check_ptr_alignment(env, reg, 0, size, true); + } 
else if (arg_type == ARG_PTR_TO_CONST_STR) { + struct bpf_map *map = reg->map_ptr; + int map_off; + u64 map_addr; + char *str_ptr; + + if (reg->type != PTR_TO_MAP_VALUE || !map || + !bpf_map_is_rdonly(map)) { + verbose(env, "R%d does not point to a readonly map'\n", regno); + return -EACCES; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a constant address'\n", regno); + return -EACCES; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + return -EACCES; + } + + err = check_map_access(env, regno, reg->off, + map->value_size - reg->off, false); + if (err) + return err; + + map_off = reg->off + reg->var_off.value; + err = map->ops->map_direct_value_addr(map, &map_addr, map_off); + if (err) { + verbose(env, "direct value access on string failed\n"); + return err; + } + + str_ptr = (char *)(long)(map_addr); + if (!strnchr(str_ptr + map_off, map->value_size - map_off, 0)) { + verbose(env, "string is not zero-terminated\n"); + return -EINVAL; + } } return err; -- cgit v1.2.3 From 7b15523a989b63927c2bb08e9b5b0bbc10b58bef Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 19 Apr 2021 17:52:40 +0200 Subject: bpf: Add a bpf_snprintf helper The implementation takes inspiration from the existing bpf_trace_printk helper but there are a few differences: To allow for a large number of format-specifiers, parameters are provided in an array, like in bpf_seq_printf. Because the output string takes two arguments and the array of parameters also takes two arguments, the format string needs to fit in one argument. Thankfully, ARG_PTR_TO_CONST_STR is guaranteed to point to a zero-terminated read-only map so we don't need a format string length arg. Because the format-string is known at verification time, we also do a first pass of format string validation in the verifier logic. This makes debugging easier. 
Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210419155243.1632274-4-revest@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++ kernel/bpf/helpers.c | 50 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 41 ++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 28 +++++++++++++++++++++++ 6 files changed, 150 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c160526fc8bf..f8a45f109e96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1953,6 +1953,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_snprintf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df164a44bb41..ec6d85a81744 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4708,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. 
The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4875,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9ca57eb1fc0d..85b26ca5aacd 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -925,6 +925,54 @@ out: return err; } +#define MAX_SNPRINTF_VARARGS 12 + +BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, + const void *, data, u32, data_len) +{ + enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS]; + u64 args[MAX_SNPRINTF_VARARGS]; + int err, num_args; + + if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || + (data_len && !data)) + return -EINVAL; + num_args = data_len / 8; + + /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we + * can safely give an unbounded size. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args); + if (err < 0) + return err; + + /* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give + * all of them to snprintf(). 
+ */ + err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod), + BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), + BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), + BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), + BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), + BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), + BPF_CAST_FMT_ARG(11, args, mod)); + + bpf_printf_cleanup(); + + return err + 1; +} + +const struct bpf_func_proto bpf_snprintf_proto = { + .func = bpf_snprintf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_PTR_TO_CONST_STR, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + const struct bpf_func_proto bpf_get_current_task_proto __weak; const struct bpf_func_proto bpf_probe_read_user_proto __weak; const struct bpf_func_proto bpf_probe_read_user_str_proto __weak; @@ -1013,6 +1061,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5f46dd6f3383..994ef36c5f60 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5918,6 +5918,41 @@ static int check_reference_leak(struct bpf_verifier_env *env) return state->acquired_refs ? 
-EINVAL : 0; } +static int check_bpf_snprintf_call(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) +{ + struct bpf_reg_state *fmt_reg = &regs[BPF_REG_3]; + struct bpf_reg_state *data_len_reg = &regs[BPF_REG_5]; + struct bpf_map *fmt_map = fmt_reg->map_ptr; + int err, fmt_map_off, num_args; + u64 fmt_addr; + char *fmt; + + /* data must be an array of u64 */ + if (data_len_reg->var_off.value % 8) + return -EINVAL; + num_args = data_len_reg->var_off.value / 8; + + /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const + * and map_direct_value_addr is set. + */ + fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, + fmt_map_off); + if (err) + return err; + fmt = (char *)(long)fmt_addr + fmt_map_off; + + /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we + * can focus on validating the format specifiers. + */ + err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args); + if (err < 0) + verbose(env, "Invalid format string\n"); + + return err; +} + static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { @@ -6032,6 +6067,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_snprintf) { + err = check_bpf_snprintf_call(env, regs); + if (err < 0) + return err; + } + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a13f8644b357..2a8bcdc927c7 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1076,6 +1076,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_delete_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_snprintf: + return &bpf_snprintf_proto; default: return NULL; } diff --git
a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index df164a44bb41..ec6d85a81744 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4708,6 +4708,33 @@ union bpf_attr { * Return * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4875,6 +4902,7 @@ union bpf_attr { FN(sock_from_file), \ FN(check_mtu), \ FN(for_each_map_elem), \ + FN(snprintf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From c6400e3fc3fa821a26a58cf867331e0877a4c56b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 17 Apr 2021 14:38:07 +0300 Subject: netlink: simplify nl_set_extack_cookie_u64(), nl_set_extack_cookie_u32() Taking address of a function argument directly works just fine. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/netlink.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 0bcf98098c5a..61b1c7fcc401 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -129,23 +129,19 @@ struct netlink_ext_ack { static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { - u64 __cookie = cookie; - if (!extack) return; - memcpy(extack->cookie, &__cookie, sizeof(__cookie)); - extack->cookie_len = sizeof(__cookie); + memcpy(extack->cookie, &cookie, sizeof(cookie)); + extack->cookie_len = sizeof(cookie); } static inline void nl_set_extack_cookie_u32(struct netlink_ext_ack *extack, u32 cookie) { - u32 __cookie = cookie; - if (!extack) return; - memcpy(extack->cookie, &__cookie, sizeof(__cookie)); - extack->cookie_len = sizeof(__cookie); + memcpy(extack->cookie, &cookie, sizeof(cookie)); + extack->cookie_len = sizeof(cookie); } void netlink_kernel_release(struct sock *sk); -- cgit v1.2.3 From da702f34e3cc4b6b87ed2d63c17d65d841fa81c6 Mon Sep 17 00:00:00 2001 From: "Radu Pirea (NXP OSS)" Date: Mon, 19 Apr 2021 19:13:59 +0300 Subject: net: phy: add genphy_c45_pma_suspend/resume Add generic PMA suspend and resume callback functions for C45 PHYs. Signed-off-by: Radu Pirea (NXP OSS) Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/phy-c45.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/phy.h | 2 ++ 2 files changed, 45 insertions(+) (limited to 'include') diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 91e3acb9e397..f4816b7d31b3 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -8,6 +8,49 @@ #include #include +/** + * genphy_c45_pma_can_sleep - checks if the PMA have sleep support + * @phydev: target phy_device struct + */ +static bool genphy_c45_pma_can_sleep(struct phy_device *phydev) +{ + int stat1; + + stat1 = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_STAT1); + if (stat1 < 0) + return false; + + return !!(stat1 & MDIO_STAT1_LPOWERABLE); +} + +/** + * genphy_c45_pma_resume - wakes up the PMA module + * @phydev: target phy_device struct + */ +int genphy_c45_pma_resume(struct phy_device *phydev) +{ + if (!genphy_c45_pma_can_sleep(phydev)) + return -EOPNOTSUPP; + + return phy_clear_bits_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_CTRL1, + MDIO_CTRL1_LPOWER); +} +EXPORT_SYMBOL_GPL(genphy_c45_pma_resume); + +/** + * genphy_c45_pma_suspend - suspends the PMA module + * @phydev: target phy_device struct + */ +int genphy_c45_pma_suspend(struct phy_device *phydev) +{ + if (!genphy_c45_pma_can_sleep(phydev)) + return -EOPNOTSUPP; + + return phy_set_bits_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_CTRL1, + MDIO_CTRL1_LPOWER); +} +EXPORT_SYMBOL_GPL(genphy_c45_pma_suspend); + /** * genphy_c45_pma_setup_forced - configures a forced speed * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index 98fb441dd72e..e3d4d583463b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1535,6 +1535,8 @@ int genphy_c45_pma_read_abilities(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); int genphy_c45_loopback(struct phy_device *phydev, bool enable); +int genphy_c45_pma_resume(struct phy_device *phydev); +int 
genphy_c45_pma_suspend(struct phy_device *phydev); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; -- cgit v1.2.3 From 6980ffa0c5a8e65d53ff803d2cafdba3e2022714 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 21 Jun 2020 21:35:34 +0300 Subject: net/mlx5e: RX, Add checks for calculated Striding RQ attributes Striding RQ attributes below are mutually dependent. An unaware change to one might take the others out of the valid range derived by the HW caps: - The MPWQE size in bytes - The number of strides in a MPWQE - The stride size Add checks to verify they are valid and comply to the HW spec and SW assumptions/requirements. This is not a fix, no particular issue exists today. Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/params.c | 86 ++++++++++++++-------- .../net/ethernet/mellanox/mlx5/core/en/params.h | 20 ++--- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 +- include/linux/mlx5/device.h | 7 +- 4 files changed, 76 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 69f1f41b2b83..f410c1268422 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -90,30 +90,39 @@ bool mlx5e_rx_is_linear_skb(struct mlx5e_params *params, return !params->lro_en && linear_frag_sz <= PAGE_SIZE; } -#define MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ ((BIT(__mlx5_bit_sz(wq, log_wqe_stride_size)) - 1) + \ - MLX5_MPWQE_LOG_STRIDE_SZ_BASE) -bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev, - struct mlx5e_params *params, - struct mlx5e_xsk_param *xsk) +bool mlx5e_verify_rx_mpwqe_strides(struct mlx5_core_dev *mdev, + u8 log_stride_sz, u8 log_num_strides) { - u32 linear_frag_sz = mlx5e_rx_get_linear_frag_sz(params, xsk); - s8 signed_log_num_strides_param; - u8 log_num_strides; + if (log_stride_sz + 
log_num_strides != MLX5_MPWRQ_LOG_WQE_SZ) + return false; - if (!mlx5e_rx_is_linear_skb(params, xsk)) + if (log_stride_sz < MLX5_MPWQE_LOG_STRIDE_SZ_BASE || + log_stride_sz > MLX5_MPWQE_LOG_STRIDE_SZ_MAX) return false; - if (order_base_2(linear_frag_sz) > MLX5_MAX_MPWQE_LOG_WQE_STRIDE_SZ) + if (log_num_strides > MLX5_MPWQE_LOG_NUM_STRIDES_MAX) return false; if (MLX5_CAP_GEN(mdev, ext_stride_num_range)) - return true; + return log_num_strides >= MLX5_MPWQE_LOG_NUM_STRIDES_EXT_BASE; - log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - order_base_2(linear_frag_sz); - signed_log_num_strides_param = - (s8)log_num_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE; + return log_num_strides >= MLX5_MPWQE_LOG_NUM_STRIDES_BASE; +} - return signed_log_num_strides_param >= 0; +bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk) +{ + s8 log_num_strides; + u8 log_stride_sz; + + if (!mlx5e_rx_is_linear_skb(params, xsk)) + return false; + + log_stride_sz = order_base_2(mlx5e_rx_get_linear_frag_sz(params, xsk)); + log_num_strides = MLX5_MPWRQ_LOG_WQE_SZ - log_stride_sz; + + return mlx5e_verify_rx_mpwqe_strides(mdev, log_stride_sz, log_num_strides); } u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5e_params *params, @@ -462,26 +471,36 @@ static void mlx5e_build_rx_cq_param(struct mlx5_core_dev *mdev, param->cq_period_mode = params->rx_cq_moderation.cq_period_mode; } -void mlx5e_build_rq_param(struct mlx5_core_dev *mdev, - struct mlx5e_params *params, - struct mlx5e_xsk_param *xsk, - u16 q_counter, - struct mlx5e_rq_param *param) +int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + u16 q_counter, + struct mlx5e_rq_param *param) { void *rqc = param->rqc; void *wq = MLX5_ADDR_OF(rqc, rqc, wq); int ndsegs = 1; switch (params->rq_wq_type) { - case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: { + u8 log_wqe_num_of_strides = 
mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk); + u8 log_wqe_stride_size = mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk); + + if (!mlx5e_verify_rx_mpwqe_strides(mdev, log_wqe_stride_size, + log_wqe_num_of_strides)) { + mlx5_core_err(mdev, + "Bad RX MPWQE params: log_stride_size %u, log_num_strides %u\n", + log_wqe_stride_size, log_wqe_num_of_strides); + return -EINVAL; + } + MLX5_SET(wq, wq, log_wqe_num_of_strides, - mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk) - - MLX5_MPWQE_LOG_NUM_STRIDES_BASE); + log_wqe_num_of_strides - MLX5_MPWQE_LOG_NUM_STRIDES_BASE); MLX5_SET(wq, wq, log_wqe_stride_size, - mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk) - - MLX5_MPWQE_LOG_STRIDE_SZ_BASE); + log_wqe_stride_size - MLX5_MPWQE_LOG_STRIDE_SZ_BASE); MLX5_SET(wq, wq, log_wq_sz, mlx5e_mpwqe_get_log_rq_size(params, xsk)); break; + } default: /* MLX5_WQ_TYPE_CYCLIC */ MLX5_SET(wq, wq, log_wq_sz, params->log_rq_mtu_frames); mlx5e_build_rq_frags_info(mdev, params, xsk, &param->frags_info); @@ -499,6 +518,8 @@ void mlx5e_build_rq_param(struct mlx5_core_dev *mdev, param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); mlx5e_build_rx_cq_param(mdev, params, xsk, &param->cqp); + + return 0; } void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev, @@ -643,14 +664,17 @@ void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev, mlx5e_build_tx_cq_param(mdev, params, &param->cqp); } -void mlx5e_build_channel_param(struct mlx5_core_dev *mdev, - struct mlx5e_params *params, - u16 q_counter, - struct mlx5e_channel_param *cparam) +int mlx5e_build_channel_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + u16 q_counter, + struct mlx5e_channel_param *cparam) { u8 icosq_log_wq_sz, async_icosq_log_wq_sz; + int err; - mlx5e_build_rq_param(mdev, params, NULL, q_counter, &cparam->rq); + err = mlx5e_build_rq_param(mdev, params, NULL, q_counter, &cparam->rq); + if (err) + return err; icosq_log_wq_sz = mlx5e_build_icosq_log_wq_sz(params, &cparam->rq); async_icosq_log_wq_sz =
mlx5e_build_async_icosq_log_wq_sz(mdev); @@ -659,4 +683,6 @@ void mlx5e_build_channel_param(struct mlx5_core_dev *mdev, mlx5e_build_xdpsq_param(mdev, params, &cparam->xdp_sq); mlx5e_build_icosq_param(mdev, icosq_log_wq_sz, &cparam->icosq); mlx5e_build_async_icosq_param(mdev, async_icosq_log_wq_sz, &cparam->async_icosq); + + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h index fcc51ec6084e..e9593f5f0661 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -96,6 +96,8 @@ void mlx5e_build_rq_params(struct mlx5_core_dev *mdev, struct mlx5e_params *para void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params); void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); +bool mlx5e_verify_rx_mpwqe_strides(struct mlx5_core_dev *mdev, + u8 log_stride_sz, u8 log_num_strides); u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk); u32 mlx5e_rx_get_min_frag_sz(struct mlx5e_params *params, @@ -122,11 +124,11 @@ u16 mlx5e_get_rq_headroom(struct mlx5_core_dev *mdev, /* Build queue parameters */ void mlx5e_build_create_cq_param(struct mlx5e_create_cq_param *ccp, struct mlx5e_channel *c); -void mlx5e_build_rq_param(struct mlx5_core_dev *mdev, - struct mlx5e_params *params, - struct mlx5e_xsk_param *xsk, - u16 q_counter, - struct mlx5e_rq_param *param); +int mlx5e_build_rq_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + struct mlx5e_xsk_param *xsk, + u16 q_counter, + struct mlx5e_rq_param *param); void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev, u16 q_counter, struct mlx5e_rq_param *param); @@ -141,10 +143,10 @@ void mlx5e_build_tx_cq_param(struct mlx5_core_dev *mdev, void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_sq_param *param); -void 
mlx5e_build_channel_param(struct mlx5_core_dev *mdev, - struct mlx5e_params *params, - u16 q_counter, - struct mlx5e_channel_param *cparam); +int mlx5e_build_channel_param(struct mlx5_core_dev *mdev, + struct mlx5e_params *params, + u16 q_counter, + struct mlx5e_channel_param *cparam); u16 mlx5e_calc_sq_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *params); int mlx5e_validate_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index bc2d37d2806f..bca832cdc4cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2086,7 +2086,10 @@ int mlx5e_open_channels(struct mlx5e_priv *priv, if (!chs->c || !cparam) goto err_free; - mlx5e_build_channel_param(priv->mdev, &chs->params, priv->q_counter, cparam); + err = mlx5e_build_channel_param(priv->mdev, &chs->params, priv->q_counter, cparam); + if (err) + goto err_free; + for (i = 0; i < chs->num; i++) { struct xsk_buff_pool *xsk_pool = NULL; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 92a029a800a0..578c4ccae91c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -911,8 +911,11 @@ static inline u16 get_cqe_flow_tag(struct mlx5_cqe64 *cqe) return be32_to_cpu(cqe->sop_drop_qpn) & 0xFFF; } -#define MLX5_MPWQE_LOG_NUM_STRIDES_BASE (9) -#define MLX5_MPWQE_LOG_STRIDE_SZ_BASE (6) +#define MLX5_MPWQE_LOG_NUM_STRIDES_EXT_BASE 3 +#define MLX5_MPWQE_LOG_NUM_STRIDES_BASE 9 +#define MLX5_MPWQE_LOG_NUM_STRIDES_MAX 16 +#define MLX5_MPWQE_LOG_STRIDE_SZ_BASE 6 +#define MLX5_MPWQE_LOG_STRIDE_SZ_MAX 13 struct mpwrq_cqe_bc { __be16 filler_consumed_strides; -- cgit v1.2.3 From 704cfecdd03d7b84403ed96ba0009ea07270e74e Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 28 Feb 2021 23:48:27 +0200 Subject: net/mlx5: mlx5_ifc updates for flex parser Added the required definitions 
for supporting more protocols by flex parsers (GTP-U, Geneve TLV options), and for using the right flex parser that was configured for this protocol. Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f2c51d6833c6..aa6effe1dd6d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -622,7 +622,19 @@ struct mlx5_ifc_fte_match_set_misc3_bits { u8 geneve_tlv_option_0_data[0x20]; - u8 reserved_at_140[0xc0]; + u8 gtpu_teid[0x20]; + + u8 gtpu_msg_type[0x8]; + u8 gtpu_msg_flags[0x8]; + u8 reserved_at_170[0x10]; + + u8 gtpu_dw_2[0x20]; + + u8 gtpu_first_ext_dw_0[0x20]; + + u8 gtpu_dw_0[0x20]; + + u8 reserved_at_1e0[0x20]; }; struct mlx5_ifc_fte_match_set_misc4_bits { @@ -1237,9 +1249,17 @@ enum { enum { MLX5_FLEX_PARSER_GENEVE_ENABLED = 1 << 3, + MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED = 1 << 4, + mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED = 1 << 5, MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED = 1 << 7, MLX5_FLEX_PARSER_ICMP_V4_ENABLED = 1 << 8, MLX5_FLEX_PARSER_ICMP_V6_ENABLED = 1 << 9, + MLX5_FLEX_PARSER_GENEVE_TLV_OPTION_0_ENABLED = 1 << 10, + MLX5_FLEX_PARSER_GTPU_ENABLED = 1 << 11, + MLX5_FLEX_PARSER_GTPU_DW_2_ENABLED = 1 << 16, + MLX5_FLEX_PARSER_GTPU_FIRST_EXT_DW_0_ENABLED = 1 << 17, + MLX5_FLEX_PARSER_GTPU_DW_0_ENABLED = 1 << 18, + MLX5_FLEX_PARSER_GTPU_TEID_ENABLED = 1 << 19, }; enum { @@ -1637,7 +1657,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 cqe_compression_timeout[0x10]; u8 cqe_compression_max_num[0x10]; - u8 reserved_at_5e0[0x10]; + u8 reserved_at_5e0[0x8]; + u8 flex_parser_id_gtpu_dw_0[0x4]; + u8 reserved_at_5ec[0x4]; u8 tag_matching[0x1]; u8 rndv_offload_rc[0x1]; u8 rndv_offload_dc[0x1]; @@ -1648,7 +1670,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 affiliate_nic_vport_criteria[0x8]; u8 native_port_num[0x8]; u8 
num_vhca_ports[0x8]; - u8 reserved_at_618[0x6]; + u8 flex_parser_id_gtpu_teid[0x4]; + u8 reserved_at_61c[0x2]; u8 sw_owner_id[0x1]; u8 reserved_at_61f[0x1]; @@ -1683,7 +1706,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_6e0[0x10]; u8 sf_base_id[0x10]; - u8 reserved_at_700[0x8]; + u8 flex_parser_id_gtpu_dw_2[0x4]; + u8 flex_parser_id_gtpu_first_ext_dw_0[0x4]; u8 num_total_dynamic_vf_msix[0x18]; u8 reserved_at_720[0x14]; u8 dynamic_msix_table_size[0xc]; -- cgit v1.2.3 From 7304d603a57a1edecfecfbcc26f85edcda4cae81 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 2 Nov 2020 23:57:13 +0200 Subject: net/mlx5: DR, Add support for force-loopback QP When supported by the device, SW steering RoCE RC QP that is used to write/read to/from ICM will be created with force-loopback attribute. Such QP doesn't require GID index upon creation. Signed-off-by: Erez Shitrit Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/steering/dr_cmd.c | 36 ++++++++++++++++++++++ .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 34 +++++++++++++++++--- .../mellanox/mlx5/core/steering/dr_types.h | 7 +++++ include/linux/mlx5/mlx5_ifc.h | 7 +++-- 4 files changed, 77 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c index 6f9d7aa9fb4c..68d898e144fb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c @@ -85,15 +85,51 @@ int mlx5dr_cmd_query_esw_caps(struct mlx5_core_dev *mdev, return 0; } +static int dr_cmd_query_nic_vport_roce_en(struct mlx5_core_dev *mdev, + u16 vport, bool *roce_en) +{ + u32 out[MLX5_ST_SZ_DW(query_nic_vport_context_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_nic_vport_context_in)] = {}; + int err; + + MLX5_SET(query_nic_vport_context_in, in, opcode, + MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT); + 
MLX5_SET(query_nic_vport_context_in, in, vport_number, vport); + MLX5_SET(query_nic_vport_context_in, in, other_vport, !!vport); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + *roce_en = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.roce_en); + return 0; +} + int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev, struct mlx5dr_cmd_caps *caps) { + bool roce_en; + int err; + caps->prio_tag_required = MLX5_CAP_GEN(mdev, prio_tag_required); caps->eswitch_manager = MLX5_CAP_GEN(mdev, eswitch_manager); caps->gvmi = MLX5_CAP_GEN(mdev, vhca_id); caps->flex_protocols = MLX5_CAP_GEN(mdev, flex_parser_protocols); caps->sw_format_ver = MLX5_CAP_GEN(mdev, steering_format_version); + if (MLX5_CAP_GEN(mdev, roce)) { + err = dr_cmd_query_nic_vport_roce_en(mdev, 0, &roce_en); + if (err) + return err; + + caps->roce_caps.roce_en = roce_en; + caps->roce_caps.fl_rc_qp_when_roce_disabled = + MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_disabled); + caps->roce_caps.fl_rc_qp_when_roce_enabled = + MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_enabled); + } + if (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED) { caps->flex_parser_id_icmp_dw0 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw0); caps->flex_parser_id_icmp_dw1 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw1); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 37377d668057..69d623bedefe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -32,6 +32,7 @@ struct dr_qp_rtr_attr { u8 min_rnr_timer; u8 sgid_index; u16 udp_src_port; + u8 fl:1; }; struct dr_qp_rts_attr { @@ -650,6 +651,7 @@ static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev, attr->udp_src_port); MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num); + MLX5_SET(qpc, qpc, primary_address_path.fl, attr->fl); 
MLX5_SET(qpc, qpc, min_rnr_nak, 1); MLX5_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP); @@ -658,6 +660,19 @@ static int dr_cmd_modify_qp_init2rtr(struct mlx5_core_dev *mdev, return mlx5_cmd_exec_in(mdev, init2rtr_qp, in); } +static bool dr_send_allow_fl(struct mlx5dr_cmd_caps *caps) +{ + /* Check whether RC RoCE QP creation with force loopback is allowed. + * There are two separate capability bits for this: + * - force loopback when RoCE is enabled + * - force loopback when RoCE is disabled + */ + return ((caps->roce_caps.roce_en && + caps->roce_caps.fl_rc_qp_when_roce_enabled) || + (!caps->roce_caps.roce_en && + caps->roce_caps.fl_rc_qp_when_roce_disabled)); +} + static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) { struct mlx5dr_qp *dr_qp = dmn->send_ring->qp; @@ -676,17 +691,26 @@ static int dr_prepare_qp_to_rts(struct mlx5dr_domain *dmn) } /* RTR */ - ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index, &rtr_attr.dgid_attr); - if (ret) - return ret; - rtr_attr.mtu = mtu; rtr_attr.qp_num = dr_qp->qpn; rtr_attr.min_rnr_timer = 12; rtr_attr.port_num = port; - rtr_attr.sgid_index = gid_index; rtr_attr.udp_src_port = dmn->info.caps.roce_min_src_udp; + /* If QP creation with force loopback is allowed, then there + * is no need for GID index when creating the QP. + * Otherwise we query GID attributes and use GID index. 
+ */ + rtr_attr.fl = dr_send_allow_fl(&dmn->info.caps); + if (!rtr_attr.fl) { + ret = mlx5dr_cmd_query_gid(dmn->mdev, port, gid_index, + &rtr_attr.dgid_attr); + if (ret) + return ret; + + rtr_attr.sgid_index = gid_index; + } + ret = dr_cmd_modify_qp_init2rtr(dmn->mdev, dr_qp, &rtr_attr); if (ret) { mlx5dr_err(dmn, "Failed modify QP init2rtr\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 7c1ab0b6417e..8de70566f85b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -747,6 +747,12 @@ struct mlx5dr_cmd_vport_cap { u32 num; }; +struct mlx5dr_roce_cap { + u8 roce_en:1; + u8 fl_rc_qp_when_roce_disabled:1; + u8 fl_rc_qp_when_roce_enabled:1; +}; + struct mlx5dr_cmd_caps { u16 gvmi; u64 nic_rx_drop_address; @@ -783,6 +789,7 @@ struct mlx5dr_cmd_caps { struct mlx5dr_esw_caps esw_caps; struct mlx5dr_cmd_vport_cap *vports_caps; bool prio_tag_required; + struct mlx5dr_roce_cap roce_caps; }; struct mlx5dr_domain_rx_tx { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index aa6effe1dd6d..4d9569c4b96c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -961,7 +961,9 @@ struct mlx5_ifc_roce_cap_bits { u8 roce_apm[0x1]; u8 reserved_at_1[0x3]; u8 sw_r_roce_src_udp_port[0x1]; - u8 reserved_at_5[0x19]; + u8 fl_rc_qp_when_roce_disabled[0x1]; + u8 fl_rc_qp_when_roce_enabled[0x1]; + u8 reserved_at_7[0x17]; u8 qp_ts_format[0x2]; u8 reserved_at_20[0x60]; @@ -2942,7 +2944,8 @@ struct mlx5_ifc_qpc_bits { u8 state[0x4]; u8 lag_tx_port_affinity[0x4]; u8 st[0x8]; - u8 reserved_at_10[0x3]; + u8 reserved_at_10[0x2]; + u8 isolate_vl_tc[0x1]; u8 pm_state[0x2]; u8 reserved_at_15[0x1]; u8 req_e2e_credit_mode[0x2]; -- cgit v1.2.3 From aeacb52a8de7046be5399ba311f49bce96e1b269 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 3 Nov 2020 01:31:53 +0200 
Subject: net/mlx5: DR, Add support for isolate_vl_tc QP When using SW steering, rule insertion rate depends on the RDMA RC QP performance used for writing to the ICM. During stress this QP is competing for the HW resources with all the other QPs that are used to send data. To protect SW steering QP's performance in such cases, we set this QP to use isolated VL. The VL number is reserved by FW and is not exposed to the driver. Support for this QP on isolated VL exists only when both force-loopback and isolate_vl_tc capabilities are set. Signed-off-by: Alex Vesker Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c | 7 +++++++ drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c index 68d898e144fb..5970cb8fc0c0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c @@ -130,6 +130,8 @@ int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev, MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_enabled); } + caps->isolate_vl_tc = MLX5_CAP_GEN(mdev, isolate_vl_tc_new); + if (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED) { caps->flex_parser_id_icmp_dw0 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw0); caps->flex_parser_id_icmp_dw1 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw1); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 69d623bedefe..12cf323a5943 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -46,6 +46,7 @@ struct
dr_qp_init_attr { u32 pdn; u32 max_send_wr; struct mlx5_uars_page *uar; + u8 isolate_vl_tc:1; }; static int dr_parse_cqe(struct mlx5dr_cq *dr_cq, struct mlx5_cqe64 *cqe64) @@ -158,6 +159,7 @@ static struct mlx5dr_qp *dr_create_rc_qp(struct mlx5_core_dev *mdev, qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, isolate_vl_tc, attr->isolate_vl_tc); MLX5_SET(qpc, qpc, pd, attr->pdn); MLX5_SET(qpc, qpc, uar_page, attr->uar->index); MLX5_SET(qpc, qpc, log_page_size, @@ -924,6 +926,11 @@ int mlx5dr_send_ring_alloc(struct mlx5dr_domain *dmn) init_attr.pdn = dmn->pdn; init_attr.uar = dmn->uar; init_attr.max_send_wr = QUEUE_SIZE; + + /* Isolated VL is applicable only if force loopback is supported */ + if (dr_send_allow_fl(&dmn->info.caps)) + init_attr.isolate_vl_tc = dmn->info.caps.isolate_vl_tc; + spin_lock_init(&dmn->send_ring->lock); dmn->send_ring->qp = dr_create_rc_qp(dmn->mdev, &init_attr); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index 8de70566f85b..67460c42a99b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -790,6 +790,7 @@ struct mlx5dr_cmd_caps { struct mlx5dr_cmd_vport_cap *vports_caps; bool prio_tag_required; struct mlx5dr_roce_cap roce_caps; + u8 isolate_vl_tc:1; }; struct mlx5dr_domain_rx_tx { @@ -1164,6 +1165,7 @@ struct mlx5dr_cmd_qp_create_attr { u32 sq_wqe_cnt; u32 rq_wqe_cnt; u32 rq_wqe_shift; + u8 isolate_vl_tc:1; }; int mlx5dr_cmd_query_gid(struct mlx5_core_dev *mdev, u8 vhca_port_num, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4d9569c4b96c..52b7cabcde08 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1319,7 +1319,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_srq_sz[0x8]; u8 
log_max_qp_sz[0x8]; u8 event_cap[0x1]; - u8 reserved_at_91[0x7]; + u8 reserved_at_91[0x2]; + u8 isolate_vl_tc_new[0x1]; + u8 reserved_at_94[0x4]; u8 prio_tag_required[0x1]; u8 reserved_at_99[0x2]; u8 log_max_qp[0x5]; -- cgit v1.2.3 From 014068dcb5b17dae110354c4de241833124edba1 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 19 Apr 2021 15:01:02 +0200 Subject: net: phy: genphy_loopback: add link speed configuration In case of loopback, in most cases we need to disable autoneg support and force some speed configuration. Otherwise, depending on currently active auto negotiated link speed, the loopback may or may not work. This patch was tested with following PHYs: TJA1102, KSZ8081, KSZ9031, AT8035, AR9331. Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 3 ++- drivers/net/phy/phy_device.c | 28 ++++++++++++++++++++++++++-- include/linux/phy.h | 1 + 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index fc2e7cb5b2e5..1f0512e39c65 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -701,7 +701,7 @@ out: } EXPORT_SYMBOL(phy_start_cable_test_tdr); -static int phy_config_aneg(struct phy_device *phydev) +int phy_config_aneg(struct phy_device *phydev) { if (phydev->drv->config_aneg) return phydev->drv->config_aneg(phydev); @@ -714,6 +714,7 @@ static int phy_config_aneg(struct phy_device *phydev) return genphy_config_aneg(phydev); } +EXPORT_SYMBOL(phy_config_aneg); /** * phy_check_link_status - check link status and set state accordingly diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 320a3e5cd10a..0a2d8bedf73d 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2565,8 +2565,32 @@ EXPORT_SYMBOL(genphy_resume); int genphy_loopback(struct phy_device *phydev, bool enable) { - return phy_modify(phydev, MII_BMCR, BMCR_LOOPBACK, - enable ? 
BMCR_LOOPBACK : 0); + if (enable) { + u16 val, ctl = BMCR_LOOPBACK; + int ret; + + if (phydev->speed == SPEED_1000) + ctl |= BMCR_SPEED1000; + else if (phydev->speed == SPEED_100) + ctl |= BMCR_SPEED100; + + if (phydev->duplex == DUPLEX_FULL) + ctl |= BMCR_FULLDPLX; + + phy_modify(phydev, MII_BMCR, ~0, ctl); + + ret = phy_read_poll_timeout(phydev, MII_BMSR, val, + val & BMSR_LSTATUS, + 5000, 500000, true); + if (ret) + return ret; + } else { + phy_modify(phydev, MII_BMCR, BMCR_LOOPBACK, 0); + + phy_config_aneg(phydev); + } + + return 0; } EXPORT_SYMBOL(genphy_loopback); diff --git a/include/linux/phy.h b/include/linux/phy.h index e3d4d583463b..60d2b26026a2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1410,6 +1410,7 @@ void phy_disconnect(struct phy_device *phydev); void phy_detach(struct phy_device *phydev); void phy_start(struct phy_device *phydev); void phy_stop(struct phy_device *phydev); +int phy_config_aneg(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); int phy_speed_down(struct phy_device *phydev, bool sync); -- cgit v1.2.3 From 3e1e58d64c3d0a6789f9d865936c4ce46b20f3f5 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 19 Apr 2021 15:01:03 +0200 Subject: net: add generic selftest support Port some parts of the stmmac selftest and reuse it as basic generic selftest library. This patch was tested with following combinations: - iMX6DL FEC -> AT8035 - iMX6DL FEC -> SJA1105Q switch -> KSZ8081 - iMX6DL FEC -> SJA1105Q switch -> KSZ9031 - AR9331 ag71xx -> AR9331 PHY - AR9331 ag71xx -> AR9331 switch -> AR9331 PHY Signed-off-by: Oleksij Rempel Signed-off-by: David S. 
Miller --- include/net/selftests.h | 12 ++ net/Kconfig | 4 + net/core/Makefile | 1 + net/core/selftests.c | 400 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 417 insertions(+) create mode 100644 include/net/selftests.h create mode 100644 net/core/selftests.c (limited to 'include') diff --git a/include/net/selftests.h b/include/net/selftests.h new file mode 100644 index 000000000000..9993b9498cf3 --- /dev/null +++ b/include/net/selftests.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_SELFTESTS +#define _NET_SELFTESTS + +#include + +void net_selftest(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf); +int net_selftest_get_count(void); +void net_selftest_get_strings(u8 *data); + +#endif /* _NET_SELFTESTS */ diff --git a/net/Kconfig b/net/Kconfig index 9c456acc379e..8d955195c069 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -429,6 +429,10 @@ config GRO_CELLS config SOCK_VALIDATE_XMIT bool +config NET_SELFTESTS + def_tristate PHYLIB + depends on PHYLIB + config NET_SOCK_MSG bool default n diff --git a/net/core/Makefile b/net/core/Makefile index 0c2233c826fd..1a6168d8f23b 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o endif diff --git a/net/core/selftests.c b/net/core/selftests.c new file mode 100644 index 000000000000..ba7b0171974c --- /dev/null +++ b/net/core/selftests.c @@ -0,0 +1,400 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates.
+ * stmmac Selftests Support + * + * Author: Jose Abreu + * + * Ported from stmmac by: + * Copyright (C) 2021 Oleksij Rempel + */ + +#include +#include +#include +#include + +struct net_packet_attrs { + unsigned char *src; + unsigned char *dst; + u32 ip_src; + u32 ip_dst; + bool tcp; + u16 sport; + u16 dport; + int timeout; + int size; + int max_size; + u8 id; + u16 queue_mapping; +}; + +struct net_test_priv { + struct net_packet_attrs *packet; + struct packet_type pt; + struct completion comp; + int double_vlan; + int vlan_id; + int ok; +}; + +struct netsfhdr { + __be32 version; + __be64 magic; + u8 id; +} __packed; + +static u8 net_test_next_id; + +#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct netsfhdr)) +#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL +#define NET_LB_TIMEOUT msecs_to_jiffies(200) + +static struct sk_buff *net_test_get_skb(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct sk_buff *skb = NULL; + struct udphdr *uhdr = NULL; + struct tcphdr *thdr = NULL; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct iphdr *ihdr; + int iplen, size; + + size = attr->size + NET_TEST_PKT_SIZE; + + if (attr->tcp) + size += sizeof(struct tcphdr); + else + size += sizeof(struct udphdr); + + if (attr->max_size && attr->max_size > size) + size = attr->max_size; + + skb = netdev_alloc_skb(ndev, size); + if (!skb) + return NULL; + + prefetchw(skb->data); + + ehdr = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); + ihdr = skb_put(skb, sizeof(*ihdr)); + + skb_set_transport_header(skb, skb->len); + if (attr->tcp) + thdr = skb_put(skb, sizeof(*thdr)); + else + uhdr = skb_put(skb, sizeof(*uhdr)); + + eth_zero_addr(ehdr->h_dest); + + if (attr->src) + ether_addr_copy(ehdr->h_source, attr->src); + if (attr->dst) + ether_addr_copy(ehdr->h_dest, attr->dst); + + ehdr->h_proto = htons(ETH_P_IP); + + if (attr->tcp) { + thdr->source = htons(attr->sport); + 
thdr->dest = htons(attr->dport); + thdr->doff = sizeof(struct tcphdr) / 4; + thdr->check = 0; + } else { + uhdr->source = htons(attr->sport); + uhdr->dest = htons(attr->dport); + uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size); + if (attr->max_size) + uhdr->len = htons(attr->max_size - + (sizeof(*ihdr) + sizeof(*ehdr))); + uhdr->check = 0; + } + + ihdr->ihl = 5; + ihdr->ttl = 32; + ihdr->version = 4; + if (attr->tcp) + ihdr->protocol = IPPROTO_TCP; + else + ihdr->protocol = IPPROTO_UDP; + iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size; + if (attr->tcp) + iplen += sizeof(*thdr); + else + iplen += sizeof(*uhdr); + + if (attr->max_size) + iplen = attr->max_size - sizeof(*ehdr); + + ihdr->tot_len = htons(iplen); + ihdr->frag_off = 0; + ihdr->saddr = htonl(attr->ip_src); + ihdr->daddr = htonl(attr->ip_dst); + ihdr->tos = 0; + ihdr->id = 0; + ip_send_check(ihdr); + + shdr = skb_put(skb, sizeof(*shdr)); + shdr->version = 0; + shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC); + attr->id = net_test_next_id; + shdr->id = net_test_next_id++; + + if (attr->size) + skb_put(skb, attr->size); + if (attr->max_size && attr->max_size > skb->len) + skb_put(skb, attr->max_size - skb->len); + + skb->csum = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + if (attr->tcp) { + thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, + ihdr->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + } else { + udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); + } + + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + skb->dev = ndev; + + return skb; +} + +static int net_test_loopback_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct net_test_priv *tpriv = pt->af_packet_priv; + unsigned char *src = tpriv->packet->src; + unsigned char *dst = tpriv->packet->dst; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct udphdr *uhdr; + struct 
tcphdr *thdr; + struct iphdr *ihdr; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (skb_linearize(skb)) + goto out; + if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN)) + goto out; + + ehdr = (struct ethhdr *)skb_mac_header(skb); + if (dst) { + if (!ether_addr_equal_unaligned(ehdr->h_dest, dst)) + goto out; + } + + if (src) { + if (!ether_addr_equal_unaligned(ehdr->h_source, src)) + goto out; + } + + ihdr = ip_hdr(skb); + if (tpriv->double_vlan) + ihdr = (struct iphdr *)(skb_network_header(skb) + 4); + + if (tpriv->packet->tcp) { + if (ihdr->protocol != IPPROTO_TCP) + goto out; + + thdr = (struct tcphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (thdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)thdr + sizeof(*thdr)); + } else { + if (ihdr->protocol != IPPROTO_UDP) + goto out; + + uhdr = (struct udphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (uhdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)uhdr + sizeof(*uhdr)); + } + + if (shdr->magic != cpu_to_be64(NET_TEST_PKT_MAGIC)) + goto out; + if (tpriv->packet->id != shdr->id) + goto out; + + tpriv->ok = true; + complete(&tpriv->comp); +out: + kfree_skb(skb); + return 0; +} + +static int __net_test_loopback(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct net_test_priv *tpriv; + struct sk_buff *skb = NULL; + int ret = 0; + + tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); + if (!tpriv) + return -ENOMEM; + + tpriv->ok = false; + init_completion(&tpriv->comp); + + tpriv->pt.type = htons(ETH_P_IP); + tpriv->pt.func = net_test_loopback_validate; + tpriv->pt.dev = ndev; + tpriv->pt.af_packet_priv = tpriv; + tpriv->packet = attr; + dev_add_pack(&tpriv->pt); + + skb = net_test_get_skb(ndev, attr); + if (!skb) { + ret = -ENOMEM; + goto cleanup; + } + + ret = dev_direct_xmit(skb, attr->queue_mapping); + if (ret < 0) { + goto cleanup; + } else if (ret > 0) { + ret = -ENETUNREACH; + goto cleanup; + } + + if 
(!attr->timeout) + attr->timeout = NET_LB_TIMEOUT; + + wait_for_completion_timeout(&tpriv->comp, attr->timeout); + ret = tpriv->ok ? 0 : -ETIMEDOUT; + +cleanup: + dev_remove_pack(&tpriv->pt); + kfree(tpriv); + return ret; +} + +static int net_test_netif_carrier(struct net_device *ndev) +{ + return netif_carrier_ok(ndev) ? 0 : -ENOLINK; +} + +static int net_test_phy_phydev(struct net_device *ndev) +{ + return ndev->phydev ? 0 : -EOPNOTSUPP; +} + +static int net_test_phy_loopback_enable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, true); +} + +static int net_test_phy_loopback_disable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, false); +} + +static int net_test_phy_loopback_udp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + return __net_test_loopback(ndev, &attr); +} + +static int net_test_phy_loopback_tcp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + return __net_test_loopback(ndev, &attr); +} + +static const struct net_test { + char name[ETH_GSTRING_LEN]; + int (*fn)(struct net_device *ndev); +} net_selftests[] = { + { + .name = "Carrier ", + .fn = net_test_netif_carrier, + }, { + .name = "PHY dev is present ", + .fn = net_test_phy_phydev, + }, { + /* This test should be done before all PHY loopback test */ + .name = "PHY internal loopback, enable ", + .fn = net_test_phy_loopback_enable, + }, { + .name = "PHY internal loopback, UDP ", + .fn = net_test_phy_loopback_udp, + }, { + .name = "PHY internal loopback, TCP ", + .fn = net_test_phy_loopback_tcp, + }, { + /* This test should be done after all PHY loopback test */ + .name = "PHY internal loopback, disable", + .fn = net_test_phy_loopback_disable, + }, +}; + +void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) +{ + int count = 
net_selftest_get_count(); + int i; + + memset(buf, 0, sizeof(*buf) * count); + net_test_next_id = 0; + + if (etest->flags != ETH_TEST_FL_OFFLINE) { + netdev_err(ndev, "Only offline tests are supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + + for (i = 0; i < count; i++) { + buf[i] = net_selftests[i].fn(ndev); + if (buf[i] && (buf[i] != -EOPNOTSUPP)) + etest->flags |= ETH_TEST_FL_FAILED; + } +} +EXPORT_SYMBOL_GPL(net_selftest); + +int net_selftest_get_count(void) +{ + return ARRAY_SIZE(net_selftests); +} +EXPORT_SYMBOL_GPL(net_selftest_get_count); + +void net_selftest_get_strings(u8 *data) +{ + u8 *p = data; + int i; + + for (i = 0; i < net_selftest_get_count(); i++) { + snprintf(p, ETH_GSTRING_LEN, "%2d. %s", i + 1, + net_selftests[i].name); + p += ETH_GSTRING_LEN; + } +} +EXPORT_SYMBOL_GPL(net_selftest_get_strings); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Oleksij Rempel "); -- cgit v1.2.3 From a71acad90a3f079685efcb068e2251b912083d68 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 19 Apr 2021 15:01:06 +0200 Subject: net: dsa: enable selftest support for all switches by default Most of the generic selftests should be able to work with probably all ethernet controllers. The DSA switches are no exception, so enable them by default, at least for DSA. This patch was tested with SJA1105 and AR9331. Signed-off-by: Oleksij Rempel Signed-off-by: David S. 
Miller --- include/net/dsa.h | 2 ++ net/dsa/Kconfig | 1 + net/dsa/slave.c | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1259b0f40684..b52e9b057be4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -577,6 +577,8 @@ struct dsa_switch_ops { int port, uint64_t *data); void (*get_stats64)(struct dsa_switch *ds, int port, struct rtnl_link_stats64 *s); + void (*self_test)(struct dsa_switch *ds, int port, + struct ethtool_test *etest, u64 *data); /* * ethtool Wake-on-LAN diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 8746b07668ae..cbc2bd643ab2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,6 +9,7 @@ menuconfig NET_DSA select NET_SWITCHDEV select PHYLINK select NET_DEVLINK + select NET_SELFTESTS help Say Y if you want to enable support for the hardware switches supported by the Distributed Switch Architecture. diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3ae67202fda2..77b33bd161b8 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -748,7 +749,10 @@ static void dsa_slave_get_strings(struct net_device *dev, if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data + 4 * len); + } else if (stringset == ETH_SS_TEST) { + net_selftest_get_strings(data); } + } static void dsa_slave_get_ethtool_stats(struct net_device *dev, @@ -794,11 +798,27 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) count += ds->ops->get_sset_count(ds, dp->index, sset); return count; + } else if (sset == ETH_SS_TEST) { + return net_selftest_get_count(); } return -EOPNOTSUPP; } +static void dsa_slave_net_selftest(struct net_device *ndev, + struct ethtool_test *etest, u64 *buf) +{ + struct dsa_port *dp = dsa_slave_to_port(ndev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->self_test) { + ds->ops->self_test(ds, dp->index, etest, buf); + return; + } + + 
net_selftest(ndev, etest, buf); +} + static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -1630,6 +1650,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_rxnfc = dsa_slave_get_rxnfc, .set_rxnfc = dsa_slave_set_rxnfc, .get_ts_info = dsa_slave_get_ts_info, + .self_test = dsa_slave_net_selftest, }; /* legacy way, bypassing the bridge *****************************************/ -- cgit v1.2.3 From a978f7c479ea68d68a6267a37cbd44362bdd9811 Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Tue, 20 Apr 2021 09:54:03 +0200 Subject: net: phy: marvell: add support for Amethyst internal PHY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for Amethyst internal PHY. The only difference from Peridot is HWMON. Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 117 ++++++++++++++++++++++++++++++++++++++++++-- include/linux/marvell_phy.h | 1 + 2 files changed, 115 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index e505060d0743..1cce86b280af 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -118,10 +118,21 @@ #define MII_88E6390_MISC_TEST_TEMP_SENSOR_ENABLE_ONESHOT (0x2 << 14) #define MII_88E6390_MISC_TEST_TEMP_SENSOR_DISABLE (0x3 << 14) #define MII_88E6390_MISC_TEST_TEMP_SENSOR_MASK (0x3 << 14) +#define MII_88E6393_MISC_TEST_SAMPLES_2048 (0x0 << 11) +#define MII_88E6393_MISC_TEST_SAMPLES_4096 (0x1 << 11) +#define MII_88E6393_MISC_TEST_SAMPLES_8192 (0x2 << 11) +#define MII_88E6393_MISC_TEST_SAMPLES_16384 (0x3 << 11) +#define MII_88E6393_MISC_TEST_SAMPLES_MASK (0x3 << 11) +#define MII_88E6393_MISC_TEST_RATE_2_3MS (0x5 << 8) +#define MII_88E6393_MISC_TEST_RATE_6_4MS (0x6 << 8) +#define MII_88E6393_MISC_TEST_RATE_11_9MS (0x7 << 8) +#define MII_88E6393_MISC_TEST_RATE_MASK (0x7 << 8) #define 
MII_88E6390_TEMP_SENSOR 0x1c -#define MII_88E6390_TEMP_SENSOR_MASK 0xff -#define MII_88E6390_TEMP_SENSOR_SAMPLES 10 +#define MII_88E6393_TEMP_SENSOR_THRESHOLD_MASK 0xff00 +#define MII_88E6393_TEMP_SENSOR_THRESHOLD_SHIFT 8 +#define MII_88E6390_TEMP_SENSOR_MASK 0xff +#define MII_88E6390_TEMP_SENSOR_SAMPLES 10 #define MII_88E1318S_PHY_MSCR1_REG 16 #define MII_88E1318S_PHY_MSCR1_PAD_ODD BIT(6) @@ -2217,6 +2228,7 @@ static int marvell_vct7_cable_test_get_status(struct phy_device *phydev, #ifdef CONFIG_HWMON struct marvell_hwmon_ops { + int (*config)(struct phy_device *phydev); int (*get_temp)(struct phy_device *phydev, long *temp); int (*get_temp_critical)(struct phy_device *phydev, long *temp); int (*set_temp_critical)(struct phy_device *phydev, long temp); @@ -2391,6 +2403,65 @@ error: return ret; } +static int m88e6393_get_temp(struct phy_device *phydev, long *temp) +{ + int err; + + err = m88e1510_get_temp(phydev, temp); + + /* 88E1510 measures T + 25, while the PHY on 88E6393X switch + * T + 75, so we have to subtract another 50 + */ + *temp -= 50000; + + return err; +} + +static int m88e6393_get_temp_critical(struct phy_device *phydev, long *temp) +{ + int ret; + + *temp = 0; + + ret = phy_read_paged(phydev, MII_MARVELL_MISC_TEST_PAGE, + MII_88E6390_TEMP_SENSOR); + if (ret < 0) + return ret; + + *temp = (((ret & MII_88E6393_TEMP_SENSOR_THRESHOLD_MASK) >> + MII_88E6393_TEMP_SENSOR_THRESHOLD_SHIFT) - 75) * 1000; + + return 0; +} + +static int m88e6393_set_temp_critical(struct phy_device *phydev, long temp) +{ + temp = (temp / 1000) + 75; + + return phy_modify_paged(phydev, MII_MARVELL_MISC_TEST_PAGE, + MII_88E6390_TEMP_SENSOR, + MII_88E6393_TEMP_SENSOR_THRESHOLD_MASK, + temp << MII_88E6393_TEMP_SENSOR_THRESHOLD_SHIFT); +} + +static int m88e6393_hwmon_config(struct phy_device *phydev) +{ + int err; + + err = m88e6393_set_temp_critical(phydev, 100000); + if (err) + return err; + + return phy_modify_paged(phydev, MII_MARVELL_MISC_TEST_PAGE, + MII_88E6390_MISC_TEST, + 
MII_88E6390_MISC_TEST_TEMP_SENSOR_MASK | + MII_88E6393_MISC_TEST_SAMPLES_MASK | + MII_88E6393_MISC_TEST_RATE_MASK, + MII_88E6390_MISC_TEST_TEMP_SENSOR_ENABLE | + MII_88E6393_MISC_TEST_SAMPLES_2048 | + MII_88E6393_MISC_TEST_RATE_2_3MS); +} + static int marvell_hwmon_read(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *temp) { @@ -2535,8 +2606,13 @@ static int marvell_hwmon_probe(struct phy_device *phydev) priv->hwmon_dev = devm_hwmon_device_register_with_info( dev, priv->hwmon_name, phydev, &marvell_hwmon_chip_info, NULL); + if (IS_ERR(priv->hwmon_dev)) + return PTR_ERR(priv->hwmon_dev); - return PTR_ERR_OR_ZERO(priv->hwmon_dev); + if (ops->config) + err = ops->config(phydev); + + return err; } static const struct marvell_hwmon_ops m88e1121_hwmon_ops = { @@ -2554,6 +2630,14 @@ static const struct marvell_hwmon_ops m88e6390_hwmon_ops = { .get_temp = m88e6390_get_temp, }; +static const struct marvell_hwmon_ops m88e6393_hwmon_ops = { + .config = m88e6393_hwmon_config, + .get_temp = m88e6393_get_temp, + .get_temp_critical = m88e6393_get_temp_critical, + .set_temp_critical = m88e6393_set_temp_critical, + .get_temp_alarm = m88e1510_get_temp_alarm, +}; + #define DEF_MARVELL_HWMON_OPS(s) (&(s)) #else @@ -2948,6 +3032,32 @@ static struct phy_driver marvell_drivers[] = { .cable_test_tdr_start = marvell_vct5_cable_test_tdr_start, .cable_test_get_status = marvell_vct7_cable_test_get_status, }, + { + .phy_id = MARVELL_PHY_ID_88E6393_FAMILY, + .phy_id_mask = MARVELL_PHY_ID_MASK, + .name = "Marvell 88E6393 Family", + .driver_data = DEF_MARVELL_HWMON_OPS(m88e6393_hwmon_ops), + /* PHY_GBIT_FEATURES */ + .flags = PHY_POLL_CABLE_TEST, + .probe = marvell_probe, + .config_init = marvell_config_init, + .config_aneg = m88e1510_config_aneg, + .read_status = marvell_read_status, + .config_intr = marvell_config_intr, + .handle_interrupt = marvell_handle_interrupt, + .resume = genphy_resume, + .suspend = genphy_suspend, + .read_page = marvell_read_page, + 
.write_page = marvell_write_page, + .get_sset_count = marvell_get_sset_count, + .get_strings = marvell_get_strings, + .get_stats = marvell_get_stats, + .get_tunable = m88e1540_get_tunable, + .set_tunable = m88e1540_set_tunable, + .cable_test_start = marvell_vct7_cable_test_start, + .cable_test_tdr_start = marvell_vct5_cable_test_tdr_start, + .cable_test_get_status = marvell_vct7_cable_test_get_status, + }, { .phy_id = MARVELL_PHY_ID_88E1340S, .phy_id_mask = MARVELL_PHY_ID_MASK, @@ -3014,6 +3124,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = { { MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E6341_FAMILY, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E6390_FAMILY, MARVELL_PHY_ID_MASK }, + { MARVELL_PHY_ID_88E6393_FAMILY, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1340S, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E1548P, MARVELL_PHY_ID_MASK }, { } diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index f61d82c53f30..acee44b9db26 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -39,6 +39,7 @@ */ #define MARVELL_PHY_ID_88E6341_FAMILY 0x01410f41 #define MARVELL_PHY_ID_88E6390_FAMILY 0x01410f90 +#define MARVELL_PHY_ID_88E6393_FAMILY 0x002b0b9b #define MARVELL_PHY_FAMILY_ID(id) ((id) >> 4) -- cgit v1.2.3 From deff710703d80c942c9c85a3f00a053025cfb1e4 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Tue, 20 Apr 2021 20:53:10 +0200 Subject: net: dsa: Allow default tag protocol to be overridden from DT Some combinations of tag protocols and Ethernet controllers are incompatible, and it is hard for the driver to keep track of these. Therefore, allow the device tree author (typically the board vendor) to inform the driver of this fact by selecting an alternate protocol that is known to work. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- include/net/dsa.h | 5 +++ net/dsa/dsa2.c | 103 +++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 91 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index b52e9b057be4..507082959aa4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -149,6 +149,11 @@ struct dsa_switch_tree { /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; + /* Default tagging protocol preferred by the switches in this + * tree. + */ + enum dsa_tag_protocol default_proto; + /* * Configuration data for the platform device that owns * this dsa switch tree instance. diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index d7c22e3a1fbf..b71e87909f0e 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -668,6 +668,30 @@ static const struct devlink_ops dsa_devlink_ops = { .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get, }; +static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) +{ + const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; + struct dsa_switch_tree *dst = ds->dst; + int port, err; + + if (tag_ops->proto == dst->default_proto) + return 0; + + for (port = 0; port < ds->num_ports; port++) { + if (!dsa_is_cpu_port(ds, port)) + continue; + + err = ds->ops->change_tag_protocol(ds, port, tag_ops->proto); + if (err) { + dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", + tag_ops->name, ERR_PTR(err)); + return err; + } + } + + return 0; +} + static int dsa_switch_setup(struct dsa_switch *ds) { struct dsa_devlink_priv *dl_priv; @@ -718,6 +742,10 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err < 0) goto unregister_notifier; + err = dsa_switch_setup_tag_protocol(ds); + if (err) + goto teardown; + devlink_params_publish(ds->devlink); if (!ds->slave_mii_bus && ds->ops->phy_read) { @@ -1068,34 +1096,60 @@ static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp, return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol); } -static 
int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) +static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master, + const char *user_protocol) { struct dsa_switch *ds = dp->ds; struct dsa_switch_tree *dst = ds->dst; const struct dsa_device_ops *tag_ops; - enum dsa_tag_protocol tag_protocol; + enum dsa_tag_protocol default_proto; + + /* Find out which protocol the switch would prefer. */ + default_proto = dsa_get_tag_protocol(dp, master); + if (dst->default_proto) { + if (dst->default_proto != default_proto) { + dev_err(ds->dev, + "A DSA switch tree can have only one tagging protocol\n"); + return -EINVAL; + } + } else { + dst->default_proto = default_proto; + } + + /* See if the user wants to override that preference. */ + if (user_protocol) { + if (!ds->ops->change_tag_protocol) { + dev_err(ds->dev, "Tag protocol cannot be modified\n"); + return -EINVAL; + } + + tag_ops = dsa_find_tagger_by_name(user_protocol); + } else { + tag_ops = dsa_tag_driver_get(default_proto); + } + + if (IS_ERR(tag_ops)) { + if (PTR_ERR(tag_ops) == -ENOPROTOOPT) + return -EPROBE_DEFER; + + dev_warn(ds->dev, "No tagger for this switch\n"); + return PTR_ERR(tag_ops); + } - tag_protocol = dsa_get_tag_protocol(dp, master); if (dst->tag_ops) { - if (dst->tag_ops->proto != tag_protocol) { + if (dst->tag_ops != tag_ops) { dev_err(ds->dev, "A DSA switch tree can have only one tagging protocol\n"); + + dsa_tag_driver_put(tag_ops); return -EINVAL; } + /* In the case of multiple CPU ports per switch, the tagging - * protocol is still reference-counted only per switch tree, so - * nothing to do here. + * protocol is still reference-counted only per switch tree. 
*/ + dsa_tag_driver_put(tag_ops); } else { - tag_ops = dsa_tag_driver_get(tag_protocol); - if (IS_ERR(tag_ops)) { - if (PTR_ERR(tag_ops) == -ENOPROTOOPT) - return -EPROBE_DEFER; - dev_warn(ds->dev, "No tagger for this switch\n"); - dp->master = NULL; - return PTR_ERR(tag_ops); - } - dst->tag_ops = tag_ops; } @@ -1104,6 +1158,19 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master) dsa_port_set_tag_protocol(dp, dst->tag_ops); dp->dst = dst; + /* At this point, the tree may be configured to use a different + * tagger than the one chosen by the switch driver during + * .setup, in the case when a user selects a custom protocol + * through the DT. + * + * This is resolved by syncing the driver with the tree in + * dsa_switch_setup_tag_protocol once .setup has run and the + * driver is ready to accept calls to .change_tag_protocol. If + * the driver does not support the custom protocol at that + * point, the tree is wholly rejected, thereby ensuring that the + * tree and driver are always in agreement on the protocol to + * use. + */ return 0; } @@ -1117,12 +1184,14 @@ static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) if (ethernet) { struct net_device *master; + const char *user_protocol; master = of_find_net_device_by_node(ethernet); if (!master) return -EPROBE_DEFER; - return dsa_port_parse_cpu(dp, master); + user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL); + return dsa_port_parse_cpu(dp, master, user_protocol); } if (link) @@ -1234,7 +1303,7 @@ static int dsa_port_parse(struct dsa_port *dp, const char *name, dev_put(master); - return dsa_port_parse_cpu(dp, master); + return dsa_port_parse_cpu(dp, master, NULL); } if (!strcmp(name, "dsa")) -- cgit v1.2.3 From c0dcaa55f91d925c9ac2c950ff84138534337a6c Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 2 Mar 2021 10:12:01 -0800 Subject: ice: Allow ignoring opcodes on specific VF Declare bitmap of allowed commands on VF. 
Initialize default opcodes list that should be always supported. Declare array of supported opcodes for each caps used in virtchnl code. Change allowed bitmap by setting or clearing corresponding bit to allowlist (bit set) or denylist (bit clear). Signed-off-by: Michal Swiatkowski Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/Makefile | 1 + .../ethernet/intel/ice/ice_virtchnl_allowlist.c | 165 +++++++++++++++++++++ .../ethernet/intel/ice/ice_virtchnl_allowlist.h | 13 ++ drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 18 +++ drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h | 1 + include/linux/avf/virtchnl.h | 1 + 6 files changed, 199 insertions(+) create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c create mode 100644 drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index f391691e2c7e..07fe857e9e3a 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -26,6 +26,7 @@ ice-y := ice_main.o \ ice_fw_update.o \ ice_lag.o \ ice_ethtool.o +ice-$(CONFIG_PCI_IOV) += ice_virtchnl_allowlist.o ice-$(CONFIG_PCI_IOV) += ice_virtchnl_pf.o ice_sriov.o ice_virtchnl_fdir.o ice-$(CONFIG_DCB) += ice_dcb.o ice_dcb_nl.o ice_dcb_lib.o ice-$(CONFIG_RFS_ACCEL) += ice_arfs.o diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c new file mode 100644 index 000000000000..5a0fbb47346f --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021, Intel Corporation. */ + +#include "ice_virtchnl_allowlist.h" + +/* Purpose of this file is to share functionality to allowlist or denylist + * opcodes used in PF <-> VF communication. 
Group of opcodes: + * - default -> should be always allowed after creating VF, + * default_allowlist_opcodes + * - opcodes needed by VF to work correctly, but not associated with caps -> + * should be allowed after successful VF resources allocation, + * working_allowlist_opcodes + * - opcodes needed by VF when caps are activated + * + * Caps that don't use new opcodes (no opcodes should be allowed): + * - VIRTCHNL_VF_OFFLOAD_RSS_AQ + * - VIRTCHNL_VF_OFFLOAD_RSS_REG + * - VIRTCHNL_VF_OFFLOAD_WB_ON_ITR + * - VIRTCHNL_VF_OFFLOAD_CRC + * - VIRTCHNL_VF_OFFLOAD_RX_POLLING + * - VIRTCHNL_VF_OFFLOAD_RSS_PCTYPE_V2 + * - VIRTCHNL_VF_OFFLOAD_ENCAP + * - VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM + * - VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM + * - VIRTCHNL_VF_OFFLOAD_USO + */ + +/* default opcodes to communicate with VF */ +static const u32 default_allowlist_opcodes[] = { + VIRTCHNL_OP_GET_VF_RESOURCES, VIRTCHNL_OP_VERSION, VIRTCHNL_OP_RESET_VF, +}; + +/* opcodes supported after successful VIRTCHNL_OP_GET_VF_RESOURCES */ +static const u32 working_allowlist_opcodes[] = { + VIRTCHNL_OP_CONFIG_TX_QUEUE, VIRTCHNL_OP_CONFIG_RX_QUEUE, + VIRTCHNL_OP_CONFIG_VSI_QUEUES, VIRTCHNL_OP_CONFIG_IRQ_MAP, + VIRTCHNL_OP_ENABLE_QUEUES, VIRTCHNL_OP_DISABLE_QUEUES, + VIRTCHNL_OP_GET_STATS, VIRTCHNL_OP_EVENT, +}; + +/* VIRTCHNL_VF_OFFLOAD_L2 */ +static const u32 l2_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_ETH_ADDR, VIRTCHNL_OP_DEL_ETH_ADDR, + VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE, +}; + +/* VIRTCHNL_VF_OFFLOAD_REQ_QUEUES */ +static const u32 req_queues_allowlist_opcodes[] = { + VIRTCHNL_OP_REQUEST_QUEUES, +}; + +/* VIRTCHNL_VF_OFFLOAD_VLAN */ +static const u32 vlan_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_VLAN, VIRTCHNL_OP_DEL_VLAN, + VIRTCHNL_OP_ENABLE_VLAN_STRIPPING, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING, +}; + +/* VIRTCHNL_VF_OFFLOAD_RSS_PF */ +static const u32 rss_pf_allowlist_opcodes[] = { + VIRTCHNL_OP_CONFIG_RSS_KEY, VIRTCHNL_OP_CONFIG_RSS_LUT, + VIRTCHNL_OP_GET_RSS_HENA_CAPS, VIRTCHNL_OP_SET_RSS_HENA, +}; + 
+/* VIRTCHNL_VF_OFFLOAD_FDIR_PF */ +static const u32 fdir_pf_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_FDIR_FILTER, VIRTCHNL_OP_DEL_FDIR_FILTER, +}; + +struct allowlist_opcode_info { + const u32 *opcodes; + size_t size; +}; + +#define BIT_INDEX(caps) (HWEIGHT((caps) - 1)) +#define ALLOW_ITEM(caps, list) \ + [BIT_INDEX(caps)] = { \ + .opcodes = list, \ + .size = ARRAY_SIZE(list) \ + } +static const struct allowlist_opcode_info allowlist_opcodes[] = { + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_L2, l2_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_REQ_QUEUES, req_queues_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN, vlan_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RSS_PF, rss_pf_allowlist_opcodes), + ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_FDIR_PF, fdir_pf_allowlist_opcodes), +}; + +/** + * ice_vc_is_opcode_allowed - check if this opcode is allowed on this VF + * @vf: pointer to VF structure + * @opcode: virtchnl opcode + * + * Return true if message is allowed on this VF + */ +bool ice_vc_is_opcode_allowed(struct ice_vf *vf, u32 opcode) +{ + if (opcode >= VIRTCHNL_OP_MAX) + return false; + + return test_bit(opcode, vf->opcodes_allowlist); +} + +/** + * ice_vc_allowlist_opcodes - allowlist selected opcodes + * @vf: pointer to VF structure + * @opcodes: array of opcodes to allowlist + * @size: size of opcodes array + * + * Function should be called to allowlist opcodes on VF. 
+ */ +static void +ice_vc_allowlist_opcodes(struct ice_vf *vf, const u32 *opcodes, size_t size) +{ + unsigned int i; + + for (i = 0; i < size; i++) + set_bit(opcodes[i], vf->opcodes_allowlist); +} + +/** + * ice_vc_clear_allowlist - clear all allowlist opcodes + * @vf: pointer to VF structure + */ +static void ice_vc_clear_allowlist(struct ice_vf *vf) +{ + bitmap_zero(vf->opcodes_allowlist, VIRTCHNL_OP_MAX); +} + +/** + * ice_vc_set_default_allowlist - allowlist default opcodes for VF + * @vf: pointer to VF structure + */ +void ice_vc_set_default_allowlist(struct ice_vf *vf) +{ + ice_vc_clear_allowlist(vf); + ice_vc_allowlist_opcodes(vf, default_allowlist_opcodes, + ARRAY_SIZE(default_allowlist_opcodes)); +} + +/** + * ice_vc_set_working_allowlist - allowlist opcodes needed by VF to work + * @vf: pointer to VF structure + * + * allowlist opcodes that aren't associated with specific caps, but + * are needed by VF to work. + */ +void ice_vc_set_working_allowlist(struct ice_vf *vf) +{ + ice_vc_allowlist_opcodes(vf, working_allowlist_opcodes, + ARRAY_SIZE(working_allowlist_opcodes)); +} + +/** + * ice_vc_set_caps_allowlist - allowlist VF opcodes according to caps + * @vf: pointer to VF structure + */ +void ice_vc_set_caps_allowlist(struct ice_vf *vf) +{ + unsigned long caps = vf->driver_caps; + unsigned int i; + + for_each_set_bit(i, &caps, ARRAY_SIZE(allowlist_opcodes)) + ice_vc_allowlist_opcodes(vf, allowlist_opcodes[i].opcodes, + allowlist_opcodes[i].size); +} diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h new file mode 100644 index 000000000000..d3ae86ded219 --- /dev/null +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021, Intel Corporation. 
*/ + +#ifndef _ICE_VIRTCHNL_ALLOWLIST_H_ +#define _ICE_VIRTCHNL_ALLOWLIST_H_ +#include "ice.h" + +bool ice_vc_is_opcode_allowed(struct ice_vf *vf, u32 opcode); + +void ice_vc_set_default_allowlist(struct ice_vf *vf); +void ice_vc_set_working_allowlist(struct ice_vf *vf); +void ice_vc_set_caps_allowlist(struct ice_vf *vf); +#endif /* _ICE_VIRTCHNL_ALLOWLIST_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index a3ed4b84bba6..ccd6b3e8a5a9 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -5,6 +5,7 @@ #include "ice_base.h" #include "ice_lib.h" #include "ice_fltr.h" +#include "ice_virtchnl_allowlist.h" /** * ice_validate_vf_id - helper to check if VF ID is valid @@ -1314,6 +1315,9 @@ bool ice_reset_all_vfs(struct ice_pf *pf, bool is_vflr) ice_for_each_vf(pf, v) { vf = &pf->vf[v]; + vf->driver_caps = 0; + ice_vc_set_default_allowlist(vf); + ice_vf_fdir_exit(vf); /* clean VF control VSI when resetting VFs since it should be * setup only when VF creates its first FDIR rule. @@ -1418,6 +1422,9 @@ bool ice_reset_vf(struct ice_vf *vf, bool is_vflr) usleep_range(10, 20); } + vf->driver_caps = 0; + ice_vc_set_default_allowlist(vf); + /* Display a warning if VF didn't manage to reset in time, but need to * continue on with the operation. */ @@ -1625,6 +1632,7 @@ static void ice_set_dflt_settings_vfs(struct ice_pf *pf) set_bit(ICE_VIRTCHNL_VF_CAP_L2, &vf->vf_caps); vf->spoofchk = true; vf->num_vf_qs = pf->num_qps_per_vf; + ice_vc_set_default_allowlist(vf); /* ctrl_vsi_idx will be set to a valid value only when VF * creates its first fdir rule. 
@@ -2127,6 +2135,9 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) /* match guest capabilities */ vf->driver_caps = vfres->vf_cap_flags; + ice_vc_set_caps_allowlist(vf); + ice_vc_set_working_allowlist(vf); + set_bit(ICE_VF_STATE_ACTIVE, vf->vf_states); err: @@ -3840,6 +3851,13 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) err = -EINVAL; } + if (!ice_vc_is_opcode_allowed(vf, v_opcode)) { + ice_vc_send_msg_to_vf(vf, v_opcode, + VIRTCHNL_STATUS_ERR_NOT_SUPPORTED, NULL, + 0); + return; + } + error_handler: if (err) { ice_vc_send_msg_to_vf(vf, v_opcode, VIRTCHNL_STATUS_ERR_PARAM, diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h index bcc2890c930a..d800ed83d6c3 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h @@ -103,6 +103,7 @@ struct ice_vf { u16 num_vf_qs; /* num of queue configured per VF */ struct ice_mdd_vf_events mdd_rx_events; struct ice_mdd_vf_events mdd_tx_events; + DECLARE_BITMAP(opcodes_allowlist, VIRTCHNL_OP_MAX); }; #ifdef CONFIG_PCI_IOV diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 40dd6afbfd81..debdd196773b 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -139,6 +139,7 @@ enum virtchnl_ops { /* opcode 34 - 46 are reserved */ VIRTCHNL_OP_ADD_FDIR_FILTER = 47, VIRTCHNL_OP_DEL_FDIR_FILTER = 48, + VIRTCHNL_OP_MAX, }; /* These macros are used to generate compilation errors if a structure/union -- cgit v1.2.3 From 142da08c4dc0afd07f9136b4812d5386bd6e1717 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Tue, 2 Mar 2021 10:12:12 -0800 Subject: ice: Advertise virtchnl UDP segmentation offload capability As the hardware is capable of supporting UDP segmentation offload, add a capability bit to virtchnl.h to communicate this and have the driver advertise its support. 
Suggested-by: Jesse Brandeburg Signed-off-by: Brett Creeley Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 3 +++ include/linux/avf/virtchnl.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index ccd6b3e8a5a9..1292a0b06eb5 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -2118,6 +2118,9 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) if (vf->driver_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) vfres->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_USO) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_USO; + vfres->num_vsis = 1; /* Tx and Rx queue are equal for VF */ vfres->num_queue_pairs = vsi->num_txq; diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index debdd196773b..9e0341cf2c36 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -251,6 +251,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM 0X00200000 #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM 0X00400000 #define VIRTCHNL_VF_OFFLOAD_ADQ 0X00800000 +#define VIRTCHNL_VF_OFFLOAD_USO 0X02000000 #define VIRTCHNL_VF_OFFLOAD_FDIR_PF 0X10000000 /* Define below the capability flags that are not offloads */ -- cgit v1.2.3 From 222a8ab01698148c00c271cda82d96f4e6e7b0a8 Mon Sep 17 00:00:00 2001 From: Qi Zhang Date: Tue, 13 Apr 2021 08:48:39 +0800 Subject: ice: Enable RSS configure for AVF Currently, RSS hash input is not available to AVF by ethtool, it is set by the PF directly. Add the RSS configure support for AVF through new virtchnl message, and define the capability flag VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF to query this new RSS offload support. 
Signed-off-by: Jia Guo Signed-off-by: Qi Zhang Signed-off-by: Haiyue Wang Tested-by: Bo Chen Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_flow.h | 3 + .../ethernet/intel/ice/ice_virtchnl_allowlist.c | 6 + drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 453 +++++++++++++++++++++ include/linux/avf/virtchnl.h | 25 +- 4 files changed, 486 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ice/ice_flow.h b/drivers/net/ethernet/intel/ice/ice_flow.h index eec9def8ffca..2f68b59ace7e 100644 --- a/drivers/net/ethernet/intel/ice/ice_flow.h +++ b/drivers/net/ethernet/intel/ice/ice_flow.h @@ -8,6 +8,9 @@ #define ICE_FLOW_FLD_OFF_INVAL 0xffff /* Generate flow hash field from flow field type(s) */ +#define ICE_FLOW_HASH_ETH \ + (BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_DA) | \ + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_SA)) #define ICE_FLOW_HASH_IPV4 \ (BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | \ BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c index 5a0fbb47346f..9feebe5f556c 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_allowlist.c @@ -61,6 +61,11 @@ static const u32 rss_pf_allowlist_opcodes[] = { VIRTCHNL_OP_GET_RSS_HENA_CAPS, VIRTCHNL_OP_SET_RSS_HENA, }; +/* VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF */ +static const u32 adv_rss_pf_allowlist_opcodes[] = { + VIRTCHNL_OP_ADD_RSS_CFG, VIRTCHNL_OP_DEL_RSS_CFG, +}; + /* VIRTCHNL_VF_OFFLOAD_FDIR_PF */ static const u32 fdir_pf_allowlist_opcodes[] = { VIRTCHNL_OP_ADD_FDIR_FILTER, VIRTCHNL_OP_DEL_FDIR_FILTER, @@ -82,6 +87,7 @@ static const struct allowlist_opcode_info allowlist_opcodes[] = { ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_REQ_QUEUES, req_queues_allowlist_opcodes), ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_VLAN, vlan_allowlist_opcodes), ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_RSS_PF, rss_pf_allowlist_opcodes), + 
ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF, adv_rss_pf_allowlist_opcodes), ALLOW_ITEM(VIRTCHNL_VF_OFFLOAD_FDIR_PF, fdir_pf_allowlist_opcodes), }; diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index baada80c98ab..ca778a80d363 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -5,8 +5,248 @@ #include "ice_base.h" #include "ice_lib.h" #include "ice_fltr.h" +#include "ice_flow.h" #include "ice_virtchnl_allowlist.h" +#define FIELD_SELECTOR(proto_hdr_field) \ + BIT((proto_hdr_field) & PROTO_HDR_FIELD_MASK) + +struct ice_vc_hdr_match_type { + u32 vc_hdr; /* virtchnl headers (VIRTCHNL_PROTO_HDR_XXX) */ + u32 ice_hdr; /* ice headers (ICE_FLOW_SEG_HDR_XXX) */ +}; + +static const struct ice_vc_hdr_match_type ice_vc_hdr_list_os[] = { + {VIRTCHNL_PROTO_HDR_NONE, ICE_FLOW_SEG_HDR_NONE}, + {VIRTCHNL_PROTO_HDR_IPV4, ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER}, + {VIRTCHNL_PROTO_HDR_IPV6, ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER}, + {VIRTCHNL_PROTO_HDR_TCP, ICE_FLOW_SEG_HDR_TCP}, + {VIRTCHNL_PROTO_HDR_UDP, ICE_FLOW_SEG_HDR_UDP}, + {VIRTCHNL_PROTO_HDR_SCTP, ICE_FLOW_SEG_HDR_SCTP}, +}; + +static const struct ice_vc_hdr_match_type ice_vc_hdr_list_comms[] = { + {VIRTCHNL_PROTO_HDR_NONE, ICE_FLOW_SEG_HDR_NONE}, + {VIRTCHNL_PROTO_HDR_ETH, ICE_FLOW_SEG_HDR_ETH}, + {VIRTCHNL_PROTO_HDR_S_VLAN, ICE_FLOW_SEG_HDR_VLAN}, + {VIRTCHNL_PROTO_HDR_C_VLAN, ICE_FLOW_SEG_HDR_VLAN}, + {VIRTCHNL_PROTO_HDR_IPV4, ICE_FLOW_SEG_HDR_IPV4 | + ICE_FLOW_SEG_HDR_IPV_OTHER}, + {VIRTCHNL_PROTO_HDR_IPV6, ICE_FLOW_SEG_HDR_IPV6 | + ICE_FLOW_SEG_HDR_IPV_OTHER}, + {VIRTCHNL_PROTO_HDR_TCP, ICE_FLOW_SEG_HDR_TCP}, + {VIRTCHNL_PROTO_HDR_UDP, ICE_FLOW_SEG_HDR_UDP}, + {VIRTCHNL_PROTO_HDR_SCTP, ICE_FLOW_SEG_HDR_SCTP}, + {VIRTCHNL_PROTO_HDR_PPPOE, ICE_FLOW_SEG_HDR_PPPOE}, + {VIRTCHNL_PROTO_HDR_GTPU_IP, ICE_FLOW_SEG_HDR_GTPU_IP}, + {VIRTCHNL_PROTO_HDR_GTPU_EH, 
ICE_FLOW_SEG_HDR_GTPU_EH}, + {VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN, + ICE_FLOW_SEG_HDR_GTPU_DWN}, + {VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP, + ICE_FLOW_SEG_HDR_GTPU_UP}, + {VIRTCHNL_PROTO_HDR_L2TPV3, ICE_FLOW_SEG_HDR_L2TPV3}, + {VIRTCHNL_PROTO_HDR_ESP, ICE_FLOW_SEG_HDR_ESP}, + {VIRTCHNL_PROTO_HDR_AH, ICE_FLOW_SEG_HDR_AH}, + {VIRTCHNL_PROTO_HDR_PFCP, ICE_FLOW_SEG_HDR_PFCP_SESSION}, +}; + +struct ice_vc_hash_field_match_type { + u32 vc_hdr; /* virtchnl headers + * (VIRTCHNL_PROTO_HDR_XXX) + */ + u32 vc_hash_field; /* virtchnl hash fields selector + * FIELD_SELECTOR((VIRTCHNL_PROTO_HDR_ETH_XXX)) + */ + u64 ice_hash_field; /* ice hash fields + * (BIT_ULL(ICE_FLOW_FIELD_IDX_XXX)) + */ +}; + +static const struct +ice_vc_hash_field_match_type ice_vc_hash_field_list_os[] = { + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + ICE_FLOW_HASH_IPV4}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC), + 
BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST), + ICE_FLOW_HASH_IPV6}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + ICE_FLOW_HASH_IPV6 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT), + ICE_FLOW_HASH_TCP_PORT}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT), + ICE_FLOW_HASH_UDP_PORT}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT)}, + 
{VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT), + ICE_FLOW_HASH_SCTP_PORT}, +}; + +static const struct +ice_vc_hash_field_match_type ice_vc_hash_field_list_comms[] = { + {VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_SA)}, + {VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_DA)}, + {VIRTCHNL_PROTO_HDR_ETH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_DST), + ICE_FLOW_HASH_ETH}, + {VIRTCHNL_PROTO_HDR_ETH, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ETH_ETHERTYPE), + BIT_ULL(ICE_FLOW_FIELD_IDX_ETH_TYPE)}, + {VIRTCHNL_PROTO_HDR_S_VLAN, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_S_VLAN_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_S_VLAN)}, + {VIRTCHNL_PROTO_HDR_C_VLAN, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_C_VLAN_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_C_VLAN)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST), + ICE_FLOW_HASH_IPV4}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_DST) | + 
FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + ICE_FLOW_HASH_IPV4 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV4, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV4_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV4_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST), + ICE_FLOW_HASH_IPV6}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_SA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_DA) | + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_SRC) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_DST) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + ICE_FLOW_HASH_IPV6 | BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_IPV6, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_IPV6_PROT), + BIT_ULL(ICE_FLOW_FIELD_IDX_IPV6_PROT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_TCP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_TCP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_TCP_DST_PORT), + ICE_FLOW_HASH_TCP_PORT}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT), + 
BIT_ULL(ICE_FLOW_FIELD_IDX_UDP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_UDP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_UDP_DST_PORT), + ICE_FLOW_HASH_UDP_PORT}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_SRC_PORT)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT), + BIT_ULL(ICE_FLOW_FIELD_IDX_SCTP_DST_PORT)}, + {VIRTCHNL_PROTO_HDR_SCTP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT) | + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_SCTP_DST_PORT), + ICE_FLOW_HASH_SCTP_PORT}, + {VIRTCHNL_PROTO_HDR_PPPOE, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PPPOE_SESS_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_PPPOE_SESS_ID)}, + {VIRTCHNL_PROTO_HDR_GTPU_IP, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_GTPU_IP_TEID), + BIT_ULL(ICE_FLOW_FIELD_IDX_GTPU_IP_TEID)}, + {VIRTCHNL_PROTO_HDR_L2TPV3, + FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_L2TPV3_SESS_ID), + BIT_ULL(ICE_FLOW_FIELD_IDX_L2TPV3_SESS_ID)}, + {VIRTCHNL_PROTO_HDR_ESP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_ESP_SPI), + BIT_ULL(ICE_FLOW_FIELD_IDX_ESP_SPI)}, + {VIRTCHNL_PROTO_HDR_AH, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_AH_SPI), + BIT_ULL(ICE_FLOW_FIELD_IDX_AH_SPI)}, + {VIRTCHNL_PROTO_HDR_PFCP, FIELD_SELECTOR(VIRTCHNL_PROTO_HDR_PFCP_SEID), + BIT_ULL(ICE_FLOW_FIELD_IDX_PFCP_SEID)}, +}; + /** * ice_get_vf_vsi - get VF's VSI based on the stored index * @vf: VF used to get VSI @@ -2121,6 +2361,9 @@ static int ice_vc_get_vf_res_msg(struct ice_vf *vf, u8 *msg) if (vf->driver_caps & VIRTCHNL_VF_CAP_ADV_LINK_SPEED) vfres->vf_cap_flags |= VIRTCHNL_VF_CAP_ADV_LINK_SPEED; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF) + vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF; + if (vf->driver_caps & VIRTCHNL_VF_OFFLOAD_USO) vfres->vf_cap_flags |= VIRTCHNL_VF_OFFLOAD_USO; @@ -2234,6 +2477,210 @@ static bool ice_vc_isvalid_ring_len(u16 ring_len) !(ring_len % ICE_REQ_DESC_MULTIPLE)); } +/** + * ice_vc_parse_rss_cfg - parses hash fields and 
headers from + * a specific virtchnl RSS cfg + * @hw: pointer to the hardware + * @rss_cfg: pointer to the virtchnl RSS cfg + * @addl_hdrs: pointer to the protocol header fields (ICE_FLOW_SEG_HDR_*) + * to configure + * @hash_flds: pointer to the hash bit fields (ICE_FLOW_HASH_*) to configure + * + * Return true if all the protocol header and hash fields in the RSS cfg could + * be parsed, else return false + * + * This function parses the virtchnl RSS cfg to be the intended + * hash fields and the intended header for RSS configuration + */ +static bool +ice_vc_parse_rss_cfg(struct ice_hw *hw, struct virtchnl_rss_cfg *rss_cfg, + u32 *addl_hdrs, u64 *hash_flds) +{ + const struct ice_vc_hash_field_match_type *hf_list; + const struct ice_vc_hdr_match_type *hdr_list; + int i, hf_list_len, hdr_list_len; + + if (!strncmp(hw->active_pkg_name, "ICE COMMS Package", + sizeof(hw->active_pkg_name))) { + hf_list = ice_vc_hash_field_list_comms; + hf_list_len = ARRAY_SIZE(ice_vc_hash_field_list_comms); + hdr_list = ice_vc_hdr_list_comms; + hdr_list_len = ARRAY_SIZE(ice_vc_hdr_list_comms); + } else { + hf_list = ice_vc_hash_field_list_os; + hf_list_len = ARRAY_SIZE(ice_vc_hash_field_list_os); + hdr_list = ice_vc_hdr_list_os; + hdr_list_len = ARRAY_SIZE(ice_vc_hdr_list_os); + } + + for (i = 0; i < rss_cfg->proto_hdrs.count; i++) { + struct virtchnl_proto_hdr *proto_hdr = + &rss_cfg->proto_hdrs.proto_hdr[i]; + bool hdr_found = false; + int j; + + /* Find matched ice headers according to virtchnl headers. */ + for (j = 0; j < hdr_list_len; j++) { + struct ice_vc_hdr_match_type hdr_map = hdr_list[j]; + + if (proto_hdr->type == hdr_map.vc_hdr) { + *addl_hdrs |= hdr_map.ice_hdr; + hdr_found = true; + } + } + + if (!hdr_found) + return false; + + /* Find matched ice hash fields according to + * virtchnl hash fields. 
+ */ + for (j = 0; j < hf_list_len; j++) { + struct ice_vc_hash_field_match_type hf_map = hf_list[j]; + + if (proto_hdr->type == hf_map.vc_hdr && + proto_hdr->field_selector == hf_map.vc_hash_field) { + *hash_flds |= hf_map.ice_hash_field; + break; + } + } + } + + return true; +} + +/** + * ice_vf_adv_rss_offload_ena - determine if capabilities support advanced + * RSS offloads + * @caps: VF driver negotiated capabilities + * + * Return true if VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF capability is set, + * else return false + */ +static bool ice_vf_adv_rss_offload_ena(u32 caps) +{ + return !!(caps & VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF); +} + +/** + * ice_vc_handle_rss_cfg + * @vf: pointer to the VF info + * @msg: pointer to the message buffer + * @add: add a RSS config if true, otherwise delete a RSS config + * + * This function adds/deletes a RSS config + */ +static int ice_vc_handle_rss_cfg(struct ice_vf *vf, u8 *msg, bool add) +{ + u32 v_opcode = add ? VIRTCHNL_OP_ADD_RSS_CFG : VIRTCHNL_OP_DEL_RSS_CFG; + struct virtchnl_rss_cfg *rss_cfg = (struct virtchnl_rss_cfg *)msg; + enum virtchnl_status_code v_ret = VIRTCHNL_STATUS_SUCCESS; + struct device *dev = ice_pf_to_dev(vf->pf); + struct ice_hw *hw = &vf->pf->hw; + struct ice_vsi *vsi; + + if (!test_bit(ICE_FLAG_RSS_ENA, vf->pf->flags)) { + dev_dbg(dev, "VF %d attempting to configure RSS, but RSS is not supported by the PF\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + goto error_param; + } + + if (!ice_vf_adv_rss_offload_ena(vf->driver_caps)) { + dev_dbg(dev, "VF %d attempting to configure RSS, but Advanced RSS offload is not supported\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (!test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (rss_cfg->proto_hdrs.count > VIRTCHNL_MAX_NUM_PROTO_HDRS || + rss_cfg->rss_algorithm < VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC || + rss_cfg->rss_algorithm > 
VIRTCHNL_RSS_ALG_XOR_SYMMETRIC) { + dev_dbg(dev, "VF %d attempting to configure RSS, but RSS configuration is not valid\n", + vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + vsi = ice_get_vf_vsi(vf); + if (!vsi) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (rss_cfg->rss_algorithm == VIRTCHNL_RSS_ALG_R_ASYMMETRIC) { + struct ice_vsi_ctx *ctx; + enum ice_status status; + u8 lut_type, hash_type; + + lut_type = ICE_AQ_VSI_Q_OPT_RSS_LUT_VSI; + hash_type = add ? ICE_AQ_VSI_Q_OPT_RSS_XOR : + ICE_AQ_VSI_Q_OPT_RSS_TPLZ; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + v_ret = VIRTCHNL_STATUS_ERR_NO_MEMORY; + goto error_param; + } + + ctx->info.q_opt_rss = ((lut_type << + ICE_AQ_VSI_Q_OPT_RSS_LUT_S) & + ICE_AQ_VSI_Q_OPT_RSS_LUT_M) | + (hash_type & + ICE_AQ_VSI_Q_OPT_RSS_HASH_M); + + /* Preserve existing queueing option setting */ + ctx->info.q_opt_rss |= (vsi->info.q_opt_rss & + ICE_AQ_VSI_Q_OPT_RSS_GBL_LUT_M); + ctx->info.q_opt_tc = vsi->info.q_opt_tc; + ctx->info.q_opt_flags = vsi->info.q_opt_rss; + + ctx->info.valid_sections = + cpu_to_le16(ICE_AQ_VSI_PROP_Q_OPT_VALID); + + status = ice_update_vsi(hw, vsi->idx, ctx, NULL); + if (status) { + dev_err(dev, "update VSI for RSS failed, err %s aq_err %s\n", + ice_stat_str(status), + ice_aq_str(hw->adminq.sq_last_status)); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + } else { + vsi->info.q_opt_rss = ctx->info.q_opt_rss; + } + + kfree(ctx); + } else { + u32 addl_hdrs = ICE_FLOW_SEG_HDR_NONE; + u64 hash_flds = ICE_HASH_INVALID; + + if (!ice_vc_parse_rss_cfg(hw, rss_cfg, &addl_hdrs, + &hash_flds)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto error_param; + } + + if (add) { + if (ice_add_rss_cfg(hw, vsi->idx, hash_flds, + addl_hdrs)) { + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + dev_err(dev, "ice_add_rss_cfg failed for vsi = %d, v_ret = %d\n", + vsi->vsi_num, v_ret); + } + } else { + v_ret = VIRTCHNL_STATUS_ERR_NOT_SUPPORTED; + dev_err(dev, "RSS removal not supported\n"); + } + 
} + +error_param: + return ice_vc_send_msg_to_vf(vf, v_opcode, v_ret, NULL, 0); +} + /** * ice_vc_config_rss_key * @vf: pointer to the VF info @@ -3931,6 +4378,12 @@ error_handler: case VIRTCHNL_OP_DEL_FDIR_FILTER: err = ice_vc_del_fdir_fltr(vf, msg); break; + case VIRTCHNL_OP_ADD_RSS_CFG: + err = ice_vc_handle_rss_cfg(vf, msg, true); + break; + case VIRTCHNL_OP_DEL_RSS_CFG: + err = ice_vc_handle_rss_cfg(vf, msg, false); + break; case VIRTCHNL_OP_UNKNOWN: default: dev_err(dev, "Unsupported opcode %d from VF %d\n", v_opcode, diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 9e0341cf2c36..565deea6ffe8 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -136,7 +136,9 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_CHANNELS = 31, VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, - /* opcode 34 - 46 are reserved */ + /* opcode 34 - 44 are reserved */ + VIRTCHNL_OP_ADD_RSS_CFG = 45, + VIRTCHNL_OP_DEL_RSS_CFG = 46, VIRTCHNL_OP_ADD_FDIR_FILTER = 47, VIRTCHNL_OP_DEL_FDIR_FILTER = 48, VIRTCHNL_OP_MAX, @@ -252,6 +254,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM 0X00400000 #define VIRTCHNL_VF_OFFLOAD_ADQ 0X00800000 #define VIRTCHNL_VF_OFFLOAD_USO 0X02000000 +#define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF 0X08000000 #define VIRTCHNL_VF_OFFLOAD_FDIR_PF 0X10000000 /* Define below the capability flags that are not offloads */ @@ -677,6 +680,14 @@ enum virtchnl_vfr_states { VIRTCHNL_VFR_VFACTIVE, }; +/* Type of RSS algorithm */ +enum virtchnl_rss_algorithm { + VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC = 0, + VIRTCHNL_RSS_ALG_R_ASYMMETRIC = 1, + VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC = 2, + VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, +}; + #define VIRTCHNL_MAX_NUM_PROTO_HDRS 32 #define PROTO_HDR_SHIFT 5 #define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT) @@ -832,6 +843,14 @@ struct virtchnl_proto_hdrs { VIRTCHNL_CHECK_STRUCT_LEN(2312, 
virtchnl_proto_hdrs); +struct virtchnl_rss_cfg { + struct virtchnl_proto_hdrs proto_hdrs; /* protocol headers */ + enum virtchnl_rss_algorithm rss_algorithm; /* RSS algorithm type */ + u8 reserved[128]; /* reserve for future */ +}; + +VIRTCHNL_CHECK_STRUCT_LEN(2444, virtchnl_rss_cfg); + /* action configuration for FDIR */ struct virtchnl_filter_action { enum virtchnl_action type; @@ -1100,6 +1119,10 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_DEL_CLOUD_FILTER: valid_len = sizeof(struct virtchnl_filter); break; + case VIRTCHNL_OP_ADD_RSS_CFG: + case VIRTCHNL_OP_DEL_RSS_CFG: + valid_len = sizeof(struct virtchnl_rss_cfg); + break; case VIRTCHNL_OP_ADD_FDIR_FILTER: valid_len = sizeof(struct virtchnl_fdir_add); break; -- cgit v1.2.3 From 96874c619c200bc704ae2d8e34a3746350922135 Mon Sep 17 00:00:00 2001 From: Mohammad Athari Bin Ismail Date: Thu, 22 Apr 2021 15:55:00 +0800 Subject: net: stmmac: Add HW descriptor prefetch setting for DWMAC Core 5.20 onwards DWMAC Core 5.20 onwards supports HW descriptor prefetching. Additionally, it also depends on platform specific RTL configuration. This capability could be enabled by setting DMA_Mode bit-19 (DCHE). So, to enable this capability, platform must set plat->dma_cfg->dche = true and the DWMAC core version must be 5.20 onwards. Else, this capability wouldn't be configured. Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/common.h | 1 + drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c | 10 ++++++++-- drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +++++ include/linux/stmmac.h | 1 + 5 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index c54a56b732b3..619e3c0760d6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -33,6 +33,7 @@ #define DWMAC_CORE_4_10 0x41 #define DWMAC_CORE_5_00 0x50 #define DWMAC_CORE_5_10 0x51 +#define DWMAC_CORE_5_20 0x52 #define DWXGMAC_CORE_2_10 0x21 #define DWXLGMAC_CORE_2_00 0x20 diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index cb17f6c35e54..a602d16b9e53 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -162,12 +162,18 @@ static void dwmac4_dma_init(void __iomem *ioaddr, writel(value, ioaddr + DMA_SYS_BUS_MODE); + value = readl(ioaddr + DMA_BUS_MODE); + if (dma_cfg->multi_msi_en) { - value = readl(ioaddr + DMA_BUS_MODE); value &= ~DMA_BUS_MODE_INTM_MASK; value |= (DMA_BUS_MODE_INTM_MODE1 << DMA_BUS_MODE_INTM_SHIFT); - writel(value, ioaddr + DMA_BUS_MODE); } + + if (dma_cfg->dche) + value |= DMA_BUS_MODE_DCHE; + + writel(value, ioaddr + DMA_BUS_MODE); + } static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h index 05481eb13ba6..9321879b599c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.h @@ -25,6 +25,7 @@ #define DMA_TBS_CTRL 0x00001050 /* DMA Bus Mode bitmap */ +#define DMA_BUS_MODE_DCHE BIT(19) #define DMA_BUS_MODE_INTM_MASK 
GENMASK(17, 16) #define DMA_BUS_MODE_INTM_SHIFT 16 #define DMA_BUS_MODE_INTM_MODE1 0x1 diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d1ca07c846e6..372090e8ee6f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -6849,6 +6849,11 @@ int stmmac_dvr_probe(struct device *device, if (ret) goto error_hw_init; + /* Only DWMAC core version 5.20 onwards supports HW descriptor prefetch. + */ + if (priv->synopsys_id < DWMAC_CORE_5_20) + priv->plat->dma_cfg->dche = false; + stmmac_check_ether_addr(priv); ndev->netdev_ops = &stmmac_netdev_ops; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 97edb31d6310..0db36360ef21 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -97,6 +97,7 @@ struct stmmac_dma_cfg { bool aal; bool eame; bool multi_msi_en; + bool dche; }; #define AXI_BLEN 7 -- cgit v1.2.3 From 06ec5acc7747f225154fcafaf2afe52324694baa Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 2 Mar 2021 13:54:42 +0200 Subject: net/mlx5: E-Switch, Return eswitch max ports when eswitch is supported mlx5_eswitch_get_total_vports() doesn't honor MLX5_ESWITCH Kconfig flag. When MLX5_ESWITCH is disabled, FS layer continues to initialize eswitch specific ACL namespaces. Instead, start honoring MLX5_ESWITCH flag and perform vport specific initialization only when vport count is non-zero.
Signed-off-by: Parav Pandit Reviewed-by: Roi Dayan Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 13 +++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/vport.c | 14 -------------- include/linux/mlx5/eswitch.h | 11 +++++++++-- 3 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 1bb229ecd43b..c3a58224ae12 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -2205,3 +2205,16 @@ void mlx5_esw_unlock(struct mlx5_eswitch *esw) { up_write(&esw->mode_lock); } + +/** + * mlx5_eswitch_get_total_vports - Get total vports of the eswitch + * + * @dev: Pointer to core device + * + * mlx5_eswitch_get_total_vports returns total number of eswitch vports. + */ +u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) +{ + return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) + mlx5_sf_max_functions(dev); +} +EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index e05c5c0f3ae1..457ad42eaa2a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -1151,20 +1151,6 @@ u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev) } EXPORT_SYMBOL_GPL(mlx5_query_nic_system_image_guid); -/** - * mlx5_eswitch_get_total_vports - Get total vports of the eswitch - * - * @dev: Pointer to core device - * - * mlx5_eswitch_get_total_vports returns total number of vports for - * the eswitch. 
- */ -u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) -{ - return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) + mlx5_sf_max_functions(dev); -} -EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); - int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out) { u16 opmod = (MLX5_CAP_GENERAL << 1) | (HCA_CAP_OPMOD_GET_MAX & 0x01); diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 9cf1da2883c6..17109b65c1ac 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -65,8 +65,6 @@ struct mlx5_flow_handle * mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *on_esw, struct mlx5_eswitch_rep *rep, u32 sqn); -u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); - #ifdef CONFIG_MLX5_ESWITCH enum devlink_eswitch_encap_mode mlx5_eswitch_get_encap_mode(const struct mlx5_core_dev *dev); @@ -126,6 +124,8 @@ u32 mlx5_eswitch_get_vport_metadata_for_set(struct mlx5_eswitch *esw, #define ESW_TUN_SLOW_TABLE_GOTO_VPORT_MARK ESW_TUN_OPTS_MASK u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev); +u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev); + #else /* CONFIG_MLX5_ESWITCH */ static inline u8 mlx5_eswitch_mode(struct mlx5_core_dev *dev) @@ -162,10 +162,17 @@ mlx5_eswitch_get_vport_metadata_mask(void) { return 0; } + +static inline u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) +{ + return 0; +} + #endif /* CONFIG_MLX5_ESWITCH */ static inline bool is_mdev_switchdev_mode(struct mlx5_core_dev *dev) { return mlx5_eswitch_mode(dev) == MLX5_ESWITCH_OFFLOADS; } + #endif -- cgit v1.2.3 From 9f8c7100c8f9879b7e972205cd1f33f0bc1cc8cb Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 2 Mar 2021 14:10:49 +0200 Subject: net/mlx5: E-Switch, Prepare to return total vports from eswitch struct Total vports are already stored during eswitch initialization. Instead of calculating everytime, read directly from eswitch. 
Additionally, host PF's SF vport information is available using QUERY_HCA_CAP command. It is not available through HCA_CAP of the eswitch manager PF. Hence, this patch prepares to return the total eswitch vport count from the existing eswitch struct. This further helps to keep eswitch port counting macros and logic within eswitch. Signed-off-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 9 ++++++--- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 8 ++++++++ include/linux/mlx5/vport.h | 8 -------- 3 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index c3a58224ae12..f0974aa94574 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1573,8 +1573,8 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev) if (!MLX5_VPORT_MANAGER(dev)) return 0; - total_vports = mlx5_eswitch_get_total_vports(dev); - + total_vports = MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) + + mlx5_sf_max_functions(dev); esw_info(dev, "Total vports %d, per vport: max uc(%d) max mc(%d)\n", total_vports, @@ -2215,6 +2215,9 @@ void mlx5_esw_unlock(struct mlx5_eswitch *esw) */ u16 mlx5_eswitch_get_total_vports(const struct mlx5_core_dev *dev) { - return MLX5_SPECIAL_VPORTS(dev) + mlx5_core_max_vfs(dev) + mlx5_sf_max_functions(dev); + struct mlx5_eswitch *esw; + + esw = dev->priv.eswitch; + return mlx5_esw_allowed(esw) ?
esw->total_vports : 0; } EXPORT_SYMBOL_GPL(mlx5_eswitch_get_total_vports); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index b289d756a7e4..5ab480a5745d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -545,6 +545,14 @@ static inline u16 mlx5_eswitch_first_host_vport_num(struct mlx5_core_dev *dev) MLX5_VPORT_PF : MLX5_VPORT_FIRST_VF; } +#define MLX5_VPORT_PF_PLACEHOLDER (1u) +#define MLX5_VPORT_UPLINK_PLACEHOLDER (1u) +#define MLX5_VPORT_ECPF_PLACEHOLDER(mdev) (mlx5_ecpf_vport_exists(mdev)) + +#define MLX5_SPECIAL_VPORTS(mdev) (MLX5_VPORT_PF_PLACEHOLDER + \ + MLX5_VPORT_UPLINK_PLACEHOLDER + \ + MLX5_VPORT_ECPF_PLACEHOLDER(mdev)) + static inline int mlx5_esw_sf_start_idx(const struct mlx5_eswitch *esw) { /* PF and VF vports indices start from 0 to max_vfs */ diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 4db87bcfce7b..aad53cb72f17 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -36,14 +36,6 @@ #include #include -#define MLX5_VPORT_PF_PLACEHOLDER (1u) -#define MLX5_VPORT_UPLINK_PLACEHOLDER (1u) -#define MLX5_VPORT_ECPF_PLACEHOLDER(mdev) (mlx5_ecpf_vport_exists(mdev)) - -#define MLX5_SPECIAL_VPORTS(mdev) (MLX5_VPORT_PF_PLACEHOLDER + \ - MLX5_VPORT_UPLINK_PLACEHOLDER + \ - MLX5_VPORT_ECPF_PLACEHOLDER(mdev)) - #define MLX5_VPORT_MANAGER(mdev) \ (MLX5_CAP_GEN(mdev, vport_group_manager) && \ (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \ -- cgit v1.2.3 From a1ab3e4554b5342b34845df452601ebd5a310d0a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 10 Mar 2021 15:35:03 +0200 Subject: devlink: Extend SF port attributes to have external attribute Extended SF port attributes to have optional external flag similar to PCI PF and VF port attributes. 
External attribute is required to generate a unique phys_port_name when PF number and SF number overlap between two controllers, similar to SR-IOV VFs. When an SF is for an external controller, an example view of the external SF port and config sequence is shown below. On eswitch system: $ devlink dev eswitch set pci/0033:01:00.0 mode switchdev $ devlink port show pci/0033:01:00.0/196607: type eth netdev enP51p1s0f0np0 flavour physical port 0 splittable false pci/0033:01:00.0/131072: type eth netdev eth0 flavour pcipf controller 1 pfnum 0 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port add pci/0033:01:00.0 flavour pcisf pfnum 0 sfnum 77 controller 1 pci/0033:01:00.0/163840: type eth netdev eth1 flavour pcisf controller 1 pfnum 0 sfnum 77 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached phys_port_name construction: $ cat /sys/class/net/eth1/phys_port_name c1pf0sf77 Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c | 2 +- include/net/devlink.h | 5 ++++- net/core/devlink.c | 11 ++++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index 8e825ef35cb7..183f782b940f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -141,7 +141,7 @@ int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_p mlx5_esw_get_port_parent_id(dev, &ppid); memcpy(dl_port->attrs.switch_id.id, &ppid.id[0], ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; - devlink_port_attrs_pci_sf_set(dl_port, 0, pfnum, sfnum); + devlink_port_attrs_pci_sf_set(dl_port, 0, pfnum, sfnum, false); devlink = priv_to_devlink(dev); dl_port_index =
mlx5_esw_vport_to_devlink_port_index(dev, vport_num); err = devlink_port_register(devlink, dl_port, dl_port_index); diff --git a/include/net/devlink.h b/include/net/devlink.h index 853420db5d32..7c984cadfec4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -98,11 +98,13 @@ struct devlink_port_pci_vf_attrs { * @controller: Associated controller number * @sf: Associated PCI SF for of the PCI PF for this port. * @pf: Associated PCI PF number for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_sf_attrs { u32 controller; u32 sf; u16 pf; + u8 external:1; }; /** @@ -1508,7 +1510,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external); void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, - u32 controller, u16 pf, u32 sf); + u32 controller, u16 pf, u32 sf, + bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/net/core/devlink.c b/net/core/devlink.c index 737b61c2976e..4eb969518ee0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8599,9 +8599,10 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u32 sf) + u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -8615,6 +8616,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro attrs->pci_sf.controller = controller; 
attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; + attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); @@ -8667,6 +8669,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (attrs->pci_sf.external) { + n = snprintf(name, len, "c%u", attrs->pci_sf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; -- cgit v1.2.3 From 885e8c68247cc2a9f1761a3d66fd274247a0faaf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 19 Apr 2021 18:16:49 +0200 Subject: netfilter: nat: move nf_xfrm_me_harder to where it is used remove the export and make it static. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat.h | 2 -- net/netfilter/nf_nat_core.c | 37 ------------------------------------- net/netfilter/nf_nat_proto.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index 0d412dd63707..987111ae5240 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -104,8 +104,6 @@ unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); -int nf_xfrm_me_harder(struct net *n, struct sk_buff *s, unsigned int family); - static inline int nf_nat_initialized(struct nf_conn *ct, enum nf_nat_manip_type manip) { diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index b7c3c902290f..7de595ead06a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -146,43 +146,6 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) return; } } - -int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) -{ - struct flowi fl; 
- unsigned int hh_len; - struct dst_entry *dst; - struct sock *sk = skb->sk; - int err; - - err = xfrm_decode_session(skb, &fl, family); - if (err < 0) - return err; - - dst = skb_dst(skb); - if (dst->xfrm) - dst = ((struct xfrm_dst *)dst)->route; - if (!dst_hold_safe(dst)) - return -EHOSTUNREACH; - - if (sk && !net_eq(net, sock_net(sk))) - sk = NULL; - - dst = xfrm_lookup(net, dst, &fl, sk, 0); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - skb_dst_drop(skb); - skb_dst_set(skb, dst); - - /* Change in oif may mean change in hh_len. */ - hh_len = skb_dst(skb)->dev->hard_header_len; - if (skb_headroom(skb) < hh_len && - pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) - return -ENOMEM; - return 0; -} -EXPORT_SYMBOL(nf_xfrm_me_harder); #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index 4731d21fc3ad..48cc60084d28 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -659,6 +659,44 @@ nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb, return ret; } +#ifdef CONFIG_XFRM +static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) +{ + struct sock *sk = skb->sk; + struct dst_entry *dst; + unsigned int hh_len; + struct flowi fl; + int err; + + err = xfrm_decode_session(skb, &fl, family); + if (err < 0) + return err; + + dst = skb_dst(skb); + if (dst->xfrm) + dst = ((struct xfrm_dst *)dst)->route; + if (!dst_hold_safe(dst)) + return -EHOSTUNREACH; + + if (sk && !net_eq(net, sock_net(sk))) + sk = NULL; + + dst = xfrm_lookup(net, dst, &fl, sk, 0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + /* Change in oif may mean change in hh_len. 
*/ + hh_len = skb_dst(skb)->dev->hard_header_len; + if (skb_headroom(skb) < hh_len && + pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) + return -ENOMEM; + return 0; +} +#endif + static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) -- cgit v1.2.3 From e0bb96db96f8ca94349344a2ea7bebc6f8cefdae Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 21 Apr 2021 01:12:44 +0200 Subject: netfilter: nft_socket: add support for cgroupsv2 Allow to match on the cgroupsv2 id from ancestor level. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 +++ net/netfilter/nft_socket.c | 48 +++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 79bab7a36b30..467365ed59a7 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1014,11 +1014,13 @@ enum nft_rt_attributes { * * @NFTA_SOCKET_KEY: socket key to match * @NFTA_SOCKET_DREG: destination register + * @NFTA_SOCKET_LEVEL: cgroups2 ancestor level (only for cgroupsv2) */ enum nft_socket_attributes { NFTA_SOCKET_UNSPEC, NFTA_SOCKET_KEY, NFTA_SOCKET_DREG, + NFTA_SOCKET_LEVEL, __NFTA_SOCKET_MAX }; #define NFTA_SOCKET_MAX (__NFTA_SOCKET_MAX - 1) @@ -1029,11 +1031,13 @@ enum nft_socket_attributes { * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option * @NFT_SOCKET_MARK: Value of the socket mark * @NFT_SOCKET_WILDCARD: Whether the socket is zero-bound (e.g. 
0.0.0.0 or ::0) + * @NFT_SOCKET_CGROUPV2: Match on cgroups version 2 */ enum nft_socket_keys { NFT_SOCKET_TRANSPARENT, NFT_SOCKET_MARK, NFT_SOCKET_WILDCARD, + NFT_SOCKET_CGROUPV2, __NFT_SOCKET_MAX }; #define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index c9b8a2b03b71..9c169d100651 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -9,6 +9,7 @@ struct nft_socket { enum nft_socket_keys key:8; + u8 level; union { u8 dreg; }; @@ -33,6 +34,26 @@ static void nft_socket_wildcard(const struct nft_pktinfo *pkt, } } +#ifdef CONFIG_CGROUPS +static noinline bool +nft_sock_get_eval_cgroupv2(u32 *dest, const struct nft_pktinfo *pkt, u32 level) +{ + struct sock *sk = skb_to_full_sk(pkt->skb); + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk))) + return false; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + if (level > cgrp->level) + return false; + + memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64)); + + return true; +} +#endif + static void nft_socket_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -85,6 +106,14 @@ static void nft_socket_eval(const struct nft_expr *expr, } nft_socket_wildcard(pkt, regs, sk, dest); break; +#ifdef CONFIG_CGROUPS + case NFT_SOCKET_CGROUPV2: + if (!nft_sock_get_eval_cgroupv2(dest, pkt, priv->level)) { + regs->verdict.code = NFT_BREAK; + return; + } + break; +#endif default: WARN_ON(1); regs->verdict.code = NFT_BREAK; @@ -97,6 +126,7 @@ static void nft_socket_eval(const struct nft_expr *expr, static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, + [NFTA_SOCKET_LEVEL] = { .type = NLA_U32 }, }; static int nft_socket_init(const struct nft_ctx *ctx, @@ -104,7 +134,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_socket 
*priv = nft_expr_priv(expr); - unsigned int len; + unsigned int len, level; if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY]) return -EINVAL; @@ -129,6 +159,19 @@ static int nft_socket_init(const struct nft_ctx *ctx, case NFT_SOCKET_MARK: len = sizeof(u32); break; +#ifdef CONFIG_CGROUPS + case NFT_SOCKET_CGROUPV2: + if (!tb[NFTA_SOCKET_LEVEL]) + return -EINVAL; + + level = ntohl(nla_get_u32(tb[NFTA_SOCKET_LEVEL])); + if (level > 255) + return -EOPNOTSUPP; + + priv->level = level; + len = sizeof(u64); + break; +#endif default: return -EOPNOTSUPP; } @@ -146,6 +189,9 @@ static int nft_socket_dump(struct sk_buff *skb, return -1; if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) return -1; + if (priv->key == NFT_SOCKET_CGROUPV2 && + nla_put_u32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level))) + return -1; return 0; } -- cgit v1.2.3 From de8c12110a130337c8e7e7b8250de0580e644dee Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:45:40 +0200 Subject: netfilter: disable defrag once its no longer needed When I changed defrag hooks to no longer get registered by default I intentionally made it so that registration can only be un-done by unloading the nf_defrag_ipv4/6 module. In hindsight this was too conservative; there is no reason to keep defrag on while there is no feature dependency anymore. Moreover, this won't work if user isn't allowed to remove nf_defrag module. This adds the disable() functions for both ipv4 and ipv6 and calls them from conntrack, TPROXY and the xtables socket module. ipvs isn't converted here, it will behave as before this patch and will need module removal. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_defrag_ipv4.h | 3 ++- include/net/netfilter/ipv6/nf_defrag_ipv6.h | 3 ++- net/ipv4/netfilter/nf_defrag_ipv4.c | 30 +++++++++++++++++++++++------ net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 29 ++++++++++++++++++++++------ net/netfilter/nf_conntrack_proto.c | 8 ++++++-- net/netfilter/nft_tproxy.c | 24 +++++++++++++++++++++++ net/netfilter/xt_TPROXY.c | 13 +++++++++++++ net/netfilter/xt_socket.c | 14 ++++++++++++++ 8 files changed, 108 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_defrag_ipv4.h b/include/net/netfilter/ipv4/nf_defrag_ipv4.h index bcbd724cc048..7fda9ce9f694 100644 --- a/include/net/netfilter/ipv4/nf_defrag_ipv4.h +++ b/include/net/netfilter/ipv4/nf_defrag_ipv4.h @@ -3,6 +3,7 @@ #define _NF_DEFRAG_IPV4_H struct net; -int nf_defrag_ipv4_enable(struct net *); +int nf_defrag_ipv4_enable(struct net *net); +void nf_defrag_ipv4_disable(struct net *net); #endif /* _NF_DEFRAG_IPV4_H */ diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h index ece923e2035b..0fd8a4159662 100644 --- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h @@ -5,7 +5,8 @@ #include #include -int nf_defrag_ipv6_enable(struct net *); +int nf_defrag_ipv6_enable(struct net *net); +void nf_defrag_ipv6_disable(struct net *net); int nf_ct_frag6_init(void); void nf_ct_frag6_cleanup(void); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index ffdcc2b9360f..613432a36f0a 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -141,14 +141,16 @@ int nf_defrag_ipv4_enable(struct net *net) struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); int err = 0; - might_sleep(); - - if (nf_defrag->users) - return 0; - mutex_lock(&defrag4_mutex); - if (nf_defrag->users) + if 
(nf_defrag->users == UINT_MAX) { + err = -EOVERFLOW; goto out_unlock; + } + + if (nf_defrag->users) { + nf_defrag->users++; + goto out_unlock; + } err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); @@ -161,6 +163,22 @@ int nf_defrag_ipv4_enable(struct net *net) } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); +void nf_defrag_ipv4_disable(struct net *net) +{ + struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); + + mutex_lock(&defrag4_mutex); + if (nf_defrag->users) { + nf_defrag->users--; + if (nf_defrag->users == 0) + nf_unregister_net_hooks(net, ipv4_defrag_ops, + ARRAY_SIZE(ipv4_defrag_ops)); + } + + mutex_unlock(&defrag4_mutex); +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable); + module_init(nf_defrag_init); module_exit(nf_defrag_fini); diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 402dc4ca9504..e8a59d8bf2ad 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -137,14 +137,16 @@ int nf_defrag_ipv6_enable(struct net *net) struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); int err = 0; - might_sleep(); - - if (nf_frag->users) - return 0; - mutex_lock(&defrag6_mutex); - if (nf_frag->users) + if (nf_frag->users == UINT_MAX) { + err = -EOVERFLOW; + goto out_unlock; + } + + if (nf_frag->users) { + nf_frag->users++; goto out_unlock; + } err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); @@ -157,6 +159,21 @@ int nf_defrag_ipv6_enable(struct net *net) } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); +void nf_defrag_ipv6_disable(struct net *net) +{ + struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); + + mutex_lock(&defrag6_mutex); + if (nf_frag->users) { + nf_frag->users--; + if (nf_frag->users == 0) + nf_unregister_net_hooks(net, ipv6_defrag_ops, + ARRAY_SIZE(ipv6_defrag_ops)); + } + mutex_unlock(&defrag6_mutex); +} 
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_disable); + module_init(nf_defrag_init); module_exit(nf_defrag_fini); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 47e9319d2cf3..89e5bac384d7 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -536,15 +536,19 @@ static void nf_ct_netns_do_put(struct net *net, u8 nfproto) mutex_lock(&nf_ct_proto_mutex); switch (nfproto) { case NFPROTO_IPV4: - if (cnet->users4 && (--cnet->users4 == 0)) + if (cnet->users4 && (--cnet->users4 == 0)) { nf_unregister_net_hooks(net, ipv4_conntrack_ops, ARRAY_SIZE(ipv4_conntrack_ops)); + nf_defrag_ipv4_disable(net); + } break; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: - if (cnet->users6 && (--cnet->users6 == 0)) + if (cnet->users6 && (--cnet->users6 == 0)) { nf_unregister_net_hooks(net, ipv6_conntrack_ops, ARRAY_SIZE(ipv6_conntrack_ops)); + nf_defrag_ipv6_disable(net); + } break; #endif case NFPROTO_BRIDGE: diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index 43a5a780a6d3..accef672088c 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -263,6 +263,29 @@ static int nft_tproxy_init(const struct nft_ctx *ctx, return 0; } +static void nft_tproxy_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_tproxy *priv = nft_expr_priv(expr); + + switch (priv->family) { + case NFPROTO_IPV4: + nf_defrag_ipv4_disable(ctx->net); + break; +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + case NFPROTO_IPV6: + nf_defrag_ipv6_disable(ctx->net); + break; +#endif + case NFPROTO_UNSPEC: + nf_defrag_ipv4_disable(ctx->net); +#if IS_ENABLED(CONFIG_NF_TABLES_IPV6) + nf_defrag_ipv6_disable(ctx->net); +#endif + break; + } +} + static int nft_tproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) { @@ -288,6 +311,7 @@ static const struct nft_expr_ops nft_tproxy_ops = { .size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)), .eval = nft_tproxy_eval, .init = nft_tproxy_init, + 
.destroy = nft_tproxy_destroy, .dump = nft_tproxy_dump, }; diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 194dc03341f3..459d0696c91a 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -200,6 +200,11 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par) pr_info_ratelimited("Can be used only with -p tcp or -p udp\n"); return -EINVAL; } + +static void tproxy_tg6_destroy(const struct xt_tgdtor_param *par) +{ + nf_defrag_ipv6_disable(par->net); +} #endif static int tproxy_tg4_check(const struct xt_tgchk_param *par) @@ -219,6 +224,11 @@ static int tproxy_tg4_check(const struct xt_tgchk_param *par) return -EINVAL; } +static void tproxy_tg4_destroy(const struct xt_tgdtor_param *par) +{ + nf_defrag_ipv4_disable(par->net); +} + static struct xt_target tproxy_tg_reg[] __read_mostly = { { .name = "TPROXY", @@ -228,6 +238,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .revision = 0, .targetsize = sizeof(struct xt_tproxy_target_info), .checkentry = tproxy_tg4_check, + .destroy = tproxy_tg4_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, @@ -239,6 +250,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .revision = 1, .targetsize = sizeof(struct xt_tproxy_target_info_v1), .checkentry = tproxy_tg4_check, + .destroy = tproxy_tg4_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, @@ -251,6 +263,7 @@ static struct xt_target tproxy_tg_reg[] __read_mostly = { .revision = 1, .targetsize = sizeof(struct xt_tproxy_target_info_v1), .checkentry = tproxy_tg6_check, + .destroy = tproxy_tg6_destroy, .hooks = 1 << NF_INET_PRE_ROUTING, .me = THIS_MODULE, }, diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 5f973987265d..5e6459e11605 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -216,6 +216,14 @@ static int socket_mt_v3_check(const struct xt_mtchk_param *par) return 0; } +static void socket_mt_destroy(const struct 
xt_mtdtor_param *par) +{ + if (par->family == NFPROTO_IPV4) + nf_defrag_ipv4_disable(par->net); + else if (par->family == NFPROTO_IPV6) + nf_defrag_ipv4_disable(par->net); +} + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", @@ -231,6 +239,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .revision = 1, .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, + .destroy = socket_mt_destroy, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -245,6 +254,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v1_check, .matchsize = sizeof(struct xt_socket_mtinfo1), + .destroy = socket_mt_destroy, .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, @@ -256,6 +266,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v2_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -268,6 +279,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v2_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -280,6 +292,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV4, .match = socket_mt4_v1_v2_v3, .checkentry = socket_mt_v3_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), @@ -292,6 +305,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .family = NFPROTO_IPV6, .match = socket_mt6_v1_v2_v3, .checkentry = socket_mt_v3_check, + .destroy = socket_mt_destroy, .matchsize = sizeof(struct 
xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), -- cgit v1.2.3 From 4c95e0728eee33df6b029a5fca82a67daeca201e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:50:59 +0200 Subject: netfilter: ebtables: remove the 3 ebtables pointers from struct net ebtables stores the table internal data (what gets passed to the ebt_do_table() interpreter) in struct net. nftables keeps the internal interpreter format in pernet lists and passes it via the netfilter core infrastructure (priv pointer). Do the same for ebtables: the nf_hook_ops are duplicated via kmemdup, then the ops->priv pointer is set to the table that is being registered. After that, the netfilter core passes this table info to the hookfn. This allows to remove the pointers from struct net. Same pattern can be applied to ip/ip6/arptables. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge/ebtables.h | 9 +++---- include/net/netns/x_tables.h | 8 ------ net/bridge/netfilter/ebtable_broute.c | 10 +++----- net/bridge/netfilter/ebtable_filter.c | 26 +++++++------------ net/bridge/netfilter/ebtable_nat.c | 27 +++++++------------- net/bridge/netfilter/ebtables.c | 42 +++++++++++++++++++++++-------- 6 files changed, 58 insertions(+), 64 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 3a956145a25c..a8178253ce53 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -100,6 +100,7 @@ struct ebt_table { unsigned int valid_hooks); /* the data used by the kernel */ struct ebt_table_info *private; + struct nf_hook_ops *ops; struct module *me; }; @@ -108,11 +109,9 @@ struct ebt_table { extern int ebt_register_table(struct net *net, const struct ebt_table *table, - const struct nf_hook_ops *ops, - struct ebt_table **res); -extern void ebt_unregister_table(struct net *net, struct 
ebt_table *table); -void ebt_unregister_table_pre_exit(struct net *net, const char *tablename, - const struct nf_hook_ops *ops); + const struct nf_hook_ops *ops); +extern void ebt_unregister_table(struct net *net, const char *tablename); +void ebt_unregister_table_pre_exit(struct net *net, const char *tablename); extern unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct ebt_table *table); diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h index 83c8ea2e87a6..d02316ec2906 100644 --- a/include/net/netns/x_tables.h +++ b/include/net/netns/x_tables.h @@ -5,16 +5,8 @@ #include #include -struct ebt_table; - struct netns_xt { bool notrack_deprecated_warning; bool clusterip_deprecated_warning; -#if defined(CONFIG_BRIDGE_NF_EBTABLES) || \ - defined(CONFIG_BRIDGE_NF_EBTABLES_MODULE) - struct ebt_table *broute_table; - struct ebt_table *frame_filter; - struct ebt_table *frame_nat; -#endif }; #endif diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 32bc2821027f..020b1487ee0c 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -66,8 +66,7 @@ static unsigned int ebt_broute(void *priv, struct sk_buff *skb, NFPROTO_BRIDGE, s->in, NULL, NULL, s->net, NULL); - ret = ebt_do_table(skb, &state, state.net->xt.broute_table); - + ret = ebt_do_table(skb, &state, priv); if (ret != NF_DROP) return ret; @@ -101,18 +100,17 @@ static const struct nf_hook_ops ebt_ops_broute = { static int __net_init broute_net_init(struct net *net) { - return ebt_register_table(net, &broute_table, &ebt_ops_broute, - &net->xt.broute_table); + return ebt_register_table(net, &broute_table, &ebt_ops_broute); } static void __net_exit broute_net_pre_exit(struct net *net) { - ebt_unregister_table_pre_exit(net, "broute", &ebt_ops_broute); + ebt_unregister_table_pre_exit(net, "broute"); } static void __net_exit broute_net_exit(struct net *net) { - 
ebt_unregister_table(net, net->xt.broute_table); + ebt_unregister_table(net, "broute"); } static struct pernet_operations broute_net_ops = { diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index bcf982e12f16..8ec0b3736803 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -59,34 +59,27 @@ static const struct ebt_table frame_filter = { }; static unsigned int -ebt_in_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +ebt_filter_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { - return ebt_do_table(skb, state, state->net->xt.frame_filter); -} - -static unsigned int -ebt_out_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ebt_do_table(skb, state, state->net->xt.frame_filter); + return ebt_do_table(skb, state, priv); } static const struct nf_hook_ops ebt_ops_filter[] = { { - .hook = ebt_in_hook, + .hook = ebt_filter_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { - .hook = ebt_in_hook, + .hook = ebt_filter_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { - .hook = ebt_out_hook, + .hook = ebt_filter_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_FILTER_OTHER, @@ -95,18 +88,17 @@ static const struct nf_hook_ops ebt_ops_filter[] = { static int __net_init frame_filter_net_init(struct net *net) { - return ebt_register_table(net, &frame_filter, ebt_ops_filter, - &net->xt.frame_filter); + return ebt_register_table(net, &frame_filter, ebt_ops_filter); } static void __net_exit frame_filter_net_pre_exit(struct net *net) { - ebt_unregister_table_pre_exit(net, "filter", ebt_ops_filter); + ebt_unregister_table_pre_exit(net, "filter"); } static void __net_exit frame_filter_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_filter); + ebt_unregister_table(net, 
"filter"); } static struct pernet_operations frame_filter_net_ops = { diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 0d092773f816..7c8a1064a531 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -58,35 +58,27 @@ static const struct ebt_table frame_nat = { .me = THIS_MODULE, }; -static unsigned int -ebt_nat_in(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) +static unsigned int ebt_nat_hook(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { - return ebt_do_table(skb, state, state->net->xt.frame_nat); -} - -static unsigned int -ebt_nat_out(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ebt_do_table(skb, state, state->net->xt.frame_nat); + return ebt_do_table(skb, state, priv); } static const struct nf_hook_ops ebt_ops_nat[] = { { - .hook = ebt_nat_out, + .hook = ebt_nat_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_NAT_DST_OTHER, }, { - .hook = ebt_nat_out, + .hook = ebt_nat_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_NAT_SRC, }, { - .hook = ebt_nat_in, + .hook = ebt_nat_hook, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_NAT_DST_BRIDGED, @@ -95,18 +87,17 @@ static const struct nf_hook_ops ebt_ops_nat[] = { static int __net_init frame_nat_net_init(struct net *net) { - return ebt_register_table(net, &frame_nat, ebt_ops_nat, - &net->xt.frame_nat); + return ebt_register_table(net, &frame_nat, ebt_ops_nat); } static void __net_exit frame_nat_net_pre_exit(struct net *net) { - ebt_unregister_table_pre_exit(net, "nat", ebt_ops_nat); + ebt_unregister_table_pre_exit(net, "nat"); } static void __net_exit frame_nat_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_nat); + ebt_unregister_table(net, "nat"); } static struct pernet_operations frame_nat_net_ops = { diff --git 
a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 96d789c8d1c7..a04596bb2a6e 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1136,15 +1136,18 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) vfree(table->private->entries); ebt_free_table_info(table->private); vfree(table->private); + kfree(table->ops); kfree(table); } int ebt_register_table(struct net *net, const struct ebt_table *input_table, - const struct nf_hook_ops *ops, struct ebt_table **res) + const struct nf_hook_ops *template_ops) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); struct ebt_table_info *newinfo; struct ebt_table *t, *table; + struct nf_hook_ops *ops; + unsigned int num_ops; struct ebt_replace_kernel *repl; int ret, i, countersize; void *p; @@ -1213,15 +1216,31 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table, ret = -ENOENT; goto free_unlock; } + + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto free_unlock; + } + + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + if (newinfo->nentries) + module_put(table->me); + goto free_unlock; + } + + for (i = 0; i < num_ops; i++) + ops[i].priv = table; + list_add(&table->list, &ebt_net->tables); mutex_unlock(&ebt_mutex); - WRITE_ONCE(*res, table); - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret) { + table->ops = ops; + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret) __ebt_unregister_table(net, table); - *res = NULL; - } audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries, AUDIT_XT_OP_REGISTER, GFP_KERNEL); @@ -1257,18 +1276,21 @@ static struct ebt_table *__ebt_find_table(struct net *net, const char *name) return NULL; } -void ebt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops) +void ebt_unregister_table_pre_exit(struct net *net, const char 
*name) { struct ebt_table *table = __ebt_find_table(net, name); if (table) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(ebt_unregister_table_pre_exit); -void ebt_unregister_table(struct net *net, struct ebt_table *table) +void ebt_unregister_table(struct net *net, const char *name) { - __ebt_unregister_table(net, table); + struct ebt_table *table = __ebt_find_table(net, name); + + if (table) + __ebt_unregister_table(net, table); } /* userspace just supplied us with counters */ -- cgit v1.2.3 From 7716bf090e97aec45e97907ec6a382e4610bdd8f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:51:00 +0200 Subject: netfilter: x_tables: remove ipt_unregister_table It's the same function as ipt_unregister_table_exit. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4/ip_tables.h | 3 --- include/linux/netfilter_ipv6/ip6_tables.h | 2 -- net/ipv4/netfilter/ip_tables.c | 9 --------- net/ipv4/netfilter/iptable_nat.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 9 --------- net/ipv6/netfilter/ip6table_nat.c | 2 +- 6 files changed, 2 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index c4676d6feeff..9f440eb6cf6c 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -31,9 +31,6 @@ void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table, void ipt_unregister_table_exit(struct net *net, struct xt_table *table); -void ipt_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops); - /* Standard entry. 
*/ struct ipt_standard { struct ipt_entry entry; diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 1547d5f9ae06..b88a27ce61b0 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -27,8 +27,6 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *ops, struct xt_table **res); -void ip6t_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops); void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops); void ip6t_unregister_table_exit(struct net *net, struct xt_table *table); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f77ea0dbe656..2fa7f28b88e3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1770,14 +1770,6 @@ void ipt_unregister_table_exit(struct net *net, struct xt_table *table) __ipt_unregister_table(net, table); } -void ipt_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops) -{ - if (ops) - ipt_unregister_table_pre_exit(net, table, ops); - __ipt_unregister_table(net, table); -} - /* Returns 1 if the type and code is matched by the range, 0 otherwise */ static inline bool icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, @@ -1924,7 +1916,6 @@ static void __exit ip_tables_fini(void) } EXPORT_SYMBOL(ipt_register_table); -EXPORT_SYMBOL(ipt_unregister_table); EXPORT_SYMBOL(ipt_unregister_table_pre_exit); EXPORT_SYMBOL(ipt_unregister_table_exit); EXPORT_SYMBOL(ipt_do_table); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index b0143b109f25..a89c1b9f94c2 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -105,7 +105,7 @@ static int 
__net_init iptable_nat_table_init(struct net *net) ret = ipt_nat_register_lookups(net); if (ret < 0) { - ipt_unregister_table(net, net->ipv4.nat_table, NULL); + ipt_unregister_table_exit(net, net->ipv4.nat_table); net->ipv4.nat_table = NULL; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index eb2b5404806c..e605c28cfed5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1780,14 +1780,6 @@ void ip6t_unregister_table_exit(struct net *net, struct xt_table *table) __ip6t_unregister_table(net, table); } -void ip6t_unregister_table(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops) -{ - if (ops) - ip6t_unregister_table_pre_exit(net, table, ops); - __ip6t_unregister_table(net, table); -} - /* Returns 1 if the type and code is matched by the range, 0 otherwise */ static inline bool icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, @@ -1935,7 +1927,6 @@ static void __exit ip6_tables_fini(void) } EXPORT_SYMBOL(ip6t_register_table); -EXPORT_SYMBOL(ip6t_unregister_table); EXPORT_SYMBOL(ip6t_unregister_table_pre_exit); EXPORT_SYMBOL(ip6t_unregister_table_exit); EXPORT_SYMBOL(ip6t_do_table); diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 0a23265e3caa..4cef1b405074 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -107,7 +107,7 @@ static int __net_init ip6table_nat_table_init(struct net *net) ret = ip6t_nat_register_lookups(net); if (ret < 0) { - ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL); + ip6t_unregister_table_exit(net, net->ipv6.ip6table_nat); net->ipv6.ip6table_nat = NULL; } kfree(repl); -- cgit v1.2.3 From 1ef4d6d1af2d0c0c7c9b391365a3894bea291e34 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:51:01 +0200 Subject: netfilter: x_tables: add xt_find_table This will be used to obtain the xt_table struct given address family and table 
name. Followup patches will reduce the number of direct accesses to the xt_table structures via net->ipv{4,6}.ip(6)table_{nat,mangle,...} pointers, then remove them. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + net/netfilter/x_tables.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 8ec48466410a..b2eec7de5280 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -322,6 +322,7 @@ struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision); int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err); +struct xt_table *xt_find_table(struct net *net, u8 af, const char *name); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name); struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index b7f8d2ed3cc2..1caba9507228 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1199,6 +1199,23 @@ void xt_free_table_info(struct xt_table_info *info) } EXPORT_SYMBOL(xt_free_table_info); +struct xt_table *xt_find_table(struct net *net, u8 af, const char *name) +{ + struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); + struct xt_table *t; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_net->tables[af], list) { + if (strcmp(t->name, name) == 0) { + mutex_unlock(&xt[af].mutex); + return t; + } + } + mutex_unlock(&xt[af].mutex); + return NULL; +} +EXPORT_SYMBOL(xt_find_table); + /* Find table by name, grabs mutex & ref. Returns ERR_PTR on error. 
*/ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) -- cgit v1.2.3 From 20a9df33594fe643f9cf46375a9243e3ab8ed3a6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:51:02 +0200 Subject: netfilter: iptables: unregister the tables by name xtables stores the xt_table structs in the struct net. This isn't needed anymore, the structures could be passed via the netfilter hook 'private' pointer to the hook functions, which would allow us to remove those pointers from struct net. As a first step, reduce the number of accesses to the net->ipv4.ip6table_{raw,filter,...} pointers. This allows the tables to get unregistered by name instead of having to pass the raw address. The xt_table structure can be looked up by name+address family instead. This patch is useless as-is (the backends still have the raw pointer address), but it lowers the bar to remove those. It also allows putting the 'was table registered in the first place' check into ip_tables.c rather than have it in each table sub module. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv4/ip_tables.h | 6 +++--- net/ipv4/netfilter/ip_tables.c | 14 ++++++++++---- net/ipv4/netfilter/iptable_filter.c | 8 ++------ net/ipv4/netfilter/iptable_mangle.c | 8 ++------ net/ipv4/netfilter/iptable_nat.c | 6 ++---- net/ipv4/netfilter/iptable_raw.c | 8 ++------ net/ipv4/netfilter/iptable_security.c | 8 ++------ 7 files changed, 23 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 9f440eb6cf6c..73bcf7f261d2 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -26,10 +26,10 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, const struct nf_hook_ops *ops, struct xt_table **res); -void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table, - const struct nf_hook_ops *ops); +void ipt_unregister_table_pre_exit(struct net *net, const char *name, + const struct nf_hook_ops *ops); -void ipt_unregister_table_exit(struct net *net, struct xt_table *table); +void ipt_unregister_table_exit(struct net *net, const char *name); /* Standard entry. 
*/ struct ipt_standard { diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2fa7f28b88e3..0b859ec2d3f8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1759,15 +1759,21 @@ out_free: return ret; } -void ipt_unregister_table_pre_exit(struct net *net, struct xt_table *table, +void ipt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); + + if (table) + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); } -void ipt_unregister_table_exit(struct net *net, struct xt_table *table) +void ipt_unregister_table_exit(struct net *net, const char *name) { - __ipt_unregister_table(net, table); + struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); + + if (table) + __ipt_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 8f7bc1ee7453..a39998c7977f 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -74,16 +74,12 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_filter) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_filter, - filter_ops); + ipt_unregister_table_pre_exit(net, "filter", filter_ops); } static void __net_exit iptable_filter_net_exit(struct net *net) { - if (!net->ipv4.iptable_filter) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_filter); + ipt_unregister_table_exit(net, "filter"); net->ipv4.iptable_filter = NULL; } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 833079589273..7d1713e22553 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ 
b/net/ipv4/netfilter/iptable_mangle.c @@ -102,16 +102,12 @@ static int __net_init iptable_mangle_table_init(struct net *net) static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_mangle) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_mangle, - mangle_ops); + ipt_unregister_table_pre_exit(net, "mangle", mangle_ops); } static void __net_exit iptable_mangle_net_exit(struct net *net) { - if (!net->ipv4.iptable_mangle) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_mangle); + ipt_unregister_table_exit(net, "mangle"); net->ipv4.iptable_mangle = NULL; } diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index a89c1b9f94c2..16bf3009642e 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -105,7 +105,7 @@ static int __net_init iptable_nat_table_init(struct net *net) ret = ipt_nat_register_lookups(net); if (ret < 0) { - ipt_unregister_table_exit(net, net->ipv4.nat_table); + ipt_unregister_table_exit(net, "nat"); net->ipv4.nat_table = NULL; } @@ -121,9 +121,7 @@ static void __net_exit iptable_nat_net_pre_exit(struct net *net) static void __net_exit iptable_nat_net_exit(struct net *net) { - if (!net->ipv4.nat_table) - return; - ipt_unregister_table_exit(net, net->ipv4.nat_table); + ipt_unregister_table_exit(net, "nat"); net->ipv4.nat_table = NULL; } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 9abfe6bf2cb9..a1f556464b93 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -69,16 +69,12 @@ static int __net_init iptable_raw_table_init(struct net *net) static void __net_exit iptable_raw_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_raw) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_raw, - rawtable_ops); + ipt_unregister_table_pre_exit(net, "raw", rawtable_ops); } static void __net_exit iptable_raw_net_exit(struct net *net) { - if (!net->ipv4.iptable_raw) - 
return; - ipt_unregister_table_exit(net, net->ipv4.iptable_raw); + ipt_unregister_table_exit(net, "raw"); net->ipv4.iptable_raw = NULL; } diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 415c1975d770..33eded4f9080 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -64,16 +64,12 @@ static int __net_init iptable_security_table_init(struct net *net) static void __net_exit iptable_security_net_pre_exit(struct net *net) { - if (net->ipv4.iptable_security) - ipt_unregister_table_pre_exit(net, net->ipv4.iptable_security, - sectbl_ops); + ipt_unregister_table_pre_exit(net, "security", sectbl_ops); } static void __net_exit iptable_security_net_exit(struct net *net) { - if (!net->ipv4.iptable_security) - return; - ipt_unregister_table_exit(net, net->ipv4.iptable_security); + ipt_unregister_table_exit(net, "security"); net->ipv4.iptable_security = NULL; } -- cgit v1.2.3 From 6c0717545f2ca61c95f5f739da845e77cc8bd498 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:51:03 +0200 Subject: netfilter: ip6tables: unregister the tables by name Same as the previous patch, but for ip6tables. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6/ip6_tables.h | 4 ++-- net/ipv6/netfilter/ip6_tables.c | 14 ++++++++++---- net/ipv6/netfilter/ip6table_filter.c | 9 +++------ net/ipv6/netfilter/ip6table_mangle.c | 9 ++------- net/ipv6/netfilter/ip6table_nat.c | 6 ++---- net/ipv6/netfilter/ip6table_raw.c | 9 +++------ net/ipv6/netfilter/ip6table_security.c | 8 ++------ 7 files changed, 24 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index b88a27ce61b0..8c07426e18a8 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -27,9 +27,9 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *ops, struct xt_table **res); -void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table, +void ip6t_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops); -void ip6t_unregister_table_exit(struct net *net, struct xt_table *table); +void ip6t_unregister_table_exit(struct net *net, const char *name); extern unsigned int ip6t_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e605c28cfed5..11c80da12ee3 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1769,15 +1769,21 @@ out_free: return ret; } -void ip6t_unregister_table_pre_exit(struct net *net, struct xt_table *table, +void ip6t_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); + + if (table) + 
nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); } -void ip6t_unregister_table_exit(struct net *net, struct xt_table *table) +void ip6t_unregister_table_exit(struct net *net, const char *name) { - __ip6t_unregister_table(net, table); + struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); + + if (table) + __ip6t_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 88337b51ffbf..0c9f75e23ca0 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -75,16 +75,13 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { - if (net->ipv6.ip6table_filter) - ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_filter, - filter_ops); + ip6t_unregister_table_pre_exit(net, "filter", + filter_ops); } static void __net_exit ip6table_filter_net_exit(struct net *net) { - if (!net->ipv6.ip6table_filter) - return; - ip6t_unregister_table_exit(net, net->ipv6.ip6table_filter); + ip6t_unregister_table_exit(net, "filter"); net->ipv6.ip6table_filter = NULL; } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index cee74803d7a1..9a2266662508 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -95,17 +95,12 @@ static int __net_init ip6table_mangle_table_init(struct net *net) static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { - if (net->ipv6.ip6table_mangle) - ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_mangle, - mangle_ops); + ip6t_unregister_table_pre_exit(net, "mangle", mangle_ops); } static void __net_exit ip6table_mangle_net_exit(struct net *net) { - if (!net->ipv6.ip6table_mangle) - return; - - ip6t_unregister_table_exit(net, net->ipv6.ip6table_mangle); + 
ip6t_unregister_table_exit(net, "mangle"); net->ipv6.ip6table_mangle = NULL; } diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 4cef1b405074..7eb61e6b1e52 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -107,7 +107,7 @@ static int __net_init ip6table_nat_table_init(struct net *net) ret = ip6t_nat_register_lookups(net); if (ret < 0) { - ip6t_unregister_table_exit(net, net->ipv6.ip6table_nat); + ip6t_unregister_table_exit(net, "nat"); net->ipv6.ip6table_nat = NULL; } kfree(repl); @@ -122,9 +122,7 @@ static void __net_exit ip6table_nat_net_pre_exit(struct net *net) static void __net_exit ip6table_nat_net_exit(struct net *net) { - if (!net->ipv6.ip6table_nat) - return; - ip6t_unregister_table_exit(net, net->ipv6.ip6table_nat); + ip6t_unregister_table_exit(net, "nat"); net->ipv6.ip6table_nat = NULL; } diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index 8f9e742226f7..c9a4aada40ba 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -68,16 +68,13 @@ static int __net_init ip6table_raw_table_init(struct net *net) static void __net_exit ip6table_raw_net_pre_exit(struct net *net) { - if (net->ipv6.ip6table_raw) - ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_raw, - rawtable_ops); + ip6t_unregister_table_pre_exit(net, "raw", + rawtable_ops); } static void __net_exit ip6table_raw_net_exit(struct net *net) { - if (!net->ipv6.ip6table_raw) - return; - ip6t_unregister_table_exit(net, net->ipv6.ip6table_raw); + ip6t_unregister_table_exit(net, "raw"); net->ipv6.ip6table_raw = NULL; } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 5e8c48fed032..73067e08662f 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -63,16 +63,12 @@ static int __net_init ip6table_security_table_init(struct net *net) static void __net_exit 
ip6table_security_net_pre_exit(struct net *net) { - if (net->ipv6.ip6table_security) - ip6t_unregister_table_pre_exit(net, net->ipv6.ip6table_security, - sectbl_ops); + ip6t_unregister_table_pre_exit(net, "security", sectbl_ops); } static void __net_exit ip6table_security_net_exit(struct net *net) { - if (!net->ipv6.ip6table_security) - return; - ip6t_unregister_table_exit(net, net->ipv6.ip6table_security); + ip6t_unregister_table_exit(net, "security"); net->ipv6.ip6table_security = NULL; } -- cgit v1.2.3 From 4d705399191c3cfe1264588b3a4a8115e6c3b161 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:51:04 +0200 Subject: netfilter: arptables: unregister the tables by name and again, this time for arptables. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_arp/arp_tables.h | 4 ++-- net/ipv4/netfilter/arp_tables.c | 14 ++++++++++---- net/ipv4/netfilter/arptable_filter.c | 8 ++------ 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index 26a13294318c..9ec73dcc8fd6 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -52,8 +52,8 @@ extern void *arpt_alloc_initial_table(const struct xt_table *); int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops, struct xt_table **res); -void arpt_unregister_table(struct net *net, struct xt_table *table); -void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table, +void arpt_unregister_table(struct net *net, const char *name); +void arpt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops); extern unsigned int arpt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 
d6d45d820d79..8a16b0dc5271 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1541,16 +1541,22 @@ out_free: return ret; } -void arpt_unregister_table_pre_exit(struct net *net, struct xt_table *table, +void arpt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops) { - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + + if (table) + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(arpt_unregister_table_pre_exit); -void arpt_unregister_table(struct net *net, struct xt_table *table) +void arpt_unregister_table(struct net *net, const char *name) { - __arpt_unregister_table(net, table); + struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); + + if (table) + __arpt_unregister_table(net, table); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 6c300ba5634e..c121e13dc78c 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -58,16 +58,12 @@ static int __net_init arptable_filter_table_init(struct net *net) static void __net_exit arptable_filter_net_pre_exit(struct net *net) { - if (net->ipv4.arptable_filter) - arpt_unregister_table_pre_exit(net, net->ipv4.arptable_filter, - arpfilter_ops); + arpt_unregister_table_pre_exit(net, "filter", arpfilter_ops); } static void __net_exit arptable_filter_net_exit(struct net *net) { - if (!net->ipv4.arptable_filter) - return; - arpt_unregister_table(net, net->ipv4.arptable_filter); + arpt_unregister_table(net, "filter"); net->ipv4.arptable_filter = NULL; } -- cgit v1.2.3 From ae689334225ff0e4ef112459ecd24aea932c2b00 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:51:07 +0200 Subject: netfilter: ip_tables: pass table pointer via nf_hook_ops iptable_x modules rely on 
'struct net' to contain a pointer to the table that should be evaluated. In order to remove these pointers from struct net, pass them via the 'priv' pointer in a similar fashion as nf_tables passes the rule data. To do that, duplicate the nf_hook_ops array passed in from the iptable_x modules, update the ops->priv pointers of the copy to refer to the table and then change the hookfn implementations to just pass the 'priv' argument to the traverser. After this patch, the xt_table pointers can already be removed from struct net. However, changes to struct net result in a recompile of the entire network stack, so do the removal after arptables and ip6tables have been converted as well. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 3 ++ include/linux/netfilter_ipv4/ip_tables.h | 6 ++-- net/ipv4/netfilter/ip_tables.c | 53 ++++++++++++++++++++++---------- net/ipv4/netfilter/iptable_filter.c | 8 ++--- net/ipv4/netfilter/iptable_mangle.c | 14 ++++----- net/ipv4/netfilter/iptable_nat.c | 26 ++++++++-------- net/ipv4/netfilter/iptable_raw.c | 8 ++--- net/ipv4/netfilter/iptable_security.c | 8 ++--- net/netfilter/x_tables.c | 1 + 9 files changed, 71 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index b2eec7de5280..a52cc22f806a 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -229,6 +229,9 @@ struct xt_table { /* Man behind the curtain... 
*/ struct xt_table_info *private; + /* hook ops that register the table with the netfilter core */ + struct nf_hook_ops *ops; + /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 73bcf7f261d2..0fdab3246ef5 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -24,11 +24,9 @@ int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, - const struct nf_hook_ops *ops, struct xt_table **res); - -void ipt_unregister_table_pre_exit(struct net *net, const char *name, - const struct nf_hook_ops *ops); + const struct nf_hook_ops *ops); +void ipt_unregister_table_pre_exit(struct net *net, const char *name); void ipt_unregister_table_exit(struct net *net, const char *name); /* Standard entry. */ diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 0b859ec2d3f8..d6caaed5dd45 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1716,9 +1716,11 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table) int ipt_register_table(struct net *net, const struct xt_table *table, const struct ipt_replace *repl, - const struct nf_hook_ops *ops, struct xt_table **res) + const struct nf_hook_ops *template_ops) { - int ret; + struct nf_hook_ops *ops; + unsigned int num_ops; + int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; @@ -1732,40 +1734,57 @@ int ipt_register_table(struct net *net, const struct xt_table *table, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); - if (ret != 0) - goto out_free; + if (ret != 0) { + xt_free_table_info(newinfo); + return ret; + } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { - ret = 
PTR_ERR(new_table); - goto out_free; + xt_free_table_info(newinfo); + return PTR_ERR(new_table); } - /* set res now, will see skbs right after nf_register_net_hooks */ - WRITE_ONCE(*res, new_table); - if (!ops) + /* No template? No need to do anything. This is used by 'nat' table, it registers + * with the nat core instead of the netfilter core. + */ + if (!template_ops) return 0; - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret != 0) { - __ipt_unregister_table(net, new_table); - *res = NULL; + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto out_free; } + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + goto out_free; + } + + for (i = 0; i < num_ops; i++) + ops[i].priv = new_table; + + new_table->ops = ops; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) + goto out_free; + return ret; out_free: - xt_free_table_info(newinfo); + __ipt_unregister_table(net, new_table); return ret; } -void ipt_unregister_table_pre_exit(struct net *net, const char *name, - const struct nf_hook_ops *ops) +void ipt_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name); if (table) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ipt_unregister_table_exit(struct net *net, const char *name) diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 84573fa78d1e..8272df7c6ad5 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -34,7 +34,7 @@ static unsigned int iptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.iptable_filter); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *filter_ops 
__read_mostly; @@ -55,8 +55,7 @@ static int __net_init iptable_filter_table_init(struct net *net) ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - err = ipt_register_table(net, &packet_filter, repl, filter_ops, - &net->ipv4.iptable_filter); + err = ipt_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } @@ -71,13 +70,12 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "filter", filter_ops); + ipt_unregister_table_pre_exit(net, "filter"); } static void __net_exit iptable_filter_net_exit(struct net *net) { ipt_unregister_table_exit(net, "filter"); - net->ipv4.iptable_filter = NULL; } static struct pernet_operations iptable_filter_net_ops = { diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 98e9e9053d85..2abc3836f391 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -37,7 +37,7 @@ static const struct xt_table packet_mangler = { }; static unsigned int -ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) +ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) { unsigned int ret; const struct iphdr *iph; @@ -53,7 +53,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); + ret = ipt_do_table(skb, state, priv); /* Reroute for ANY change. 
*/ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); @@ -78,8 +78,8 @@ iptable_mangle_hook(void *priv, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) - return ipt_mangle_out(skb, state); - return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle); + return ipt_mangle_out(skb, state, priv); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *mangle_ops __read_mostly; @@ -91,21 +91,19 @@ static int __net_init iptable_mangle_table_init(struct net *net) repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops, - &net->ipv4.iptable_mangle); + ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "mangle", mangle_ops); + ipt_unregister_table_pre_exit(net, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) { ipt_unregister_table_exit(net, "mangle"); - net->ipv4.iptable_mangle = NULL; } static struct pernet_operations iptable_mangle_net_ops = { diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index dfa9dc63a7b5..a9913842ef18 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -66,12 +66,19 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = { }, }; -static int ipt_nat_register_lookups(struct net *net, struct xt_table *table) +static int ipt_nat_register_lookups(struct net *net) { - struct nf_hook_ops *ops = kmemdup(nf_nat_ipv4_ops, sizeof(nf_nat_ipv4_ops), GFP_KERNEL); - struct iptable_nat_pernet *xt_nat_net = net_generic(net, iptable_nat_net_id); + struct iptable_nat_pernet *xt_nat_net; + struct nf_hook_ops *ops; + struct xt_table *table; int i, ret; + xt_nat_net = net_generic(net, iptable_nat_net_id); + table = xt_find_table(net, NFPROTO_IPV4, "nat"); + if (WARN_ON_ONCE(!table)) 
+ return -ENOENT; + + ops = kmemdup(nf_nat_ipv4_ops, sizeof(nf_nat_ipv4_ops), GFP_KERNEL); if (!ops) return -ENOMEM; @@ -109,25 +116,21 @@ static void ipt_nat_unregister_lookups(struct net *net) static int __net_init iptable_nat_table_init(struct net *net) { struct ipt_replace *repl; - struct xt_table *table; int ret; repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, - NULL, &table); + + ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, NULL); if (ret < 0) { kfree(repl); return ret; } - ret = ipt_nat_register_lookups(net, table); - if (ret < 0) { + ret = ipt_nat_register_lookups(net); + if (ret < 0) ipt_unregister_table_exit(net, "nat"); - } else { - net->ipv4.nat_table = table; - } kfree(repl); return ret; @@ -141,7 +144,6 @@ static void __net_exit iptable_nat_net_pre_exit(struct net *net) static void __net_exit iptable_nat_net_exit(struct net *net) { ipt_unregister_table_exit(net, "nat"); - net->ipv4.nat_table = NULL; } static struct pernet_operations iptable_nat_net_ops = { diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 18776f5a4055..ceef397c1f5f 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -41,7 +41,7 @@ static unsigned int iptable_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.iptable_raw); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *rawtable_ops __read_mostly; @@ -58,21 +58,19 @@ static int __net_init iptable_raw_table_init(struct net *net) repl = ipt_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, table, repl, rawtable_ops, - &net->ipv4.iptable_raw); + ret = ipt_register_table(net, table, repl, rawtable_ops); kfree(repl); return ret; } static void __net_exit iptable_raw_net_pre_exit(struct net *net) { - 
ipt_unregister_table_pre_exit(net, "raw", rawtable_ops); + ipt_unregister_table_pre_exit(net, "raw"); } static void __net_exit iptable_raw_net_exit(struct net *net) { ipt_unregister_table_exit(net, "raw"); - net->ipv4.iptable_raw = NULL; } static struct pernet_operations iptable_raw_net_ops = { diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 3df92fb394c5..77973f5fd8f6 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -40,7 +40,7 @@ static unsigned int iptable_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ipt_do_table(skb, state, state->net->ipv4.iptable_security); + return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -53,21 +53,19 @@ static int __net_init iptable_security_table_init(struct net *net) repl = ipt_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - ret = ipt_register_table(net, &security_table, repl, sectbl_ops, - &net->ipv4.iptable_security); + ret = ipt_register_table(net, &security_table, repl, sectbl_ops); kfree(repl); return ret; } static void __net_exit iptable_security_net_pre_exit(struct net *net) { - ipt_unregister_table_pre_exit(net, "security", sectbl_ops); + ipt_unregister_table_pre_exit(net, "security"); } static void __net_exit iptable_security_net_exit(struct net *net) { ipt_unregister_table_exit(net, "security"); - net->ipv4.iptable_security = NULL; } static struct pernet_operations iptable_security_net_ops = { diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 1caba9507228..ef37deff8405 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1498,6 +1498,7 @@ void *xt_unregister_table(struct xt_table *table) mutex_unlock(&xt[table->af].mutex); audit_log_nfcfg(table->name, table->af, private->number, AUDIT_XT_OP_UNREGISTER, GFP_KERNEL); + kfree(table->ops); kfree(table); return private; -- 
cgit v1.2.3 From f9006acc8dfe59e25aa75729728ac57a8d84fc32 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:51:08 +0200 Subject: netfilter: arp_tables: pass table pointer via nf_hook_ops Same change as previous patch. Only difference: no need to handle NULL template_ops parameter, the only caller (arptable_filter) always passes non-NULL argument. This removes all remaining accesses to net->ipv4.arptable_filter. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_arp/arp_tables.h | 2 +- net/ipv4/netfilter/arp_tables.c | 43 +++++++++++++++++++++----------- net/ipv4/netfilter/arptable_filter.c | 6 ++--- 3 files changed, 32 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index 9ec73dcc8fd6..a0474b4e7782 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -51,7 +51,7 @@ struct arpt_error { extern void *arpt_alloc_initial_table(const struct xt_table *); int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, - const struct nf_hook_ops *ops, struct xt_table **res); + const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); void arpt_unregister_table_pre_exit(struct net *net, const char *name, const struct nf_hook_ops *ops); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 8a16b0dc5271..b1bb6a7e2dd7 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1499,10 +1499,11 @@ static void __arpt_unregister_table(struct net *net, struct xt_table *table) int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, - const struct nf_hook_ops *ops, - struct xt_table **res) + const struct nf_hook_ops *template_ops) { - int ret; + struct nf_hook_ops *ops; + unsigned int 
num_ops; + int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; @@ -1516,28 +1517,42 @@ int arpt_register_table(struct net *net, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); - if (ret != 0) - goto out_free; + if (ret != 0) { + xt_free_table_info(newinfo); + return ret; + } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { - ret = PTR_ERR(new_table); - goto out_free; + xt_free_table_info(newinfo); + return PTR_ERR(new_table); } - /* set res now, will see skbs right after nf_register_net_hooks */ - WRITE_ONCE(*res, new_table); + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto out_free; + } - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret != 0) { - __arpt_unregister_table(net, new_table); - *res = NULL; + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + goto out_free; } + for (i = 0; i < num_ops; i++) + ops[i].priv = new_table; + + new_table->ops = ops; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) + goto out_free; + return ret; out_free: - xt_free_table_info(newinfo); + __arpt_unregister_table(net, new_table); return ret; } diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 924f096a6d89..b8f45e9bbec8 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -34,7 +34,7 @@ static unsigned int arptable_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return arpt_do_table(skb, state, state->net->ipv4.arptable_filter); + return arpt_do_table(skb, state, priv); } static struct nf_hook_ops *arpfilter_ops __read_mostly; @@ -47,8 +47,7 @@ static int __net_init arptable_filter_table_init(struct net *net) repl = arpt_alloc_initial_table(&packet_filter); if (repl == NULL) 
return -ENOMEM; - err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops, - &net->ipv4.arptable_filter); + err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops); kfree(repl); return err; } @@ -61,7 +60,6 @@ static void __net_exit arptable_filter_net_pre_exit(struct net *net) static void __net_exit arptable_filter_net_exit(struct net *net) { arpt_unregister_table(net, "filter"); - net->ipv4.arptable_filter = NULL; } static struct pernet_operations arptable_filter_net_ops = { -- cgit v1.2.3 From ee177a54413a33fe474d55fabb5f8ff390bb27d7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:51:09 +0200 Subject: netfilter: ip6_tables: pass table pointer via nf_hook_ops Same patch as the ip_tables one: removal of all accesses to ip6_tables xt_table pointers. After this patch the struct net xt_table anchors can be removed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6/ip6_tables.h | 5 ++- net/ipv6/netfilter/ip6_tables.c | 51 ++++++++++++++++++++----------- net/ipv6/netfilter/ip6table_filter.c | 9 ++---- net/ipv6/netfilter/ip6table_mangle.c | 14 ++++----- net/ipv6/netfilter/ip6table_nat.c | 24 ++++++++------- net/ipv6/netfilter/ip6table_raw.c | 9 ++---- net/ipv6/netfilter/ip6table_security.c | 8 ++--- 7 files changed, 63 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index 8c07426e18a8..11d0e725fe79 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -26,9 +26,8 @@ extern void *ip6t_alloc_initial_table(const struct xt_table *); int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, - const struct nf_hook_ops *ops, struct xt_table **res); -void ip6t_unregister_table_pre_exit(struct net *net, const char *name, - const struct nf_hook_ops *ops); + const struct nf_hook_ops 
*ops); +void ip6t_unregister_table_pre_exit(struct net *net, const char *name); void ip6t_unregister_table_exit(struct net *net, const char *name); extern unsigned int ip6t_do_table(struct sk_buff *skb, const struct nf_hook_state *state, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 11c80da12ee3..e763716ffa25 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1725,10 +1725,11 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table) int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, - const struct nf_hook_ops *ops, - struct xt_table **res) + const struct nf_hook_ops *template_ops) { - int ret; + struct nf_hook_ops *ops; + unsigned int num_ops; + int ret, i; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; @@ -1742,40 +1743,54 @@ int ip6t_register_table(struct net *net, const struct xt_table *table, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); - if (ret != 0) - goto out_free; + if (ret != 0) { + xt_free_table_info(newinfo); + return ret; + } new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { - ret = PTR_ERR(new_table); - goto out_free; + xt_free_table_info(newinfo); + return PTR_ERR(new_table); } - /* set res now, will see skbs right after nf_register_net_hooks */ - WRITE_ONCE(*res, new_table); - if (!ops) + if (!template_ops) return 0; - ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); - if (ret != 0) { - __ip6t_unregister_table(net, new_table); - *res = NULL; + num_ops = hweight32(table->valid_hooks); + if (num_ops == 0) { + ret = -EINVAL; + goto out_free; } + ops = kmemdup(template_ops, sizeof(*ops) * num_ops, GFP_KERNEL); + if (!ops) { + ret = -ENOMEM; + goto out_free; + } + + for (i = 0; i < num_ops; i++) + ops[i].priv = new_table; + + new_table->ops = 
ops; + + ret = nf_register_net_hooks(net, ops, num_ops); + if (ret != 0) + goto out_free; + return ret; out_free: - xt_free_table_info(newinfo); + __ip6t_unregister_table(net, new_table); return ret; } -void ip6t_unregister_table_pre_exit(struct net *net, const char *name, - const struct nf_hook_ops *ops) +void ip6t_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name); if (table) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } void ip6t_unregister_table_exit(struct net *net, const char *name) diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index 2bcafa3e2d35..bb784ea7bbd3 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -35,7 +35,7 @@ static unsigned int ip6table_filter_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip6t_do_table(skb, state, state->net->ipv6.ip6table_filter); + return ip6t_do_table(skb, state, priv); } static struct nf_hook_ops *filter_ops __read_mostly; @@ -56,8 +56,7 @@ static int __net_init ip6table_filter_table_init(struct net *net) ((struct ip6t_standard *)repl->entries)[1].target.verdict = forward ? 
-NF_ACCEPT - 1 : -NF_DROP - 1; - err = ip6t_register_table(net, &packet_filter, repl, filter_ops, - &net->ipv6.ip6table_filter); + err = ip6t_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); return err; } @@ -72,14 +71,12 @@ static int __net_init ip6table_filter_net_init(struct net *net) static void __net_exit ip6table_filter_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "filter", - filter_ops); + ip6t_unregister_table_pre_exit(net, "filter"); } static void __net_exit ip6table_filter_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "filter"); - net->ipv6.ip6table_filter = NULL; } static struct pernet_operations ip6table_filter_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index 14e22022bf41..c76cffd63041 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -32,7 +32,7 @@ static const struct xt_table packet_mangler = { }; static unsigned int -ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) +ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) { unsigned int ret; struct in6_addr saddr, daddr; @@ -49,7 +49,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state) /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); + ret = ip6t_do_table(skb, state, priv); if (ret != NF_DROP && ret != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || @@ -71,8 +71,8 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) - return ip6t_mangle_out(skb, state); - return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle); + return ip6t_mangle_out(skb, state, priv); + return ip6t_do_table(skb, state, priv); } static struct nf_hook_ops 
*mangle_ops __read_mostly; @@ -84,21 +84,19 @@ static int __net_init ip6table_mangle_table_init(struct net *net) repl = ip6t_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops, - &net->ipv6.ip6table_mangle); + ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit ip6table_mangle_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "mangle", mangle_ops); + ip6t_unregister_table_pre_exit(net, "mangle"); } static void __net_exit ip6table_mangle_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "mangle"); - net->ipv6.ip6table_mangle = NULL; } static struct pernet_operations ip6table_mangle_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index 69b7f9601d03..b0292251e655 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -68,12 +68,19 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = { }, }; -static int ip6t_nat_register_lookups(struct net *net, struct xt_table *table) +static int ip6t_nat_register_lookups(struct net *net) { - struct nf_hook_ops *ops = kmemdup(nf_nat_ipv6_ops, sizeof(nf_nat_ipv6_ops), GFP_KERNEL); - struct ip6table_nat_pernet *xt_nat_net = net_generic(net, ip6table_nat_net_id); + struct ip6table_nat_pernet *xt_nat_net; + struct nf_hook_ops *ops; + struct xt_table *table; int i, ret; + table = xt_find_table(net, NFPROTO_IPV6, "nat"); + if (WARN_ON_ONCE(!table)) + return -ENOENT; + + xt_nat_net = net_generic(net, ip6table_nat_net_id); + ops = kmemdup(nf_nat_ipv6_ops, sizeof(nf_nat_ipv6_ops), GFP_KERNEL); if (!ops) return -ENOMEM; @@ -111,25 +118,21 @@ static void ip6t_nat_unregister_lookups(struct net *net) static int __net_init ip6table_nat_table_init(struct net *net) { struct ip6t_replace *repl; - struct xt_table *table; int ret; repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table); if (repl 
== NULL) return -ENOMEM; ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl, - NULL, &table); + NULL); if (ret < 0) { kfree(repl); return ret; } - ret = ip6t_nat_register_lookups(net, table); - if (ret < 0) { + ret = ip6t_nat_register_lookups(net); + if (ret < 0) ip6t_unregister_table_exit(net, "nat"); - } else { - net->ipv6.ip6table_nat = table; - } kfree(repl); return ret; @@ -143,7 +146,6 @@ static void __net_exit ip6table_nat_net_pre_exit(struct net *net) static void __net_exit ip6table_nat_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "nat"); - net->ipv6.ip6table_nat = NULL; } static struct pernet_operations ip6table_nat_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index ae3df59f0350..f63c106c521e 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -40,7 +40,7 @@ static unsigned int ip6table_raw_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip6t_do_table(skb, state, state->net->ipv6.ip6table_raw); + return ip6t_do_table(skb, state, priv); } static struct nf_hook_ops *rawtable_ops __read_mostly; @@ -57,22 +57,19 @@ static int __net_init ip6table_raw_table_init(struct net *net) repl = ip6t_alloc_initial_table(table); if (repl == NULL) return -ENOMEM; - ret = ip6t_register_table(net, table, repl, rawtable_ops, - &net->ipv6.ip6table_raw); + ret = ip6t_register_table(net, table, repl, rawtable_ops); kfree(repl); return ret; } static void __net_exit ip6table_raw_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "raw", - rawtable_ops); + ip6t_unregister_table_pre_exit(net, "raw"); } static void __net_exit ip6table_raw_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "raw"); - net->ipv6.ip6table_raw = NULL; } static struct pernet_operations ip6table_raw_net_ops = { diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 83ca632cbf88..8dc335cf450b 100644 
--- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -39,7 +39,7 @@ static unsigned int ip6table_security_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - return ip6t_do_table(skb, state, state->net->ipv6.ip6table_security); + return ip6t_do_table(skb, state, priv); } static struct nf_hook_ops *sectbl_ops __read_mostly; @@ -52,21 +52,19 @@ static int __net_init ip6table_security_table_init(struct net *net) repl = ip6t_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - ret = ip6t_register_table(net, &security_table, repl, sectbl_ops, - &net->ipv6.ip6table_security); + ret = ip6t_register_table(net, &security_table, repl, sectbl_ops); kfree(repl); return ret; } static void __net_exit ip6table_security_net_pre_exit(struct net *net) { - ip6t_unregister_table_pre_exit(net, "security", sectbl_ops); + ip6t_unregister_table_pre_exit(net, "security"); } static void __net_exit ip6table_security_net_exit(struct net *net) { ip6t_unregister_table_exit(net, "security"); - net->ipv6.ip6table_security = NULL; } static struct pernet_operations ip6table_security_net_ops = { -- cgit v1.2.3 From f7163c4882e883fabdafb894176994fd2ade33e2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Apr 2021 09:51:10 +0200 Subject: netfilter: remove all xt_table anchors from struct net No longer needed, table pointer arg is now passed via netfilter core. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netns/ipv4.h | 10 ---------- include/net/netns/ipv6.h | 9 --------- 2 files changed, 19 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 87e1612497ea..f6af8d96d3c6 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -76,16 +76,6 @@ struct netns_ipv4 { struct inet_peer_base *peers; struct sock * __percpu *tcp_sk; struct fqdir *fqdir; -#ifdef CONFIG_NETFILTER - struct xt_table *iptable_filter; - struct xt_table *iptable_mangle; - struct xt_table *iptable_raw; - struct xt_table *arptable_filter; -#ifdef CONFIG_SECURITY - struct xt_table *iptable_security; -#endif - struct xt_table *nat_table; -#endif u8 sysctl_icmp_echo_ignore_all; u8 sysctl_icmp_echo_enable_probe; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 808f0f79ea9c..6153c8067009 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -63,15 +63,6 @@ struct netns_ipv6 { struct ipv6_devconf *devconf_dflt; struct inet_peer_base *peers; struct fqdir *fqdir; -#ifdef CONFIG_NETFILTER - struct xt_table *ip6table_filter; - struct xt_table *ip6table_mangle; - struct xt_table *ip6table_raw; -#ifdef CONFIG_SECURITY - struct xt_table *ip6table_security; -#endif - struct xt_table *ip6table_nat; -#endif struct fib6_info *fib6_null_entry; struct rt6_info *ip6_null_entry; struct rt6_statistics *rt6_stats; -- cgit v1.2.3 From 95aafe911db602d19b00d2a88c3d54a84119f5dc Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 25 Apr 2021 02:30:38 +0200 Subject: net: ethernet: ixp4xx: Support device tree probing This adds device tree probing to the IXP4xx ethernet driver. Add a platform data bool to tell us whether to register an MDIO bus for the device or not, as well as the corresponding NPE. We need to drop the memory region request as part of this since the OF core will request the memory for the device. 
Cc: Zoltan HERPAI Cc: Raylynn Knight Signed-off-by: Linus Walleij Signed-off-by: David S. Miller --- drivers/net/ethernet/xscale/Kconfig | 1 + drivers/net/ethernet/xscale/ixp4xx_eth.c | 210 +++++++++++++++++++++---------- include/linux/platform_data/eth_ixp4xx.h | 2 + 3 files changed, 150 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/xscale/Kconfig b/drivers/net/ethernet/xscale/Kconfig index 7b83a6e5d894..468ffe3d1707 100644 --- a/drivers/net/ethernet/xscale/Kconfig +++ b/drivers/net/ethernet/xscale/Kconfig @@ -22,6 +22,7 @@ config IXP4XX_ETH tristate "Intel IXP4xx Ethernet support" depends on ARM && ARCH_IXP4XX && IXP4XX_NPE && IXP4XX_QMGR select PHYLIB + select OF_MDIO if OF select NET_PTP_CLASSIFY help Say Y here if you want to use built-in Ethernet ports diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c index 9d323e8595e2..1149e88e6454 100644 --- a/drivers/net/ethernet/xscale/ixp4xx_eth.c +++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -165,7 +166,6 @@ struct eth_regs { }; struct port { - struct resource *mem_res; struct eth_regs __iomem *regs; struct npe *npe; struct net_device *netdev; @@ -250,6 +250,7 @@ static inline void memcpy_swab32(u32 *dest, u32 *src, int cnt) static DEFINE_SPINLOCK(mdio_lock); static struct eth_regs __iomem *mdio_regs; /* mdio command and status only */ static struct mii_bus *mdio_bus; +static struct device_node *mdio_bus_np; static int ports_open; static struct port *npe_port_tab[MAX_NPES]; static struct dma_pool *dma_pool; @@ -533,7 +534,8 @@ static int ixp4xx_mdio_register(struct eth_regs __iomem *regs) mdio_bus->write = &ixp4xx_mdio_write; snprintf(mdio_bus->id, MII_BUS_ID_SIZE, "ixp4xx-eth-0"); - if ((err = mdiobus_register(mdio_bus))) + err = of_mdiobus_register(mdio_bus, mdio_bus_np); + if (err) mdiobus_free(mdio_bus); return err; } @@ -1358,18 +1360,118 @@ 
static const struct net_device_ops ixp4xx_netdev_ops = { .ndo_validate_addr = eth_validate_addr, }; +#ifdef CONFIG_OF +static struct eth_plat_info *ixp4xx_of_get_platdata(struct device *dev) +{ + struct device_node *np = dev->of_node; + struct of_phandle_args queue_spec; + struct of_phandle_args npe_spec; + struct device_node *mdio_np; + struct eth_plat_info *plat; + int ret; + + plat = devm_kzalloc(dev, sizeof(*plat), GFP_KERNEL); + if (!plat) + return NULL; + + ret = of_parse_phandle_with_fixed_args(np, "intel,npe-handle", 1, 0, + &npe_spec); + if (ret) { + dev_err(dev, "no NPE engine specified\n"); + return NULL; + } + /* NPE ID 0x00, 0x10, 0x20... */ + plat->npe = (npe_spec.args[0] << 4); + + /* Check if this device has an MDIO bus */ + mdio_np = of_get_child_by_name(np, "mdio"); + if (mdio_np) { + plat->has_mdio = true; + mdio_bus_np = mdio_np; + /* DO NOT put the mdio_np, it will be used */ + } + + /* Get the rx queue as a resource from queue manager */ + ret = of_parse_phandle_with_fixed_args(np, "queue-rx", 1, 0, + &queue_spec); + if (ret) { + dev_err(dev, "no rx queue phandle\n"); + return NULL; + } + plat->rxq = queue_spec.args[0]; + + /* Get the txready queue as resource from queue manager */ + ret = of_parse_phandle_with_fixed_args(np, "queue-txready", 1, 0, + &queue_spec); + if (ret) { + dev_err(dev, "no txready queue phandle\n"); + return NULL; + } + plat->txreadyq = queue_spec.args[0]; + + return plat; +} +#else +static struct eth_plat_info *ixp4xx_of_get_platdata(struct device *dev) +{ + return NULL; +} +#endif + static int ixp4xx_eth_probe(struct platform_device *pdev) { struct phy_device *phydev = NULL; struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; struct eth_plat_info *plat; - resource_size_t regs_phys; struct net_device *ndev; struct resource *res; struct port *port; int err; - plat = dev_get_platdata(dev); + if (np) { + plat = ixp4xx_of_get_platdata(dev); + if (!plat) + return -ENODEV; + } else { + plat = 
dev_get_platdata(dev); + if (!plat) + return -ENODEV; + plat->npe = pdev->id; + switch (plat->npe) { + case IXP4XX_ETH_NPEA: + /* If the MDIO bus is not up yet, defer probe */ + break; + case IXP4XX_ETH_NPEB: + /* On all except IXP43x, NPE-B is used for the MDIO bus. + * If there is no NPE-B in the feature set, bail out, + * else we have the MDIO bus here. + */ + if (!cpu_is_ixp43x()) { + if (!(ixp4xx_read_feature_bits() & + IXP4XX_FEATURE_NPEB_ETH0)) + return -ENODEV; + /* Else register the MDIO bus on NPE-B */ + plat->has_mdio = true; + } + break; + case IXP4XX_ETH_NPEC: + /* IXP43x lacks NPE-B and uses NPE-C for the MDIO bus + * access, if there is no NPE-C, no bus, nothing works, + * so bail out. + */ + if (cpu_is_ixp43x()) { + if (!(ixp4xx_read_feature_bits() & + IXP4XX_FEATURE_NPEC_ETH)) + return -ENODEV; + /* Else register the MDIO bus on NPE-B */ + plat->has_mdio = true; + } + break; + default: + return -ENODEV; + } + } if (!(ndev = devm_alloc_etherdev(dev, sizeof(struct port)))) return -ENOMEM; @@ -1377,59 +1479,29 @@ static int ixp4xx_eth_probe(struct platform_device *pdev) SET_NETDEV_DEV(ndev, dev); port = netdev_priv(ndev); port->netdev = ndev; - port->id = pdev->id; + port->id = plat->npe; /* Get the port resource and remap */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) return -ENODEV; - regs_phys = res->start; port->regs = devm_ioremap_resource(dev, res); if (IS_ERR(port->regs)) return PTR_ERR(port->regs); - switch (port->id) { - case IXP4XX_ETH_NPEA: - /* If the MDIO bus is not up yet, defer probe */ - if (!mdio_bus) - return -EPROBE_DEFER; - break; - case IXP4XX_ETH_NPEB: - /* - * On all except IXP43x, NPE-B is used for the MDIO bus. - * If there is no NPE-B in the feature set, bail out, else - * register the MDIO bus. 
- */ - if (!cpu_is_ixp43x()) { - if (!(ixp4xx_read_feature_bits() & - IXP4XX_FEATURE_NPEB_ETH0)) - return -ENODEV; - /* Else register the MDIO bus on NPE-B */ - if ((err = ixp4xx_mdio_register(port->regs))) - return err; - } - if (!mdio_bus) - return -EPROBE_DEFER; - break; - case IXP4XX_ETH_NPEC: - /* - * IXP43x lacks NPE-B and uses NPE-C for the MDIO bus access, - * of there is no NPE-C, no bus, nothing works, so bail out. - */ - if (cpu_is_ixp43x()) { - if (!(ixp4xx_read_feature_bits() & - IXP4XX_FEATURE_NPEC_ETH)) - return -ENODEV; - /* Else register the MDIO bus on NPE-C */ - if ((err = ixp4xx_mdio_register(port->regs))) - return err; + /* Register the MDIO bus if we have it */ + if (plat->has_mdio) { + err = ixp4xx_mdio_register(port->regs); + if (err) { + dev_err(dev, "failed to register MDIO bus\n"); + return err; } - if (!mdio_bus) - return -EPROBE_DEFER; - break; - default: - return -ENODEV; } + /* If the instance with the MDIO bus has not yet appeared, + * defer probing until it gets probed. 
+ */ + if (!mdio_bus) + return -EPROBE_DEFER; ndev->netdev_ops = &ixp4xx_netdev_ops; ndev->ethtool_ops = &ixp4xx_ethtool_ops; @@ -1440,12 +1512,6 @@ static int ixp4xx_eth_probe(struct platform_device *pdev) if (!(port->npe = npe_request(NPE_ID(port->id)))) return -EIO; - port->mem_res = request_mem_region(regs_phys, REGS_SIZE, ndev->name); - if (!port->mem_res) { - err = -EBUSY; - goto err_npe_rel; - } - port->plat = plat; npe_port_tab[NPE_ID(port->id)] = port; memcpy(ndev->dev_addr, plat->hwaddr, ETH_ALEN); @@ -1458,15 +1524,26 @@ static int ixp4xx_eth_probe(struct platform_device *pdev) __raw_writel(DEFAULT_CORE_CNTRL, &port->regs->core_control); udelay(50); - phydev = mdiobus_get_phy(mdio_bus, plat->phy); - if (IS_ERR(phydev)) { - err = PTR_ERR(phydev); - goto err_free_mem; + if (np) { + phydev = of_phy_get_and_connect(ndev, np, ixp4xx_adjust_link); + } else { + phydev = mdiobus_get_phy(mdio_bus, plat->phy); + if (IS_ERR(phydev)) { + err = PTR_ERR(phydev); + dev_err(dev, "could not connect phydev (%d)\n", err); + goto err_free_mem; + } + err = phy_connect_direct(ndev, phydev, ixp4xx_adjust_link, + PHY_INTERFACE_MODE_MII); + if (err) + goto err_free_mem; + } - err = phy_connect_direct(ndev, phydev, ixp4xx_adjust_link, - PHY_INTERFACE_MODE_MII); - if (err) + if (!phydev) { + err = -ENODEV; + dev_err(dev, "no phydev\n"); goto err_free_mem; + } phydev->irq = PHY_POLL; @@ -1482,8 +1559,6 @@ err_phy_dis: phy_disconnect(phydev); err_free_mem: npe_port_tab[NPE_ID(port->id)] = NULL; - release_resource(port->mem_res); -err_npe_rel: npe_release(port->npe); return err; } @@ -1499,12 +1574,21 @@ static int ixp4xx_eth_remove(struct platform_device *pdev) ixp4xx_mdio_remove(); npe_port_tab[NPE_ID(port->id)] = NULL; npe_release(port->npe); - release_resource(port->mem_res); return 0; } +static const struct of_device_id ixp4xx_eth_of_match[] = { + { + .compatible = "intel,ixp4xx-ethernet", + }, + { }, +}; + static struct platform_driver ixp4xx_eth_driver = { - .driver.name = 
DRV_NAME, + .driver = { + .name = DRV_NAME, + .of_match_table = of_match_ptr(ixp4xx_eth_of_match), + }, .probe = ixp4xx_eth_probe, .remove = ixp4xx_eth_remove, }; diff --git a/include/linux/platform_data/eth_ixp4xx.h b/include/linux/platform_data/eth_ixp4xx.h index 6f652ea0c6ae..114b0940729f 100644 --- a/include/linux/platform_data/eth_ixp4xx.h +++ b/include/linux/platform_data/eth_ixp4xx.h @@ -14,6 +14,8 @@ struct eth_plat_info { u8 rxq; /* configurable, currently 0 - 31 only */ u8 txreadyq; u8 hwaddr[6]; + u8 npe; /* NPE instance used by this interface */ + bool has_mdio; /* If this instance has an MDIO bus */ }; #endif -- cgit v1.2.3 From 427f0c8c194b22edcafef1b0a42995ddc5c2227d Mon Sep 17 00:00:00 2001 From: Jethro Beekman Date: Sun, 25 Apr 2021 11:22:03 +0200 Subject: macvlan: Add nodst option to macvlan type source The default behavior for source MACVLAN is to duplicate packets to appropriate type source devices, and then do the normal destination MACVLAN flow. This patch adds an option to skip destination MACVLAN processing if any matching source MACVLAN device has the option set. This allows setting up a "catch all" device for source MACVLAN: create one or more devices with type source nodst, and one device with e.g. type vepa, and incoming traffic will be received on exactly one device. v2: netdev wants non-standard line length Signed-off-by: Jethro Beekman Signed-off-by: David S. 
Miller --- drivers/net/macvlan.c | 19 ++++++++++++++----- include/uapi/linux/if_link.h | 1 + 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 9a9a5cf36a4b..7427b989607e 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -423,18 +423,24 @@ static void macvlan_forward_source_one(struct sk_buff *skb, macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false); } -static void macvlan_forward_source(struct sk_buff *skb, +static bool macvlan_forward_source(struct sk_buff *skb, struct macvlan_port *port, const unsigned char *addr) { struct macvlan_source_entry *entry; u32 idx = macvlan_eth_hash(addr); struct hlist_head *h = &port->vlan_source_hash[idx]; + bool consume = false; hlist_for_each_entry_rcu(entry, h, hlist) { - if (ether_addr_equal_64bits(entry->addr, addr)) + if (ether_addr_equal_64bits(entry->addr, addr)) { + if (entry->vlan->flags & MACVLAN_FLAG_NODST) + consume = true; macvlan_forward_source_one(skb, entry->vlan); + } } + + return consume; } /* called under rcu_read_lock() from netif_receive_skb */ @@ -463,7 +469,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_CONSUMED; *pskb = skb; eth = eth_hdr(skb); - macvlan_forward_source(skb, port, eth->h_source); + if (macvlan_forward_source(skb, port, eth->h_source)) + return RX_HANDLER_CONSUMED; src = macvlan_hash_lookup(port, eth->h_source); if (src && src->mode != MACVLAN_MODE_VEPA && src->mode != MACVLAN_MODE_BRIDGE) { @@ -482,7 +489,8 @@ static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; } - macvlan_forward_source(skb, port, eth->h_source); + if (macvlan_forward_source(skb, port, eth->h_source)) + return RX_HANDLER_CONSUMED; if (macvlan_passthru(port)) vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list); @@ -1286,7 +1294,8 @@ static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[], return 
0; if (data[IFLA_MACVLAN_FLAGS] && - nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~MACVLAN_FLAG_NOPROMISC) + nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC | + MACVLAN_FLAG_NODST)) return -EINVAL; if (data[IFLA_MACVLAN_MODE]) { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 91c8dda6d95d..cd5b382a4138 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -614,6 +614,7 @@ enum macvlan_macaddr_mode { }; #define MACVLAN_FLAG_NOPROMISC 1 +#define MACVLAN_FLAG_NODST 2 /* skip dst macvlan if matching src macvlan */ /* VRF section */ enum { -- cgit v1.2.3 From d59d2f82f984df44b31c5d7837fc2f62268b7571 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 23 Apr 2021 00:17:08 +0200 Subject: netfilter: nftables: add nft_pernet() helper function Consolidate call to net_generic(net, nf_tables_net_id) in this wrapper function. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 +++ net/netfilter/nf_tables_api.c | 112 +++++++++++++++++++------------------- net/netfilter/nf_tables_offload.c | 10 ++-- net/netfilter/nft_chain_filter.c | 5 +- net/netfilter/nft_dynset.c | 5 +- 5 files changed, 69 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 4a75da2a2e1d..eb708b77c4a5 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -13,6 +13,7 @@ #include #include #include +#include #define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) @@ -1580,4 +1581,11 @@ struct nftables_pernet { u8 validate_state; }; +extern unsigned int nf_tables_net_id; + +static inline struct nftables_pernet *nft_pernet(const struct net *net) +{ + return net_generic(net, nf_tables_net_id); +} + #endif /* _NET_NF_TABLES_H */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 357443b3c0e4..155b85553fcc 100644 --- a/net/netfilter/nf_tables_api.c +++ 
b/net/netfilter/nf_tables_api.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) @@ -106,7 +105,7 @@ static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types static void nft_validate_state_update(struct net *net, u8 new_validate_state) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); switch (nft_net->validate_state) { case NFT_VALIDATE_SKIP: @@ -181,7 +180,7 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) if (!nft_set_is_anonymous(set)) return; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry_reverse(trans, &nft_net->commit_list, list) { switch (trans->msg_type) { case NFT_MSG_NEWSET: @@ -278,9 +277,8 @@ static void nf_tables_unregister_hook(struct net *net, static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans) { - struct nftables_pernet *nft_net; + struct nftables_pernet *nft_net = nft_pernet(net); - nft_net = net_generic(net, nf_tables_net_id); list_add_tail(&trans->list, &nft_net->commit_list); } @@ -566,7 +564,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry_rcu(table, &nft_net->tables, list, lockdep_is_held(&nft_net->commit_mutex)) { if (!nla_strcmp(nla, table->name) && @@ -590,7 +588,7 @@ static struct nft_table *nft_table_lookup_byhandle(const struct net *net, struct nftables_pernet *nft_net; struct nft_table *table; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry(table, &nft_net->tables, list) { if (be64_to_cpu(nla_get_be64(nla)) == table->handle && nft_active_genmask(table, genmask)) @@ -655,7 +653,7 @@ __printf(2, 3) int nft_request_module(struct net 
*net, const char *fmt, if (ret >= MODULE_NAME_LEN) return 0; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry(req, &nft_net->module_list, list) { if (!strcmp(req->module, module_name)) { if (req->done) @@ -711,7 +709,7 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, static __be16 nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); return htons(nft_net->base_seq & 0xffff); } @@ -793,7 +791,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) goto err; } - nft_net = net_generic(ctx->net, nf_tables_net_id); + nft_net = nft_pernet(ctx->net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: @@ -811,7 +809,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, int family = nfmsg->nfgen_family; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -1062,7 +1060,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -1221,9 +1219,9 @@ out: static int nft_flush(struct nft_ctx *ctx, int family) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); - struct nft_table *table, *nt; + struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nlattr * const *nla = ctx->nla; + struct nft_table *table, *nt; int err = 0; list_for_each_entry_safe(table, nt, &nft_net->tables, list) { @@ -1345,7 +1343,7 @@ nft_chain_lookup_byhandle(const struct nft_table *table, 
u64 handle, u8 genmask) static bool lockdep_commit_lock_is_held(const struct net *net) { #ifdef CONFIG_PROVE_LOCKING - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); return lockdep_is_held(&nft_net->commit_mutex); #else @@ -1570,7 +1568,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) goto err; } - nft_net = net_generic(ctx->net, nf_tables_net_id); + nft_net = nft_pernet(ctx->net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: @@ -1581,15 +1579,15 @@ static int nf_tables_dump_chains(struct sk_buff *skb, struct netlink_callback *cb) { const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); - const struct nft_table *table; - const struct nft_chain *chain; unsigned int idx = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; + const struct nft_table *table; + const struct nft_chain *chain; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -1908,7 +1906,7 @@ static int nft_chain_parse_hook(struct net *net, struct nft_chain_hook *hook, u8 family, bool autoload) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nlattr *ha[NFTA_HOOK_MAX + 1]; const struct nft_chain_type *type; int err; @@ -2302,7 +2300,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nla[NFTA_CHAIN_HANDLE] && nla[NFTA_CHAIN_NAME]) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_trans *tmp; char *name; @@ -2338,7 +2336,7 @@ err: static struct nft_chain *nft_chain_lookup_byid(const struct net *net, const struct nlattr *nla) { - struct nftables_pernet *nft_net = 
net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; @@ -2357,7 +2355,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -2908,7 +2906,7 @@ nla_put_failure: static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; int err; @@ -2989,7 +2987,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, struct nftables_pernet *nft_net; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -3223,7 +3221,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); u8 genmask = nft_genmask_next(net); struct nft_expr_info *info = NULL; @@ -3442,7 +3440,7 @@ err1: static struct nft_rule *nft_rule_lookup_byid(const struct net *net, const struct nlattr *nla) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); struct nft_trans *trans; @@ -3559,7 +3557,7 @@ nft_select_set_ops(const struct nft_ctx *ctx, const struct nft_set_desc *desc, enum nft_set_policies policy) { - struct 
nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; @@ -3704,9 +3702,9 @@ static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table, static struct nft_set *nft_set_lookup_byid(const struct net *net, const struct nlattr *nla, u8 genmask) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); - struct nft_trans *trans; + struct nftables_pernet *nft_net = nft_pernet(net); u32 id = ntohl(nla_get_be32(nla)); + struct nft_trans *trans; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->msg_type == NFT_MSG_NEWSET) { @@ -3942,7 +3940,7 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, const struct nft_set *set, int event, gfp_t gfp_flags) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; u32 portid = ctx->portid; int err; @@ -3980,7 +3978,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -4833,7 +4831,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) int event; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && dump_ctx->ctx.family != table->family) @@ -5138,7 +5136,7 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, goto err; } - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list); return; err: @@ -5660,7 +5658,7 
@@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, const struct nlattr * const nla[], struct netlink_ext_ack *extack) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); u8 genmask = nft_genmask_next(net); const struct nlattr *attr; struct nft_set *set; @@ -6323,7 +6321,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) reset = true; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -6473,7 +6471,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, const struct nftables_pernet *nft_net; char *buf; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq); audit_log_nfcfg(buf, @@ -6560,7 +6558,7 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, int family, int report, gfp_t gfp) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; int err; char *buf = kasprintf(gfp, "%s:%u", @@ -7246,7 +7244,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, const struct nft_table *table; rcu_read_lock(); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); cb->seq = nft_net->base_seq; list_for_each_entry_rcu(table, &nft_net->tables, list) { @@ -7384,7 +7382,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct list_head *hook_list, int event) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; int err; @@ -7429,7 +7427,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) 
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); @@ -7482,7 +7480,7 @@ static int nf_tables_flowtable_event(struct notifier_block *this, return 0; net = dev_net(dev); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { list_for_each_entry(flowtable, &table->flowtables, list) { @@ -7670,7 +7668,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { static int nf_tables_validate(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table; switch (nft_net->validate_state) { @@ -7855,7 +7853,7 @@ static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *cha static void nf_tables_commit_chain_prepare_cancel(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) { @@ -7967,7 +7965,7 @@ static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, static void nf_tables_module_autoload_cleanup(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_module_request *req, *next; WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); @@ -7980,7 +7978,7 @@ static void nf_tables_module_autoload_cleanup(struct net *net) static void nf_tables_commit_release(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct 
nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; /* all side effects have to be made visible. @@ -8014,7 +8012,7 @@ static void nf_tables_commit_release(struct net *net) static void nft_commit_notify(struct net *net, u32 portid) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *batch_skb = NULL, *nskb, *skb; unsigned char *data; int len; @@ -8101,7 +8099,7 @@ static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation) static int nf_tables_commit(struct net *net, struct sk_buff *skb) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_chain *chain; @@ -8322,7 +8320,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) static void nf_tables_module_autoload(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_module_request *req, *next; LIST_HEAD(module_list); @@ -8370,7 +8368,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; struct nft_trans_elem *te; struct nft_hook *hook; @@ -8524,7 +8522,7 @@ static void nf_tables_cleanup(struct net *net) static int nf_tables_abort(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); int ret = __nf_tables_abort(net, action); mutex_unlock(&nft_net->commit_mutex); @@ -8534,7 +8532,7 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, static 
bool nf_tables_valid_genid(struct net *net, u32 genid) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); bool genid_ok; mutex_lock(&nft_net->commit_mutex); @@ -9096,7 +9094,7 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) static void __nft_release_hooks(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table; list_for_each_entry(table, &nft_net->tables, list) { @@ -9156,7 +9154,7 @@ static void __nft_release_table(struct net *net, struct nft_table *table) static void __nft_release_tables(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_table *table, *nt; list_for_each_entry_safe(table, nt, &nft_net->tables, list) { @@ -9179,7 +9177,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && @@ -9207,7 +9205,7 @@ static struct notifier_block nft_nl_notifier = { static int __net_init nf_tables_init_net(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); @@ -9227,7 +9225,7 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) static void __net_exit nf_tables_exit_net(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); if 
(!list_empty(&nft_net->commit_list)) diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 19215e81dd66..a48c5fd53a80 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -7,8 +7,6 @@ #include #include -extern unsigned int nf_tables_net_id; - static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; @@ -389,7 +387,7 @@ static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); @@ -490,7 +488,7 @@ static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { @@ -539,7 +537,7 @@ static void nft_flow_rule_offload_abort(struct net *net, int nft_flow_rule_offload_commit(struct net *net) { - struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; @@ -663,7 +661,7 @@ static int nft_offload_netdev_event(struct notifier_block *this, if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - nft_net = net_generic(net, nf_tables_net_id); + nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); chain = __nft_offload_get_chain(nft_net, dev); if (chain) diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 7a9aa57b195b..363bdd7044ec 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -2,7 +2,6 @@ #include 
#include #include -#include #include #include #include @@ -11,8 +10,6 @@ #include #include -extern unsigned int nf_tables_net_id; - #ifdef CONFIG_NF_TABLES_IPV4 static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, @@ -369,7 +366,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; - nft_net = net_generic(ctx.net, nf_tables_net_id); + nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index f9437a0dcfef..6ba3256fa844 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -11,9 +11,6 @@ #include #include #include -#include - -extern unsigned int nf_tables_net_id; struct nft_dynset { struct nft_set *set; @@ -164,7 +161,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nftables_pernet *nft_net = net_generic(ctx->net, nf_tables_net_id); + struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct nft_dynset *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_set *set; -- cgit v1.2.3 From a655536571747575fcaac3c93252b0032d878545 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 23 Apr 2021 00:17:09 +0200 Subject: netfilter: nfnetlink: add struct nfnl_info and pass it to callbacks Add a new structure to reduce callback footprint and to facilitate extensions of the nfnetlink callback interface in the future.
Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 13 ++- net/netfilter/ipset/ip_set_core.c | 149 ++++++++++-------------- net/netfilter/nf_conntrack_netlink.c | 214 +++++++++++++++++------------------ net/netfilter/nfnetlink.c | 18 ++- net/netfilter/nfnetlink_acct.c | 44 ++++--- net/netfilter/nfnetlink_cthelper.c | 30 ++--- net/netfilter/nfnetlink_cttimeout.c | 101 ++++++++--------- net/netfilter/nfnetlink_log.c | 26 ++--- net/netfilter/nfnetlink_osf.c | 19 ++-- net/netfilter/nfnetlink_queue.c | 12 +- 10 files changed, 286 insertions(+), 340 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index d4c14257db5d..1baa3205b199 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -7,11 +7,16 @@ #include #include +struct nfnl_info { + struct net *net; + struct sock *sk; + const struct nlmsghdr *nlh; + struct netlink_ext_ack *extack; +}; + struct nfnl_callback { - int (*call)(struct net *net, struct sock *nl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack); + int (*call)(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]); int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[], diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 359ff8ec236a..bf9902c1daa8 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1031,26 +1031,22 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index, return 0; } -static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_none(struct sk_buff *skb, const struct nfnl_info *info, + const 
struct nlattr * const attr[]) { return -EOPNOTSUPP; } -static int ip_set_create(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *clash = NULL; ip_set_id_t index = IPSET_INVALID_ID; struct nlattr *tb[IPSET_ATTR_CREATE_MAX + 1] = {}; const char *name, *typename; u8 family, revision; - u32 flags = flag_exist(nlh); + u32 flags = flag_exist(info->nlh); int ret = 0; if (unlikely(protocol_min_failed(attr) || @@ -1101,7 +1097,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Set create flags depending on the type revision */ set->flags |= set->type->create_flags[revision]; - ret = set->type->create(net, set, tb, flags); + ret = set->type->create(info->net, set, tb, flags); if (ret != 0) goto put_out; @@ -1183,12 +1179,10 @@ ip_set_destroy_set(struct ip_set *set) kfree(set); } -static int ip_set_destroy(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; int ret = 0; @@ -1230,7 +1224,7 @@ static int ip_set_destroy(struct net *net, struct sock *ctnl, /* Modified by ip_set_destroy() only, which is serialized */ inst->is_destroyed = false; } else { - u32 flags = flag_exist(nlh); + u32 flags = flag_exist(info->nlh); s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), &i); if (!s) { @@ -1264,12 +1258,10 @@ ip_set_flush_set(struct ip_set *set) 
ip_set_unlock(set); } -static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_flush(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *s; ip_set_id_t i; @@ -1304,12 +1296,10 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = { .len = IPSET_MAXNAMELEN - 1 }, }; -static int ip_set_rename(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_rename(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set, *s; const char *name2; ip_set_id_t i; @@ -1354,12 +1344,10 @@ out: * so the ip_set_list always contains valid pointers to the sets. */ -static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *from, *to; ip_set_id_t from_id, to_id; char from_name[IPSET_MAXNAMELEN]; @@ -1669,10 +1657,8 @@ out: return ret < 0 ? 
ret : skb->len; } -static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_dump(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { if (unlikely(protocol_min_failed(attr))) return -IPSET_ERR_PROTOCOL; @@ -1683,7 +1669,7 @@ static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb, .dump = ip_set_dump_do, .done = ip_set_dump_done, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } } @@ -1817,30 +1803,24 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, return ret; } -static int ip_set_uadd(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_uadd(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - return ip_set_ad(net, ctnl, skb, - IPSET_ADD, nlh, attr, extack); + return ip_set_ad(info->net, info->sk, skb, + IPSET_ADD, info->nlh, attr, info->extack); } -static int ip_set_udel(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_udel(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - return ip_set_ad(net, ctnl, skb, - IPSET_DEL, nlh, attr, extack); + return ip_set_ad(info->net, info->sk, skb, + IPSET_DEL, info->nlh, attr, info->extack); } -static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_utest(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = 
ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct ip_set *set; struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {}; int ret = 0; @@ -1872,12 +1852,10 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, /* Get headed data of a set */ -static int ip_set_header(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_header(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -1895,7 +1873,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_HEADER); if (!nlh2) goto nlmsg_failure; @@ -1907,7 +1885,8 @@ static int ip_set_header(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; @@ -1929,10 +1908,8 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_FAMILY] = { .type = NLA_U8 }, }; -static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_type(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -1955,7 +1932,7 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (!skb2) return -ENOMEM; - 
nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_TYPE); if (!nlh2) goto nlmsg_failure; @@ -1968,7 +1945,8 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, nlmsg_end(skb2, nlh2); pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; @@ -1988,10 +1966,8 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 }, }; -static int ip_set_protocol(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_protocol(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { struct sk_buff *skb2; struct nlmsghdr *nlh2; @@ -2004,7 +1980,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_PROTOCOL); if (!nlh2) goto nlmsg_failure; @@ -2014,7 +1990,8 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; @@ -2029,12 +2006,10 @@ nlmsg_failure: /* Get set by name or index, from userspace */ -static int ip_set_byname(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_byname(struct sk_buff *skb, const struct nfnl_info *info, + const 
struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; @@ -2053,7 +2028,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYNAME); if (!nlh2) goto nlmsg_failure; @@ -2063,7 +2038,8 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; @@ -2081,12 +2057,10 @@ static const struct nla_policy ip_set_index_policy[IPSET_ATTR_CMD_MAX + 1] = { [IPSET_ATTR_INDEX] = { .type = NLA_U16 }, }; -static int ip_set_byindex(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const attr[], - struct netlink_ext_ack *extack) +static int ip_set_byindex(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const attr[]) { - struct ip_set_net *inst = ip_set_pernet(net); + struct ip_set_net *inst = ip_set_pernet(info->net); struct sk_buff *skb2; struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; @@ -2108,7 +2082,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, if (!skb2) return -ENOMEM; - nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + nlh2 = start_msg(skb2, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, 0, IPSET_CMD_GET_BYINDEX); if (!nlh2) goto nlmsg_failure; @@ -2117,7 +2091,8 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = 
netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret < 0) return ret; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 44e3cb80e2e0..5147a63b3d1b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1524,17 +1524,15 @@ static int ctnetlink_flush_conntrack(struct net *net, return 0; } -static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_del_conntrack(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - struct nf_conn *ct; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nf_conntrack_zone zone; + struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1550,15 +1548,15 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, else { u_int8_t u3 = nfmsg->version ? 
nfmsg->nfgen_family : AF_UNSPEC; - return ctnetlink_flush_conntrack(net, cda, + return ctnetlink_flush_conntrack(info->net, cda, NETLINK_CB(skb).portid, - nlmsg_report(nlh), u3); + nlmsg_report(info->nlh), u3); } if (err < 0) return err; - h = nf_conntrack_find_get(net, &zone, &tuple); + h = nf_conntrack_find_get(info->net, &zone, &tuple); if (!h) return -ENOENT; @@ -1578,28 +1576,26 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, } } - nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(nlh)); + nf_ct_delete(ct, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_put(ct); return 0; } -static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_conntrack(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_tuple_hash *h; struct nf_conntrack_tuple tuple; - struct nf_conn *ct; - struct sk_buff *skb2 = NULL; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; + struct sk_buff *skb2; + struct nf_conn *ct; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = ctnetlink_start, .dump = ctnetlink_dump_table, @@ -1607,7 +1603,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, .data = (void *)cda, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1626,7 +1622,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, if (err < 0) return err; - h = nf_conntrack_find_get(net, &zone, &tuple); + h = nf_conntrack_find_get(info->net, &zone, &tuple); if 
(!h) return -ENOENT; @@ -1639,13 +1635,16 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, return -ENOMEM; } - err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), ct, + true, 0); nf_ct_put(ct); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (err < 0) goto out; @@ -1743,18 +1742,16 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) return ctnetlink_dump_list(skb, cb, true); } -static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_ct_dying(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_dying, .done = ctnetlink_done_list, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; @@ -1766,18 +1763,16 @@ ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb) return ctnetlink_dump_list(skb, cb, false); } -static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = 
ctnetlink_dump_unconfirmed, .done = ctnetlink_done_list, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return -EOPNOTSUPP; @@ -2374,18 +2369,16 @@ err1: return ERR_PTR(err); } -static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_new_conntrack(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; + struct nf_conn *ct; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -2407,13 +2400,13 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, } if (cda[CTA_TUPLE_ORIG]) - h = nf_conntrack_find_get(net, &zone, &otuple); + h = nf_conntrack_find_get(info->net, &zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = nf_conntrack_find_get(net, &zone, &rtuple); + h = nf_conntrack_find_get(info->net, &zone, &rtuple); if (h == NULL) { err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) { + if (info->nlh->nlmsg_flags & NLM_F_CREATE) { enum ip_conntrack_events events; if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) @@ -2421,8 +2414,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, if (otuple.dst.protonum != rtuple.dst.protonum) return -EINVAL; - ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple, - &rtuple, u3); + ct = ctnetlink_create_conntrack(info->net, &zone, cda, + &otuple, &rtuple, u3); if (IS_ERR(ct)) return PTR_ERR(ct); @@ -2445,7 +2438,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_SYNPROXY) | events, ct, NETLINK_CB(skb).portid, - 
nlmsg_report(nlh)); + nlmsg_report(info->nlh)); nf_ct_put(ct); } @@ -2455,7 +2448,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { + if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) { err = ctnetlink_change_conntrack(ct, cda); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | @@ -2467,7 +2460,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, (1 << IPCT_MARK) | (1 << IPCT_SYNPROXY), ct, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } } @@ -2539,17 +2532,15 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_stat_ct_cpu(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_ct_stat_cpu_dump, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; @@ -2585,10 +2576,8 @@ nlmsg_failure: return -1; } -static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_stat_ct(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]) { struct sk_buff *skb2; int err; @@ -2598,13 +2587,14 @@ static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, return -ENOMEM; err = ctnetlink_stat_ct_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + 
NFNL_MSG_TYPE(info->nlh->nlmsg_type), sock_net(skb->sk)); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (err < 0) goto out; @@ -3284,29 +3274,29 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, return err; } -static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_get_expect(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; - struct sk_buff *skb2; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; + struct sk_buff *skb2; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { if (cda[CTA_EXPECT_MASTER]) - return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda, - extack); + return ctnetlink_dump_exp_ct(info->net, info->sk, skb, + info->nlh, cda, + info->extack); else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, .done = ctnetlink_exp_done, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } } @@ -3326,7 +3316,7 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, if (err < 0) return err; - exp = nf_ct_expect_find_get(net, &zone, &tuple); + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; @@ -3348,13 +3338,15 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, rcu_read_lock(); err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); + info->nlh->nlmsg_seq, 
IPCTNL_MSG_EXP_NEW, + exp); rcu_read_unlock(); nf_ct_expect_put(exp); if (err <= 0) goto free; - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + err = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (err < 0) goto out; @@ -3382,15 +3374,14 @@ static bool expect_iter_all(struct nf_conntrack_expect *exp, void *data) return true; } -static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_del_expect(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; int err; @@ -3406,7 +3397,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, return err; /* bump usage count to 2 */ - exp = nf_ct_expect_find_get(net, &zone, &tuple); + exp = nf_ct_expect_find_get(info->net, &zone, &tuple); if (!exp) return -ENOENT; @@ -3422,7 +3413,7 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); nf_ct_expect_put(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); @@ -3432,14 +3423,14 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, } else if (cda[CTA_EXPECT_HELP_NAME]) { char *name = nla_data(cda[CTA_EXPECT_HELP_NAME]); - nf_ct_expect_iterate_net(net, expect_iter_name, name, + nf_ct_expect_iterate_net(info->net, expect_iter_name, name, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } else { /* This basically means we have to 
flush everything*/ - nf_ct_expect_iterate_net(net, expect_iter_all, NULL, + nf_ct_expect_iterate_net(info->net, expect_iter_all, NULL, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } return 0; @@ -3635,15 +3626,14 @@ err_ct: return err; } -static int ctnetlink_new_expect(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int ctnetlink_new_expect(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_tuple tuple; struct nf_conntrack_expect *exp; - struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; struct nf_conntrack_zone zone; int err; @@ -3662,20 +3652,20 @@ static int ctnetlink_new_expect(struct net *net, struct sock *ctnl, return err; spin_lock_bh(&nf_conntrack_expect_lock); - exp = __nf_ct_expect_find(net, &zone, &tuple); + exp = __nf_ct_expect_find(info->net, &zone, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; - if (nlh->nlmsg_flags & NLM_F_CREATE) { - err = ctnetlink_create_expect(net, &zone, cda, u3, + if (info->nlh->nlmsg_flags & NLM_F_CREATE) { + err = ctnetlink_create_expect(info->net, &zone, cda, u3, NETLINK_CB(skb).portid, - nlmsg_report(nlh)); + nlmsg_report(info->nlh)); } return err; } err = -EEXIST; - if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + if (!(info->nlh->nlmsg_flags & NLM_F_EXCL)) err = ctnetlink_change_expect(exp, cda); spin_unlock_bh(&nf_conntrack_expect_lock); @@ -3736,17 +3726,15 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int 
ctnetlink_stat_exp_cpu(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_exp_stat_cpu_dump, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } return 0; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 06f5886f652e..5f04b67bf47e 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -252,6 +252,12 @@ replay: struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; __u8 subsys_id = NFNL_SUBSYS_ID(type); + struct nfnl_info info = { + .net = net, + .sk = nfnlnet->nfnl, + .nlh = nlh, + .extack = extack, + }; /* Sanity-check NFNL_MAX_ATTR_COUNT */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { @@ -276,14 +282,14 @@ replay: rcu_read_unlock(); nfnl_lock(subsys_id); if (nfnl_dereference_protected(subsys_id) != ss || - nfnetlink_find_client(type, ss) != nc) + nfnetlink_find_client(type, ss) != nc) { err = -EAGAIN; - else if (nc->call) - err = nc->call(net, nfnlnet->nfnl, skb, nlh, - (const struct nlattr **)cda, - extack); - else + } else if (nc->call) { + err = nc->call(skb, &info, + (const struct nlattr **)cda); + } else { err = -EINVAL; + } nfnl_unlock(subsys_id); } if (err == -EAGAIN) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 6895f31c5fbb..9cb4b21b8e95 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -56,15 +56,13 @@ static inline struct nfnl_acct_net *nfnl_acct_pernet(struct net *net) #define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES) #define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */ -static int nfnl_acct_new(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack 
*extack) +static int nfnl_acct_new(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { - struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *nfacct, *matching = NULL; - char *acct_name; unsigned int size = 0; + char *acct_name; u32 flags = 0; if (!tb[NFACCT_NAME]) @@ -78,7 +76,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = nfacct; @@ -86,7 +84,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, } if (matching) { - if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* reset counters if you request a replacement. */ atomic64_set(&matching->pkts, 0); atomic64_set(&matching->bytes, 0); @@ -273,17 +271,15 @@ static int nfnl_acct_start(struct netlink_callback *cb) return 0; } -static int nfnl_acct_get(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_acct_get(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { - struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_acct_dump, .start = nfnl_acct_start, @@ -291,7 +287,7 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, .data = (void *)tb[NFACCT_FILTER], }; - return netlink_dump_start(nfnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!tb[NFACCT_NAME]) @@ -311,15 +307,15 @@ static int 
nfnl_acct_get(struct net *net, struct sock *nfnl, } ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), - NFNL_MSG_ACCT_NEW, cur); + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), + NFNL_MSG_ACCT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret > 0) ret = 0; @@ -347,12 +343,10 @@ static int nfnl_acct_try_del(struct nf_acct *cur) return ret; } -static int nfnl_acct_del(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_acct_del(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { - struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(net); + struct nfnl_acct_net *nfnl_acct_net = nfnl_acct_pernet(info->net); struct nf_acct *cur, *tmp; int ret = -ENOENT; char *acct_name; diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 22f6f7fcc724..3d1a5215177b 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -408,10 +408,8 @@ nfnl_cthelper_update(const struct nlattr * const tb[], return 0; } -static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_cthelper_new(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { const char *helper_name; struct nf_conntrack_helper *cur, *helper = NULL; @@ -441,7 +439,7 @@ static int nfnl_cthelper_new(struct net *net, struct sock *nfnl, tuple.dst.protonum != cur->tuple.dst.protonum)) continue; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; helper 
= cur; @@ -607,10 +605,8 @@ out: return skb->len; } -static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_cthelper_get(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { int ret = -ENOENT; struct nf_conntrack_helper *cur; @@ -623,11 +619,11 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nfnl_cthelper_dump_table, }; - return netlink_dump_start(nfnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (tb[NFCTH_NAME]) @@ -659,15 +655,15 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, } ret = nfnl_cthelper_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_CTHELPER_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (ret > 0) ret = 0; @@ -678,10 +674,8 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, return ret; } -static int nfnl_cthelper_del(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_cthelper_del(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const tb[]) { char *helper_name = NULL; struct nf_conntrack_helper *cur; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 46da5548d0b3..994f3172bf42 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c 
@@ -83,13 +83,11 @@ err: return ret; } -static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_new_timeout(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); __u16 l3num; __u8 l4num; const struct nf_conntrack_l4proto *l4proto; @@ -111,7 +109,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; matching = timeout; @@ -119,7 +117,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, } if (matching) { - if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { /* You cannot replace one timeout policy by another of * different kind, sorry. 
*/ @@ -129,7 +127,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, return ctnl_timeout_parse_policy(&matching->timeout.data, matching->timeout.l4proto, - net, cda[CTA_TIMEOUT_DATA]); + info->net, + cda[CTA_TIMEOUT_DATA]); } return -EBUSY; @@ -150,8 +149,8 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, goto err_proto_put; } - ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, net, - cda[CTA_TIMEOUT_DATA]); + ret = ctnl_timeout_parse_policy(&timeout->timeout.data, l4proto, + info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; @@ -248,22 +247,20 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_get_timeout(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); int ret = -ENOENT; char *name; struct ctnl_timeout *cur; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnl_timeout_dump, }; - return netlink_dump_start(ctnl, skb, nlh, &c); + return netlink_dump_start(info->sk, skb, info->nlh, &c); } if (!cda[CTA_TIMEOUT_NAME]) @@ -283,15 +280,15 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, } ret = ctnl_timeout_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_NEW, cur); if (ret <= 0) { kfree_skb(skb2); break; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + 
MSG_DONTWAIT); if (ret > 0) ret = 0; @@ -320,13 +317,11 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) return ret; } -static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_del_timeout(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { - struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net); + struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(info->net); struct ctnl_timeout *cur, *tmp; int ret = -ENOENT; char *name; @@ -334,7 +329,7 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, if (!cda[CTA_TIMEOUT_NAME]) { list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) - ctnl_timeout_try_del(net, cur); + ctnl_timeout_try_del(info->net, cur); return 0; } @@ -344,7 +339,7 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0) continue; - ret = ctnl_timeout_try_del(net, cur); + ret = ctnl_timeout_try_del(info->net, cur); if (ret < 0) return ret; @@ -353,11 +348,9 @@ static int cttimeout_del_timeout(struct net *net, struct sock *ctnl, return ret; } -static int cttimeout_default_set(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_default_set(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; __u8 l4num; @@ -377,7 +370,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl, goto err; } - ret = ctnl_timeout_parse_policy(NULL, l4proto, net, + ret = ctnl_timeout_parse_policy(NULL, l4proto, info->net, cda[CTA_TIMEOUT_DATA]); if (ret < 0) goto err; @@ -427,11 +420,9 @@ nla_put_failure: return 
-1; } -static int cttimeout_default_get(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack) +static int cttimeout_default_get(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const cda[]) { const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; @@ -453,35 +444,35 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, switch (l4proto->l4proto) { case IPPROTO_ICMP: - timeouts = &nf_icmp_pernet(net)->timeout; + timeouts = &nf_icmp_pernet(info->net)->timeout; break; case IPPROTO_TCP: - timeouts = nf_tcp_pernet(net)->timeouts; + timeouts = nf_tcp_pernet(info->net)->timeouts; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: - timeouts = nf_udp_pernet(net)->timeouts; + timeouts = nf_udp_pernet(info->net)->timeouts; break; case IPPROTO_DCCP: #ifdef CONFIG_NF_CT_PROTO_DCCP - timeouts = nf_dccp_pernet(net)->dccp_timeout; + timeouts = nf_dccp_pernet(info->net)->dccp_timeout; #endif break; case IPPROTO_ICMPV6: - timeouts = &nf_icmpv6_pernet(net)->timeout; + timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; case IPPROTO_SCTP: #ifdef CONFIG_NF_CT_PROTO_SCTP - timeouts = nf_sctp_pernet(net)->timeouts; + timeouts = nf_sctp_pernet(info->net)->timeouts; #endif break; case IPPROTO_GRE: #ifdef CONFIG_NF_CT_PROTO_GRE - timeouts = nf_gre_pernet(net)->timeouts; + timeouts = nf_gre_pernet(info->net)->timeouts; #endif break; case 255: - timeouts = &nf_generic_pernet(net)->timeout; + timeouts = &nf_generic_pernet(info->net)->timeout; break; default: WARN_ONCE(1, "Missing timeouts for proto %d", l4proto->l4proto); @@ -497,9 +488,10 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, goto err; } - ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), + ret = cttimeout_default_fill_info(info->net, skb2, + NETLINK_CB(skb).portid, + 
info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), IPCTNL_MSG_TIMEOUT_DEFAULT_SET, l3num, l4proto, timeouts); if (ret <= 0) { @@ -507,7 +499,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, err = -ENOMEM; goto err; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret > 0) ret = 0; diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index d5f458d0ff3d..81630600b4ef 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -845,10 +845,8 @@ static struct notifier_block nfulnl_rtnl_notifier = { .notifier_call = nfulnl_rcv_nl_event, }; -static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfulnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfula[]) { return -ENOTSUPP; } @@ -869,18 +867,16 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = { [NFULA_CFG_FLAGS] = { .type = NLA_U16 }, }; -static int nfulnl_recv_config(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfula[], - struct netlink_ext_ack *extack) +static int nfulnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfula[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nfnl_log_net *log = nfnl_log_pernet(info->net); + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u_int16_t group_num = ntohs(nfmsg->res_id); - struct nfulnl_instance *inst; struct nfulnl_msg_config_cmd *cmd = NULL; - struct nfnl_log_net *log = nfnl_log_pernet(net); - int ret = 0; + struct nfulnl_instance *inst; u16 flags = 0; + int ret = 0; if (nfula[NFULA_CFG_CMD]) { u_int8_t pf = nfmsg->nfgen_family; @@ -889,9 +885,9 @@ 
static int nfulnl_recv_config(struct net *net, struct sock *ctnl, /* Commands without queue context */ switch (cmd->command) { case NFULNL_CFG_CMD_PF_BIND: - return nf_log_bind_pf(net, pf, &nfulnl_logger); + return nf_log_bind_pf(info->net, pf, &nfulnl_logger); case NFULNL_CFG_CMD_PF_UNBIND: - nf_log_unbind_pf(net, pf); + nf_log_unbind_pf(info->net, pf); return 0; } } @@ -932,7 +928,7 @@ static int nfulnl_recv_config(struct net *net, struct sock *ctnl, goto out_put; } - inst = instance_create(net, group_num, + inst = instance_create(info->net, group_num, NETLINK_CB(skb).portid, sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 916a3c7f9eaf..1fd537ef4496 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -292,10 +292,9 @@ static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, }; -static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[], - struct netlink_ext_ack *extack) +static int nfnl_osf_add_callback(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; @@ -307,7 +306,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); @@ -325,7 +324,7 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, kfree(kf); kf = NULL; - if (nlh->nlmsg_flags & NLM_F_EXCL) + if (info->nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } @@ -339,11 +338,9 @@ static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl, return err; } -static int 
nfnl_osf_remove_callback(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const osf_attrs[], - struct netlink_ext_ack *extack) +static int nfnl_osf_remove_callback(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *sf; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 37e81d895e61..9d7e06d85199 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1245,16 +1245,14 @@ static const struct nf_queue_handler nfqh = { .nf_hook_drop = nfqnl_nf_hook_drop, }; -static int nfqnl_recv_config(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); - struct nfqnl_instance *queue; struct nfqnl_msg_config_cmd *cmd = NULL; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); + struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; int ret = 0; -- cgit v1.2.3 From 797d49805ddc6595b2fafe3e9ceff7f562be1f2c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 23 Apr 2021 00:17:10 +0200 Subject: netfilter: nfnetlink: pass struct nfnl_info to rcu callbacks Update rcu callbacks to use the nfnl_info structure. 
Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 6 +- net/netfilter/nf_tables_api.c | 152 ++++++++++++++++++------------------ net/netfilter/nfnetlink.c | 5 +- net/netfilter/nfnetlink_queue.c | 40 ++++------ net/netfilter/nft_compat.c | 24 +++--- 5 files changed, 107 insertions(+), 120 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 1baa3205b199..c11f2f99eac4 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -17,10 +17,8 @@ struct nfnl_info { struct nfnl_callback { int (*call)(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]); - int (*call_rcu)(struct net *net, struct sock *nl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack); + int (*call_rcu)(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]); int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[], diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 155b85553fcc..f7c4e6f14130 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -858,25 +858,25 @@ static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, } /* called with rcu_read_lock held */ -static int nf_tables_gettable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = 
nft_genmask_cur(info->net); + int family = nfmsg->nfgen_family; const struct nft_table *table; + struct net *net = info->net; struct sk_buff *skb2; - int family = nfmsg->nfgen_family; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_tables, .module = THIS_MODULE, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0); @@ -890,8 +890,8 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWTABLE, 0, - family, table); + info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE, + 0, family, table); if (err < 0) goto err_fill_table_info; @@ -1623,26 +1623,26 @@ done: } /* called with rcu_read_lock held */ -static int nf_tables_getchain(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + int family = nfmsg->nfgen_family; const struct nft_chain *chain; + struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; - int family = nfmsg->nfgen_family; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_chains, .module = THIS_MODULE, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } 
table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0); @@ -1662,8 +1662,8 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, 0, - family, table, chain); + info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, + 0, family, table, chain); if (err < 0) goto err_fill_chain_info; @@ -3076,21 +3076,21 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getrule(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + int family = nfmsg->nfgen_family; const struct nft_chain *chain; const struct nft_rule *rule; + struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; - int family = nfmsg->nfgen_family; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start= nf_tables_dump_rules_start, .dump = nf_tables_dump_rules, @@ -3099,7 +3099,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, .data = (void *)nla, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0); @@ -3125,7 +3125,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 
NFT_MSG_NEWRULE, 0, + info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, family, table, chain, rule, NULL); if (err < 0) goto err_fill_rule_info; @@ -4045,25 +4045,25 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getset(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + struct net *net = info->net; const struct nft_set *set; - struct nft_ctx ctx; struct sk_buff *skb2; - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_ctx ctx; int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, genmask, 0); if (err < 0) return err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_sets_start, .dump = nf_tables_dump_sets, @@ -4072,7 +4072,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, .module = THIS_MODULE, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } /* Only accept unspec with dump */ @@ -5063,18 +5063,19 @@ err_fill_setelem: } /* called with rcu_read_lock held */ -static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr 
* const nla[]) { - u8 genmask = nft_genmask_cur(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); + struct net *net = info->net; struct nft_set *set; struct nlattr *attr; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -5083,7 +5084,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, if (IS_ERR(set)) return PTR_ERR(set); - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_set_start, .dump = nf_tables_dump_set, @@ -5096,7 +5097,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, }; c.data = &dump_ctx; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) @@ -6416,22 +6417,22 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_cur(info->net); int family = nfmsg->nfgen_family; const struct nft_table *table; + struct net *net = info->net; struct nft_object *obj; struct sk_buff *skb2; bool reset = false; u32 objtype; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct 
netlink_dump_control c = { .start = nf_tables_dump_obj_start, .dump = nf_tables_dump_obj, @@ -6440,7 +6441,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, .data = (void *)nla, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_OBJ_NAME] || @@ -6464,7 +6465,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, if (!skb2) return -ENOMEM; - if (NFNL_MSG_TYPE(nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET) reset = true; if (reset) { @@ -6483,7 +6484,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, } err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, + info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0, family, table, obj, reset); if (err < 0) goto err_fill_obj_info; @@ -7320,21 +7321,20 @@ static int nf_tables_dump_flowtable_done(struct netlink_callback *cb) } /* called with rcu_read_lock held */ -static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getflowtable(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_cur(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u8 genmask = nft_genmask_cur(info->net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; const struct nft_table *table; + struct net *net = info->net; struct sk_buff *skb2; int err; - if (nlh->nlmsg_flags & NLM_F_DUMP) { + if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = nf_tables_dump_flowtable_start, .dump = nf_tables_dump_flowtable, @@ -7343,7 +7343,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock 
*nlsk, .data = (void *)nla, }; - return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c); } if (!nla[NFTA_FLOWTABLE_NAME]) @@ -7364,7 +7364,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, return -ENOMEM; err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, + info->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, flowtable, &flowtable->hook_list); if (err < 0) @@ -7526,10 +7526,8 @@ err: -ENOBUFS); } -static int nf_tables_getgen(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { struct sk_buff *skb2; int err; @@ -7538,12 +7536,12 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk, if (skb2 == NULL) return -ENOMEM; - err = nf_tables_fill_gen_info(skb2, net, NETLINK_CB(skb).portid, - nlh->nlmsg_seq); + err = nf_tables_fill_gen_info(skb2, info->net, NETLINK_CB(skb).portid, + info->nlh->nlmsg_seq); if (err < 0) goto err_fill_gen_info; - return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); err_fill_gen_info: kfree_skb(skb2); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 5f04b67bf47e..7920f6c4ff69 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -274,9 +274,8 @@ replay: } if (nc->call_rcu) { - err = nc->call_rcu(net, nfnlnet->nfnl, skb, nlh, - (const struct nlattr **)cda, - extack); + err = nc->call_rcu(skb, &info, + (const struct nlattr **)cda); rcu_read_unlock(); } else { rcu_read_unlock(); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 9d7e06d85199..ede9252c8de1 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c 
@@ -1046,20 +1046,18 @@ static int nfq_id_after(unsigned int id, unsigned int max) return (int)(id - max) > 0; } -static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_verdict_batch(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + u16 queue_num = ntohs(nfmsg->res_id); struct nf_queue_entry *entry, *tmp; - unsigned int verdict, maxid; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; + unsigned int verdict, maxid; LIST_HEAD(batch_list); - u16 queue_num = ntohs(nfmsg->res_id); - struct nfnl_queue_net *q = nfnl_queue_pernet(net); queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); @@ -1158,22 +1156,19 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, return 0; } -static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nfqa[]) { - struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); + struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); u_int16_t queue_num = ntohs(nfmsg->res_id); struct nfqnl_msg_verdict_hdr *vhdr; + enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; - unsigned int verdict; struct nf_queue_entry *entry; - enum ip_conntrack_info ctinfo; struct nfnl_ct_hook *nfnl_ct; struct nf_conn *ct = NULL; - struct nfnl_queue_net *q = nfnl_queue_pernet(net); + unsigned int verdict; int err; queue = verdict_instance_lookup(q, queue_num, @@ -1196,7 +1191,8 @@ static int 
nfqnl_recv_verdict(struct net *net, struct sock *ctnl, if (nfqa[NFQA_CT]) { if (nfnl_ct != NULL) - ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo); + ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry, + &ctinfo); } if (entry->state.pf == PF_BRIDGE) { @@ -1224,10 +1220,8 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, return 0; } -static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nfqa[], - struct netlink_ext_ack *extack) +static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]) { return -ENOTSUPP; } diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index b8dbd20a6a4c..4c0657245d5a 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -613,17 +613,15 @@ nla_put_failure: return -1; } -static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_compat_get_rcu(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const tb[]) { - int ret = 0, target; struct nfgenmsg *nfmsg; - const char *fmt; - const char *name; - u32 rev; + const char *name, *fmt; struct sk_buff *skb2; + int ret = 0, target; + u32 rev; if (tb[NFTA_COMPAT_NAME] == NULL || tb[NFTA_COMPAT_REV] == NULL || @@ -634,7 +632,7 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); - nfmsg = nlmsg_data(nlh); + nfmsg = nlmsg_data(info->nlh); switch(nfmsg->nfgen_family) { case AF_INET: @@ -673,8 +671,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, /* include the best revision for this extension in the message */ if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, - 
NFNL_MSG_TYPE(nlh->nlmsg_type), + info->nlh->nlmsg_seq, + NFNL_MSG_TYPE(info->nlh->nlmsg_type), NFNL_MSG_COMPAT_GET, nfmsg->nfgen_family, name, ret, target) <= 0) { @@ -682,8 +680,8 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, goto out_put; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, + MSG_DONTWAIT); if (ret > 0) ret = 0; out_put: -- cgit v1.2.3 From 7dab8ee3b6e7ec856a616d07ebb9ebd736c92520 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 23 Apr 2021 00:17:11 +0200 Subject: netfilter: nfnetlink: pass struct nfnl_info to batch callbacks Update batch callbacks to use the nfnl_info structure. Rename one clashing info variable to expr_info. Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 6 +- net/netfilter/nf_tables_api.c | 338 ++++++++++++++++++------------------ net/netfilter/nfnetlink.c | 14 +- 3 files changed, 182 insertions(+), 176 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index c11f2f99eac4..df0e3254c57b 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -19,10 +19,8 @@ struct nfnl_callback { const struct nlattr * const cda[]); int (*call_rcu)(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]); - int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const cda[], - struct netlink_ext_ack *extack); + int (*call_batch)(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const cda[]); const struct nla_policy *policy; /* netlink attribute policy */ const u_int16_t attr_count; /* number of nlattr's */ }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f7c4e6f14130..280ca136df56 100644 --- a/net/netfilter/nf_tables_api.c +++ 
b/net/netfilter/nf_tables_api.c @@ -1055,15 +1055,15 @@ static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg, return strcmp(obj->key.name, k->name); } -static int nf_tables_newtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - struct nftables_pernet *nft_net = nft_pernet(net); - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + struct nftables_pernet *nft_net = nft_pernet(info->net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; @@ -1078,14 +1078,15 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (PTR_ERR(table) != -ENOENT) return PTR_ERR(table); } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); + return nf_tables_updtable(&ctx); } @@ -1126,7 +1127,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table->flags & NFT_TABLE_F_OWNER) table->nlpid = NETLINK_CB(skb).portid; - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) goto err_trans; @@ -1250,19 +1251,19 @@ out: return err; } -static int nf_tables_deltable(struct net *net, struct 
sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; - nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla); if (family == AF_UNSPEC || (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])) return nft_flush(&ctx, family); @@ -1281,7 +1282,7 @@ static int nf_tables_deltable(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - if (nlh->nlmsg_flags & NLM_F_NONREC && + if (info->nlh->nlmsg_flags & NLM_F_NONREC && table->use > 0) return -EBUSY; @@ -2350,16 +2351,16 @@ static struct nft_chain *nft_chain_lookup_byid(const struct net *net, return ERR_PTR(-ENOENT); } -static int nf_tables_newchain(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - struct nftables_pernet *nft_net = nft_pernet(net); - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + struct nftables_pernet *nft_net = nft_pernet(info->net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; struct nft_chain *chain = NULL; + struct net *net = info->net; const struct nlattr *attr; 
struct nft_table *table; u8 policy = NF_ACCEPT; @@ -2431,14 +2432,14 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, if (flags & ~NFT_CHAIN_FLAGS) return -EOPNOTSUPP; - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain != NULL) { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; flags |= chain->flags & NFT_CHAIN_BASE; @@ -2449,14 +2450,14 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return nf_tables_addchain(&ctx, family, genmask, policy, flags); } -static int nf_tables_delchain(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_chain *chain; @@ -2486,11 +2487,11 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } - if (nlh->nlmsg_flags & NLM_F_NONREC && + if (info->nlh->nlmsg_flags & NLM_F_NONREC && chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); use = chain->use; list_for_each_entry(rule, &chain->rules, list) { @@ -2713,15 +2714,15 @@ err1: } static int nf_tables_newexpr(const struct nft_ctx *ctx, - const 
struct nft_expr_info *info, + const struct nft_expr_info *expr_info, struct nft_expr *expr) { - const struct nft_expr_ops *ops = info->ops; + const struct nft_expr_ops *ops = expr_info->ops; int err; expr->ops = ops; if (ops->init) { - err = ops->init(ctx, expr, (const struct nlattr **)info->tb); + err = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb); if (err < 0) goto err1; } @@ -2745,21 +2746,21 @@ static void nf_tables_expr_destroy(const struct nft_ctx *ctx, static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, const struct nlattr *nla) { - struct nft_expr_info info; + struct nft_expr_info expr_info; struct nft_expr *expr; struct module *owner; int err; - err = nf_tables_expr_parse(ctx, nla, &info); + err = nf_tables_expr_parse(ctx, nla, &expr_info); if (err < 0) goto err1; err = -ENOMEM; - expr = kzalloc(info.ops->size, GFP_KERNEL); + expr = kzalloc(expr_info.ops->size, GFP_KERNEL); if (expr == NULL) goto err2; - err = nf_tables_newexpr(ctx, &info, expr); + err = nf_tables_newexpr(ctx, &expr_info, expr); if (err < 0) goto err3; @@ -2767,9 +2768,9 @@ static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, err3: kfree(expr); err2: - owner = info.ops->type->owner; - if (info.ops->type->release_ops) - info.ops->type->release_ops(info.ops); + owner = expr_info.ops->type->owner; + if (expr_info.ops->type->release_ops) + expr_info.ops->type->release_ops(expr_info.ops); module_put(owner); err1: @@ -3216,28 +3217,28 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, #define NFT_RULE_MAXEXPRS 128 -static int nf_tables_newrule(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - struct nftables_pernet *nft_net = nft_pernet(net); - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = 
nft_genmask_next(net); - struct nft_expr_info *info = NULL; + struct nftables_pernet *nft_net = nft_pernet(info->net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + unsigned int size, i, n, ulen = 0, usize = 0; + u8 genmask = nft_genmask_next(info->net); + struct nft_rule *rule, *old_rule = NULL; + struct nft_expr_info *expr_info = NULL; int family = nfmsg->nfgen_family; + struct net *net = info->net; struct nft_flow_rule *flow; + struct nft_userdata *udata; struct nft_table *table; struct nft_chain *chain; - struct nft_rule *rule, *old_rule = NULL; - struct nft_userdata *udata; - struct nft_trans *trans = NULL; + struct nft_trans *trans; + u64 handle, pos_handle; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; - unsigned int size, i, n, ulen = 0, usize = 0; int err, rem; - u64 handle, pos_handle; lockdep_assert_held(&nft_net->commit_mutex); @@ -3276,17 +3277,17 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, return PTR_ERR(rule); } - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) old_rule = rule; else return -EOPNOTSUPP; } else { - if (!(nlh->nlmsg_flags & NLM_F_CREATE) || - nlh->nlmsg_flags & NLM_F_REPLACE) + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) || + info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EINVAL; handle = nf_tables_alloc_handle(table); @@ -3309,15 +3310,15 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } } - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); n = 0; size = 0; if (nla[NFTA_RULE_EXPRESSIONS]) { - info = kvmalloc_array(NFT_RULE_MAXEXPRS, - sizeof(struct nft_expr_info), - GFP_KERNEL); - if (!info) + expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS, + 
sizeof(struct nft_expr_info), + GFP_KERNEL); + if (!expr_info) return -ENOMEM; nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) { @@ -3326,10 +3327,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, goto err1; if (n == NFT_RULE_MAXEXPRS) goto err1; - err = nf_tables_expr_parse(&ctx, tmp, &info[n]); + err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]); if (err < 0) goto err1; - size += info[n].ops->size; + size += expr_info[n].ops->size; n++; } } @@ -3363,20 +3364,20 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, expr = nft_expr_first(rule); for (i = 0; i < n; i++) { - err = nf_tables_newexpr(&ctx, &info[i], expr); + err = nf_tables_newexpr(&ctx, &expr_info[i], expr); if (err < 0) { - NL_SET_BAD_ATTR(extack, info[i].attr); + NL_SET_BAD_ATTR(extack, expr_info[i].attr); goto err2; } - if (info[i].ops->validate) + if (expr_info[i].ops->validate) nft_validate_state_update(net, NFT_VALIDATE_NEED); - info[i].ops = NULL; + expr_info[i].ops = NULL; expr = nft_expr_next(expr); } - if (nlh->nlmsg_flags & NLM_F_REPLACE) { + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; @@ -3396,7 +3397,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, goto err2; } - if (nlh->nlmsg_flags & NLM_F_APPEND) { + if (info->nlh->nlmsg_flags & NLM_F_APPEND) { if (old_rule) list_add_rcu(&rule->list, &old_rule->list); else @@ -3408,7 +3409,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, list_add_rcu(&rule->list, &chain->rules); } } - kvfree(info); + kvfree(expr_info); chain->use++; if (nft_net->validate_state == NFT_VALIDATE_DO) @@ -3427,13 +3428,14 @@ err2: nf_tables_rule_release(&ctx, rule); err1: for (i = 0; i < n; i++) { - if (info[i].ops) { - module_put(info[i].ops->type->owner); - if (info[i].ops->type->release_ops) - info[i].ops->type->release_ops(info[i].ops); + if (expr_info[i].ops) { + 
module_put(expr_info[i].ops->type->owner); + if (expr_info[i].ops->type->release_ops) + expr_info[i].ops->type->release_ops(expr_info[i].ops); } } - kvfree(info); + kvfree(expr_info); + return err; } @@ -3454,17 +3456,17 @@ static struct nft_rule *nft_rule_lookup_byid(const struct net *net, return ERR_PTR(-ENOENT); } -static int nf_tables_delrule(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); - struct nft_table *table; + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + int family = nfmsg->nfgen_family, err = 0; + u8 genmask = nft_genmask_next(info->net); struct nft_chain *chain = NULL; + struct net *net = info->net; + struct nft_table *table; struct nft_rule *rule; - int family = nfmsg->nfgen_family, err = 0; struct nft_ctx ctx; table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, @@ -3485,7 +3487,7 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk, return -EOPNOTSUPP; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, chain, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { @@ -4166,28 +4168,27 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc, return err; } -static int nf_tables_newset(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = 
nlmsg_data(info->nlh); + u32 ktype, dtype, flags, policy, gc_int, objtype; + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; const struct nft_set_ops *ops; struct nft_expr *expr = NULL; + struct net *net = info->net; + struct nft_set_desc desc; struct nft_table *table; + unsigned char *udata; struct nft_set *set; struct nft_ctx ctx; - char *name; - u64 size; u64 timeout; - u32 ktype, dtype, flags, policy, gc_int, objtype; - struct nft_set_desc desc; - unsigned char *udata; + char *name; + int err, i; u16 udlen; - int err; - int i; + u64 size; if (nla[NFTA_SET_TABLE] == NULL || nla[NFTA_SET_NAME] == NULL || @@ -4295,7 +4296,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { @@ -4304,17 +4305,17 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, return PTR_ERR(set); } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; return 0; } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; ops = nft_select_set_ops(&ctx, nla, &desc, policy); @@ -4448,13 +4449,13 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) kvfree(set); } -static int nf_tables_delset(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg 
*nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + struct net *net = info->net; const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -4465,7 +4466,7 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_setattr(&ctx, net, skb, info->nlh, nla, extack, genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -4483,7 +4484,8 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk, return PTR_ERR(set); } if (set->use || - (nlh->nlmsg_flags & NLM_F_NONREC && atomic_read(&set->nelems) > 0)) { + (info->nlh->nlmsg_flags & NLM_F_NONREC && + atomic_read(&set->nelems) > 0)) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } @@ -5654,13 +5656,14 @@ err_set_elem_expr_clone: return err; } -static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - struct nftables_pernet *nft_net = nft_pernet(net); - u8 genmask = nft_genmask_next(net); + struct nftables_pernet *nft_net = nft_pernet(info->net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + struct net *net = info->net; const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -5669,7 +5672,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, genmask, 
NETLINK_CB(skb).portid); if (err < 0) return err; @@ -5683,7 +5686,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, return -EBUSY; nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); + err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); if (err < 0) return err; } @@ -5866,18 +5869,19 @@ err1: return err; } -static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delsetelem(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - u8 genmask = nft_genmask_next(net); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); + struct net *net = info->net; const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; int rem, err = 0; - err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, extack, + err = nft_ctx_init_from_elemattr(&ctx, net, skb, info->nlh, nla, extack, genmask, NETLINK_CB(skb).portid); if (err < 0) return err; @@ -6161,15 +6165,15 @@ err_free_trans: return err; } -static int nf_tables_newobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); const struct nft_object_type *type; - u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; + struct net *net = info->net; struct nft_table *table; struct nft_object *obj; struct nft_ctx ctx; @@ -6197,20 +6201,20 @@ static int 
nf_tables_newobj(struct net *net, struct sock *nlsk, return err; } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]); return -EEXIST; } - if (nlh->nlmsg_flags & NLM_F_REPLACE) + if (info->nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; type = __nft_obj_type_get(objtype); - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); type = nft_obj_type_get(net, objtype); if (IS_ERR(type)) @@ -6507,14 +6511,14 @@ static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) kfree(obj); } -static int nf_tables_delobj(struct net *net, struct sock *nlsk, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_object *obj; @@ -6550,7 +6554,7 @@ static int nf_tables_delobj(struct net *net, struct sock *nlsk, return -EBUSY; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nft_delobj(&ctx, obj); } @@ -6937,19 +6941,19 @@ err_flowtable_update_hook: } -static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - 
const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_newflowtable(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; struct nft_flowtable_hook flowtable_hook; + u8 genmask = nft_genmask_next(info->net); const struct nf_flowtable_type *type; - u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; struct nft_hook *hook, *next; + struct net *net = info->net; struct nft_table *table; struct nft_ctx ctx; int err; @@ -6975,17 +6979,17 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return err; } } else { - if (nlh->nlmsg_flags & NLM_F_EXCL) { + if (info->nlh->nlmsg_flags & NLM_F_EXCL) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); return -EEXIST; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - return nft_flowtable_update(&ctx, nlh, flowtable); + return nft_flowtable_update(&ctx, info->nlh, flowtable); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL); if (!flowtable) @@ -7126,16 +7130,16 @@ err_flowtable_del_hook: return err; } -static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, - struct sk_buff *skb, - const struct nlmsghdr *nlh, - const struct nlattr * const nla[], - struct netlink_ext_ack *extack) +static int nf_tables_delflowtable(struct sk_buff *skb, + const struct nfnl_info *info, + const struct nlattr * const nla[]) { - const struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u8 genmask = nft_genmask_next(net); + const struct nfgenmsg *nfmsg = nlmsg_data(info->nlh); + struct netlink_ext_ack *extack = info->extack; + u8 
genmask = nft_genmask_next(info->net); int family = nfmsg->nfgen_family; struct nft_flowtable *flowtable; + struct net *net = info->net; const struct nlattr *attr; struct nft_table *table; struct nft_ctx ctx; @@ -7165,7 +7169,7 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, return PTR_ERR(flowtable); } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); if (nla[NFTA_FLOWTABLE_HOOK]) return nft_delflowtable_hook(&ctx, flowtable); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 7920f6c4ff69..e62c5af4b631 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -469,10 +469,17 @@ replay_abort: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); + struct nfnl_net *nfnlnet = nfnl_pernet(net); u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; + struct nfnl_info info = { + .net = net, + .sk = nfnlnet->nfnl, + .nlh = nlh, + .extack = &extack, + }; /* Sanity-check NFTA_MAX_ATTR */ if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { @@ -488,11 +495,8 @@ replay_abort: goto ack; if (nc->call_batch) { - struct nfnl_net *nfnlnet = nfnl_pernet(net); - - err = nc->call_batch(net, nfnlnet->nfnl, skb, nlh, - (const struct nlattr **)cda, - &extack); + err = nc->call_batch(skb, &info, + (const struct nlattr **)cda); } /* The lock was released to autoload some module, we -- cgit v1.2.3 From 50f2db9e368f73ecbbaa92da365183fa953aaba7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 23 Apr 2021 00:17:12 +0200 Subject: netfilter: nfnetlink: consolidate callback types Add enum nfnl_callback_type to identify the callback type to provide one single callback. 
Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 16 ++++--- net/netfilter/ipset/ip_set_core.c | 16 +++++++ net/netfilter/nf_conntrack_netlink.c | 88 +++++++++++++++++++++++++----------- net/netfilter/nf_tables_api.c | 69 ++++++++++++++++++---------- net/netfilter/nfnetlink.c | 37 +++++++++------ net/netfilter/nfnetlink_acct.c | 36 ++++++++++----- net/netfilter/nfnetlink_cthelper.c | 27 +++++++---- net/netfilter/nfnetlink_cttimeout.c | 45 ++++++++++++------ net/netfilter/nfnetlink_log.c | 16 +++++-- net/netfilter/nfnetlink_osf.c | 2 + net/netfilter/nfnetlink_queue.c | 34 +++++++++----- net/netfilter/nft_compat.c | 9 ++-- 12 files changed, 271 insertions(+), 124 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index df0e3254c57b..515ce53aa20d 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -14,15 +14,19 @@ struct nfnl_info { struct netlink_ext_ack *extack; }; +enum nfnl_callback_type { + NFNL_CB_UNSPEC = 0, + NFNL_CB_MUTEX, + NFNL_CB_RCU, + NFNL_CB_BATCH, +}; + struct nfnl_callback { int (*call)(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]); - int (*call_rcu)(struct sk_buff *skb, const struct nfnl_info *info, - const struct nlattr * const cda[]); - int (*call_batch)(struct sk_buff *skb, const struct nfnl_info *info, - const struct nlattr * const cda[]); - const struct nla_policy *policy; /* netlink attribute policy */ - const u_int16_t attr_count; /* number of nlattr's */ + const struct nla_policy *policy; + enum nfnl_callback_type type; + __u16 attr_count; }; enum nfnl_abort_action { diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index bf9902c1daa8..de2d20c37cda 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -2108,80 +2108,96 @@ nlmsg_failure: static const struct nfnl_callback 
ip_set_netlink_subsys_cb[IPSET_MSG_MAX] = { [IPSET_CMD_NONE] = { .call = ip_set_none, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, }, [IPSET_CMD_CREATE] = { .call = ip_set_create, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_create_policy, }, [IPSET_CMD_DESTROY] = { .call = ip_set_destroy, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_FLUSH] = { .call = ip_set_flush, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_RENAME] = { .call = ip_set_rename, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_SWAP] = { .call = ip_set_swap, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname2_policy, }, [IPSET_CMD_LIST] = { .call = ip_set_dump, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_dump_policy, }, [IPSET_CMD_SAVE] = { .call = ip_set_dump, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_ADD] = { .call = ip_set_uadd, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_DEL] = { .call = ip_set_udel, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_TEST] = { .call = ip_set_utest, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_adt_policy, }, [IPSET_CMD_HEADER] = { .call = ip_set_header, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_TYPE] = { .call = ip_set_type, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_type_policy, }, [IPSET_CMD_PROTOCOL] = { .call = ip_set_protocol, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_protocol_policy, }, [IPSET_CMD_GET_BYNAME] = { .call = ip_set_byname, + .type = 
NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_setname_policy, }, [IPSET_CMD_GET_BYINDEX] = { .call = ip_set_byindex, + .type = NFNL_CB_MUTEX, .attr_count = IPSET_ATTR_CMD_MAX, .policy = ip_set_index_policy, }, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 5147a63b3d1b..8690fc07030f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3751,35 +3751,71 @@ static struct nf_exp_event_notifier ctnl_notifier_exp = { #endif static const struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { - [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, - .attr_count = CTA_MAX, - .policy = ct_nla_policy }, - [IPCTNL_MSG_CT_GET_STATS_CPU] = { .call = ctnetlink_stat_ct_cpu }, - [IPCTNL_MSG_CT_GET_STATS] = { .call = ctnetlink_stat_ct }, - [IPCTNL_MSG_CT_GET_DYING] = { .call = ctnetlink_get_ct_dying }, - [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { .call = ctnetlink_get_ct_unconfirmed }, + [IPCTNL_MSG_CT_NEW] = { + .call = ctnetlink_new_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_GET] = { + .call = ctnetlink_get_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_DELETE] = { + .call = ctnetlink_del_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { + .call = ctnetlink_get_conntrack, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_MAX, + .policy = ct_nla_policy + }, + [IPCTNL_MSG_CT_GET_STATS_CPU] = { + .call = ctnetlink_stat_ct_cpu, + .type = 
NFNL_CB_MUTEX, + }, + [IPCTNL_MSG_CT_GET_STATS] = { + .call = ctnetlink_stat_ct, + .type = NFNL_CB_MUTEX, + }, + [IPCTNL_MSG_CT_GET_DYING] = { + .call = ctnetlink_get_ct_dying, + .type = NFNL_CB_MUTEX, + }, + [IPCTNL_MSG_CT_GET_UNCONFIRMED] = { + .call = ctnetlink_get_ct_unconfirmed, + .type = NFNL_CB_MUTEX, + }, }; static const struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { - [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, - .attr_count = CTA_EXPECT_MAX, - .policy = exp_nla_policy }, - [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, - .attr_count = CTA_EXPECT_MAX, - .policy = exp_nla_policy }, - [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, - .attr_count = CTA_EXPECT_MAX, - .policy = exp_nla_policy }, - [IPCTNL_MSG_EXP_GET_STATS_CPU] = { .call = ctnetlink_stat_exp_cpu }, + [IPCTNL_MSG_EXP_GET] = { + .call = ctnetlink_get_expect, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy + }, + [IPCTNL_MSG_EXP_NEW] = { + .call = ctnetlink_new_expect, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy + }, + [IPCTNL_MSG_EXP_DELETE] = { + .call = ctnetlink_del_expect, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_EXPECT_MAX, + .policy = exp_nla_policy + }, + [IPCTNL_MSG_EXP_GET_STATS_CPU] = { + .call = ctnetlink_stat_exp_cpu, + .type = NFNL_CB_MUTEX, + }, }; static const struct nfnetlink_subsystem ctnl_subsys = { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 280ca136df56..1050f23c0d29 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7554,115 +7554,138 @@ err_fill_gen_info: static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { [NFT_MSG_NEWTABLE] = { - .call_batch = nf_tables_newtable, + .call = nf_tables_newtable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_GETTABLE] = { - .call_rcu = nf_tables_gettable, + .call = nf_tables_gettable, + .type = 
NFNL_CB_RCU, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_DELTABLE] = { - .call_batch = nf_tables_deltable, + .call = nf_tables_deltable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, [NFT_MSG_NEWCHAIN] = { - .call_batch = nf_tables_newchain, + .call = nf_tables_newchain, + .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_GETCHAIN] = { - .call_rcu = nf_tables_getchain, + .call = nf_tables_getchain, + .type = NFNL_CB_RCU, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_DELCHAIN] = { - .call_batch = nf_tables_delchain, + .call = nf_tables_delchain, + .type = NFNL_CB_BATCH, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, [NFT_MSG_NEWRULE] = { - .call_batch = nf_tables_newrule, + .call = nf_tables_newrule, + .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_GETRULE] = { - .call_rcu = nf_tables_getrule, + .call = nf_tables_getrule, + .type = NFNL_CB_RCU, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_DELRULE] = { - .call_batch = nf_tables_delrule, + .call = nf_tables_delrule, + .type = NFNL_CB_BATCH, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, [NFT_MSG_NEWSET] = { - .call_batch = nf_tables_newset, + .call = nf_tables_newset, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_GETSET] = { - .call_rcu = nf_tables_getset, + .call = nf_tables_getset, + .type = NFNL_CB_RCU, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_DELSET] = { - .call_batch = nf_tables_delset, + .call = nf_tables_delset, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, [NFT_MSG_NEWSETELEM] = { - .call_batch = nf_tables_newsetelem, + .call = nf_tables_newsetelem, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM] = { - 
.call_rcu = nf_tables_getsetelem, + .call = nf_tables_getsetelem, + .type = NFNL_CB_RCU, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_DELSETELEM] = { - .call_batch = nf_tables_delsetelem, + .call = nf_tables_delsetelem, + .type = NFNL_CB_BATCH, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETGEN] = { - .call_rcu = nf_tables_getgen, + .call = nf_tables_getgen, + .type = NFNL_CB_RCU, }, [NFT_MSG_NEWOBJ] = { - .call_batch = nf_tables_newobj, + .call = nf_tables_newobj, + .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ] = { - .call_rcu = nf_tables_getobj, + .call = nf_tables_getobj, + .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_DELOBJ] = { - .call_batch = nf_tables_delobj, + .call = nf_tables_delobj, + .type = NFNL_CB_BATCH, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call_rcu = nf_tables_getobj, + .call = nf_tables_getobj, + .type = NFNL_CB_RCU, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, [NFT_MSG_NEWFLOWTABLE] = { - .call_batch = nf_tables_newflowtable, + .call = nf_tables_newflowtable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_GETFLOWTABLE] = { - .call_rcu = nf_tables_getflowtable, + .call = nf_tables_getflowtable, + .type = NFNL_CB_RCU, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, [NFT_MSG_DELFLOWTABLE] = { - .call_batch = nf_tables_delflowtable, + .call = nf_tables_delflowtable, + .type = NFNL_CB_BATCH, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e62c5af4b631..d7a9628b6cee 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -273,23 +273,30 @@ replay: return err; } - if (nc->call_rcu) { - err = nc->call_rcu(skb, &info, - (const struct 
nlattr **)cda); + if (!nc->call) { rcu_read_unlock(); - } else { + return -EINVAL; + } + + switch (nc->type) { + case NFNL_CB_RCU: + err = nc->call(skb, &info, (const struct nlattr **)cda); + rcu_read_unlock(); + break; + case NFNL_CB_MUTEX: rcu_read_unlock(); nfnl_lock(subsys_id); if (nfnl_dereference_protected(subsys_id) != ss || nfnetlink_find_client(type, ss) != nc) { err = -EAGAIN; - } else if (nc->call) { - err = nc->call(skb, &info, - (const struct nlattr **)cda); - } else { - err = -EINVAL; + break; } + err = nc->call(skb, &info, (const struct nlattr **)cda); nfnl_unlock(subsys_id); + break; + default: + err = -EINVAL; + break; } if (err == -EAGAIN) goto replay; @@ -467,12 +474,17 @@ replay_abort: goto ack; } + if (nc->type != NFNL_CB_BATCH) { + err = -EINVAL; + goto ack; + } + { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); struct nfnl_net *nfnlnet = nfnl_pernet(net); - u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; + u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); int attrlen = nlh->nlmsg_len - min_len; struct nfnl_info info = { .net = net, @@ -494,10 +506,7 @@ replay_abort: if (err < 0) goto ack; - if (nc->call_batch) { - err = nc->call_batch(skb, &info, - (const struct nlattr **)cda); - } + err = nc->call(skb, &info, (const struct nlattr **)cda); /* The lock was released to autoload some module, we * have to abort and start from scratch using the diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 9cb4b21b8e95..3c8cf8748cfb 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -382,18 +382,30 @@ static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { }; static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { - [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, - [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get, - .attr_count = 
NFACCT_MAX, - .policy = nfnl_acct_policy }, - [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, - [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del, - .attr_count = NFACCT_MAX, - .policy = nfnl_acct_policy }, + [NFNL_MSG_ACCT_NEW] = { + .call = nfnl_acct_new, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, + [NFNL_MSG_ACCT_GET] = { + .call = nfnl_acct_get, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, + [NFNL_MSG_ACCT_GET_CTRZERO] = { + .call = nfnl_acct_get, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, + [NFNL_MSG_ACCT_DEL] = { + .call = nfnl_acct_del, + .type = NFNL_CB_MUTEX, + .attr_count = NFACCT_MAX, + .policy = nfnl_acct_policy + }, }; static const struct nfnetlink_subsystem nfnl_acct_subsys = { diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 3d1a5215177b..322ac5dd5402 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -737,15 +737,24 @@ static const struct nla_policy nfnl_cthelper_policy[NFCTH_MAX+1] = { }; static const struct nfnl_callback nfnl_cthelper_cb[NFNL_MSG_CTHELPER_MAX] = { - [NFNL_MSG_CTHELPER_NEW] = { .call = nfnl_cthelper_new, - .attr_count = NFCTH_MAX, - .policy = nfnl_cthelper_policy }, - [NFNL_MSG_CTHELPER_GET] = { .call = nfnl_cthelper_get, - .attr_count = NFCTH_MAX, - .policy = nfnl_cthelper_policy }, - [NFNL_MSG_CTHELPER_DEL] = { .call = nfnl_cthelper_del, - .attr_count = NFCTH_MAX, - .policy = nfnl_cthelper_policy }, + [NFNL_MSG_CTHELPER_NEW] = { + .call = nfnl_cthelper_new, + .type = NFNL_CB_MUTEX, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy + }, + [NFNL_MSG_CTHELPER_GET] = { + .call = nfnl_cthelper_get, + .type = NFNL_CB_MUTEX, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy + }, + [NFNL_MSG_CTHELPER_DEL] = { + .call = nfnl_cthelper_del, + .type = 
NFNL_CB_MUTEX, + .attr_count = NFCTH_MAX, + .policy = nfnl_cthelper_policy + }, }; static const struct nfnetlink_subsystem nfnl_cthelper_subsys = { diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 994f3172bf42..38848ad68899 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -546,21 +546,36 @@ static void ctnl_timeout_put(struct nf_ct_timeout *t) } static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = { - [IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_GET] = { .call = cttimeout_get_timeout, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_DELETE] = { .call = cttimeout_del_timeout, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_DEFAULT_SET]= { .call = cttimeout_default_set, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, - [IPCTNL_MSG_TIMEOUT_DEFAULT_GET]= { .call = cttimeout_default_get, - .attr_count = CTA_TIMEOUT_MAX, - .policy = cttimeout_nla_policy }, + [IPCTNL_MSG_TIMEOUT_NEW] = { + .call = cttimeout_new_timeout, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_GET] = { + .call = cttimeout_get_timeout, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_DELETE] = { + .call = cttimeout_del_timeout, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_SET] = { + .call = cttimeout_default_set, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + }, + [IPCTNL_MSG_TIMEOUT_DEFAULT_GET] = { + .call = cttimeout_default_get, + .type = NFNL_CB_MUTEX, + .attr_count = CTA_TIMEOUT_MAX, + .policy = cttimeout_nla_policy + 
}, }; static const struct nfnetlink_subsystem cttimeout_subsys = { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 81630600b4ef..587086b18c36 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -989,11 +989,17 @@ out: } static const struct nfnl_callback nfulnl_cb[NFULNL_MSG_MAX] = { - [NFULNL_MSG_PACKET] = { .call = nfulnl_recv_unsupp, - .attr_count = NFULA_MAX, }, - [NFULNL_MSG_CONFIG] = { .call = nfulnl_recv_config, - .attr_count = NFULA_CFG_MAX, - .policy = nfula_cfg_policy }, + [NFULNL_MSG_PACKET] = { + .call = nfulnl_recv_unsupp, + .type = NFNL_CB_MUTEX, + .attr_count = NFULA_MAX, + }, + [NFULNL_MSG_CONFIG] = { + .call = nfulnl_recv_config, + .type = NFNL_CB_MUTEX, + .attr_count = NFULA_CFG_MAX, + .policy = nfula_cfg_policy + }, }; static const struct nfnetlink_subsystem nfulnl_subsys = { diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 1fd537ef4496..e8f8875c6884 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -374,11 +374,13 @@ static int nfnl_osf_remove_callback(struct sk_buff *skb, static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = nfnl_osf_add_callback, + .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, [OSF_MSG_REMOVE] = { .call = nfnl_osf_remove_callback, + .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index ede9252c8de1..f37a575ebd7f 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1365,17 +1365,29 @@ err_out_unlock: } static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { - [NFQNL_MSG_PACKET] = { .call_rcu = nfqnl_recv_unsupp, - .attr_count = NFQA_MAX, }, - [NFQNL_MSG_VERDICT] = { .call_rcu = nfqnl_recv_verdict, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_policy }, - [NFQNL_MSG_CONFIG] = 
{ .call = nfqnl_recv_config, - .attr_count = NFQA_CFG_MAX, - .policy = nfqa_cfg_policy }, - [NFQNL_MSG_VERDICT_BATCH]={ .call_rcu = nfqnl_recv_verdict_batch, - .attr_count = NFQA_MAX, - .policy = nfqa_verdict_batch_policy }, + [NFQNL_MSG_PACKET] = { + .call = nfqnl_recv_unsupp, + .type = NFNL_CB_RCU, + .attr_count = NFQA_MAX, + }, + [NFQNL_MSG_VERDICT] = { + .call = nfqnl_recv_verdict, + .type = NFNL_CB_RCU, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_policy + }, + [NFQNL_MSG_CONFIG] = { + .call = nfqnl_recv_config, + .type = NFNL_CB_MUTEX, + .attr_count = NFQA_CFG_MAX, + .policy = nfqa_cfg_policy + }, + [NFQNL_MSG_VERDICT_BATCH] = { + .call = nfqnl_recv_verdict_batch, + .type = NFNL_CB_RCU, + .attr_count = NFQA_MAX, + .policy = nfqa_verdict_batch_policy + }, }; static const struct nfnetlink_subsystem nfqnl_subsys = { diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 4c0657245d5a..5415ab14400d 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -698,9 +698,12 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { }; static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { - [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu, - .attr_count = NFTA_COMPAT_MAX, - .policy = nfnl_compat_policy_get }, + [NFNL_MSG_COMPAT_GET] = { + .call = nfnl_compat_get_rcu, + .type = NFNL_CB_RCU, + .attr_count = NFTA_COMPAT_MAX, + .policy = nfnl_compat_policy_get + }, }; static const struct nfnetlink_subsystem nfnl_compat_subsys = { -- cgit v1.2.3 From 47a6959fa331fe892a4fc3b48ca08e92045c6bda Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 26 Apr 2021 12:14:40 +0200 Subject: netfilter: allow to turn off xtables compat layer The compat layer needs to parse untrusted input (the ruleset) to translate it to a 64bit compatible format. We had a number of bugs in this department in the past, so allow users to turn this feature off. 
Add CONFIG_NETFILTER_XTABLES_COMPAT kconfig knob and make it default to y to keep existing behaviour. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 12 ++++++------ include/linux/netfilter_arp/arp_tables.h | 2 +- include/linux/netfilter_ipv4/ip_tables.h | 2 +- include/linux/netfilter_ipv6/ip6_tables.h | 2 +- net/bridge/netfilter/ebt_limit.c | 4 ++-- net/bridge/netfilter/ebt_mark.c | 4 ++-- net/bridge/netfilter/ebt_mark_m.c | 4 ++-- net/bridge/netfilter/ebtables.c | 12 ++++++------ net/ipv4/netfilter/arp_tables.c | 16 ++++++++-------- net/ipv4/netfilter/ip_tables.c | 16 ++++++++-------- net/ipv4/netfilter/ipt_CLUSTERIP.c | 8 ++++---- net/ipv6/netfilter/ip6_tables.c | 16 ++++++++-------- net/netfilter/Kconfig | 10 ++++++++++ net/netfilter/x_tables.c | 16 ++++++++-------- net/netfilter/xt_limit.c | 6 +++--- 15 files changed, 70 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index a52cc22f806a..07c6ad8f2a02 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -158,7 +158,7 @@ struct xt_match { /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); @@ -169,7 +169,7 @@ struct xt_match { const char *table; unsigned int matchsize; unsigned int usersize; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; @@ -199,7 +199,7 @@ struct xt_target { /* Called when entry of this type deleted. 
*/ void (*destroy)(const struct xt_tgdtor_param *); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); @@ -210,7 +210,7 @@ struct xt_target { const char *table; unsigned int targetsize; unsigned int usersize; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; @@ -452,7 +452,7 @@ xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_xt_entry_match { @@ -533,5 +533,5 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ #endif /* _X_TABLES_H */ diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index a0474b4e7782..2aab9612f6ab 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -59,7 +59,7 @@ extern unsigned int arpt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_arpt_entry { diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h index 0fdab3246ef5..8d09bfe850dc 100644 --- a/include/linux/netfilter_ipv4/ip_tables.h +++ b/include/linux/netfilter_ipv4/ip_tables.h @@ -67,7 +67,7 @@ extern unsigned int ipt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_ipt_entry { diff --git a/include/linux/netfilter_ipv6/ip6_tables.h
b/include/linux/netfilter_ipv6/ip6_tables.h index 11d0e725fe79..79e73fd7d965 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -33,7 +33,7 @@ extern unsigned int ip6t_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_ip6t_entry { diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c index fa199556e122..e16183bd1bb8 100644 --- a/net/bridge/netfilter/ebt_limit.c +++ b/net/bridge/netfilter/ebt_limit.c @@ -87,7 +87,7 @@ static int ebt_limit_mt_check(const struct xt_mtchk_param *par) } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* * no conversion function needed -- * only avg/burst have meaningful values in userspace. @@ -107,7 +107,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = { .checkentry = ebt_limit_mt_check, .matchsize = sizeof(struct ebt_limit_info), .usersize = offsetof(struct ebt_limit_info, prev), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct ebt_compat_limit_info), #endif .me = THIS_MODULE, diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c index 21fd3d3d77f6..8cf653c72fd8 100644 --- a/net/bridge/netfilter/ebt_mark.c +++ b/net/bridge/netfilter/ebt_mark.c @@ -53,7 +53,7 @@ static int ebt_mark_tg_check(const struct xt_tgchk_param *par) return -EINVAL; return 0; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ebt_mark_t_info { compat_ulong_t mark; compat_uint_t target; @@ -87,7 +87,7 @@ static struct xt_target ebt_mark_tg_reg __read_mostly = { .target = ebt_mark_tg, .checkentry = ebt_mark_tg_check, .targetsize = sizeof(struct ebt_mark_t_info), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ebt_mark_t_info), .compat_from_user = mark_tg_compat_from_user, .compat_to_user =
mark_tg_compat_to_user, diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c index 81fb59dec499..5872e73c741e 100644 --- a/net/bridge/netfilter/ebt_mark_m.c +++ b/net/bridge/netfilter/ebt_mark_m.c @@ -37,7 +37,7 @@ static int ebt_mark_mt_check(const struct xt_mtchk_param *par) } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ebt_mark_m_info { compat_ulong_t mark, mask; uint8_t invert, bitmask; @@ -75,7 +75,7 @@ static struct xt_match ebt_mark_mt_reg __read_mostly = { .match = ebt_mark_mt, .checkentry = ebt_mark_mt_check, .matchsize = sizeof(struct ebt_mark_m_info), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ebt_mark_m_info), .compat_from_user = mark_mt_compat_from_user, .compat_to_user = mark_mt_compat_to_user, diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index a04596bb2a6e..f022deb3721e 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -47,7 +47,7 @@ struct ebt_pernet { static unsigned int ebt_pernet_id __read_mostly; static DEFINE_MUTEX(ebt_mutex); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void ebt_standard_compat_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -73,7 +73,7 @@ static struct xt_target ebt_standard_target = { .revision = 0, .family = NFPROTO_BRIDGE, .targetsize = sizeof(int), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = ebt_standard_compat_from_user, .compat_to_user = ebt_standard_compat_to_user, @@ -1502,7 +1502,7 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user, ebt_entry_to_user, entries, tmp.entries); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* 32 bit-userspace compatibility definitions. 
*/ struct compat_ebt_replace { char name[EBT_TABLE_MAXNAMELEN]; @@ -2367,7 +2367,7 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* try real handler in case userland supplied needed padding */ if (in_compat_syscall() && ((cmd != EBT_SO_GET_INFO && cmd != EBT_SO_GET_INIT_INFO) || @@ -2434,7 +2434,7 @@ static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, switch (cmd) { case EBT_SO_SET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(net, arg, len); else @@ -2442,7 +2442,7 @@ static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, ret = do_replace(net, arg, len); break; case EBT_SO_SET_COUNTERS: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_update_counters(net, arg, len); else diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b1bb6a7e2dd7..cf20316094d0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -713,7 +713,7 @@ static int copy_entries_to_user(unsigned int total_size, return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -800,7 +800,7 @@ static int get_info(struct net *net, void __user *user, const int *len) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(NFPROTO_ARP); #endif @@ -808,7 +808,7 @@ static int get_info(struct net *net, void __user *user, const int *len) if (!IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if 
(in_compat_syscall()) { @@ -835,7 +835,7 @@ static int get_info(struct net *net, void __user *user, const int *len) module_put(t->me); } else ret = PTR_ERR(t); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(NFPROTO_ARP); #endif @@ -1044,7 +1044,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_arpt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; @@ -1412,7 +1412,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, switch (cmd) { case ARPT_SO_SET_REPLACE: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else @@ -1444,7 +1444,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len break; case ARPT_SO_GET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else @@ -1580,7 +1580,7 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_ARP, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d6caaed5dd45..13acb687c19a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -868,7 +868,7 @@ copy_entries_to_user(unsigned int total_size, return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -957,7 +957,7 @@ static int get_info(struct net *net, void __user *user, const int *len) return -EFAULT; 
name[XT_TABLE_MAXNAMELEN-1] = '\0'; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET); #endif @@ -965,7 +965,7 @@ static int get_info(struct net *net, void __user *user, const int *len) if (!IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info tmp; if (in_compat_syscall()) { @@ -993,7 +993,7 @@ static int get_info(struct net *net, void __user *user, const int *len) module_put(t->me); } else ret = PTR_ERR(t); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET); #endif @@ -1199,7 +1199,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; @@ -1621,7 +1621,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) switch (cmd) { case IPT_SO_SET_REPLACE: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else @@ -1654,7 +1654,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) break; case IPT_SO_GET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else @@ -1846,7 +1846,7 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV4, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index a8b980ad11d4..8f7ca67475b7 100644 --- 
a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -541,7 +541,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) nf_ct_netns_put(par->net, par->family); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_clusterip_tgt_info { u_int32_t flags; @@ -553,7 +553,7 @@ struct compat_ipt_clusterip_tgt_info u_int32_t hash_initval; compat_uptr_t config; }; -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", @@ -563,9 +563,9 @@ static struct xt_target clusterip_tg_reg __read_mostly = { .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), .usersize = offsetof(struct ipt_clusterip_tgt_info, config), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ .me = THIS_MODULE }; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e763716ffa25..e810a23baf99 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -884,7 +884,7 @@ copy_entries_to_user(unsigned int total_size, return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -973,7 +973,7 @@ static int get_info(struct net *net, void __user *user, const int *len) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_lock(AF_INET6); #endif @@ -981,7 +981,7 @@ static int get_info(struct net *net, void __user *user, const int *len) if (!IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct xt_table_info 
tmp; if (in_compat_syscall()) { @@ -1009,7 +1009,7 @@ static int get_info(struct net *net, void __user *user, const int *len) module_put(t->me); } else ret = PTR_ERR(t); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) xt_compat_unlock(AF_INET6); #endif @@ -1215,7 +1215,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) return ret; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ip6t_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; @@ -1630,7 +1630,7 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len) switch (cmd) { case IP6T_SO_SET_REPLACE: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_do_replace(sock_net(sk), arg, len); else @@ -1663,7 +1663,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) break; case IP6T_SO_GET_ENTRIES: -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) ret = compat_get_entries(sock_net(sk), user, len); else @@ -1853,7 +1853,7 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV6, -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index fcd8682704c4..56a2531a3402 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -728,6 +728,16 @@ config NETFILTER_XTABLES if NETFILTER_XTABLES +config NETFILTER_XTABLES_COMPAT + bool "Netfilter Xtables 32bit support" + depends on COMPAT + default y + help + This option provides a translation layer to run 32bit arp,ip(6),ebtables + binaries on 64bit kernels. + + If unsure, say N. 
+ comment "Xtables combined modules" config NETFILTER_XT_MARK diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index ef37deff8405..84e58ee501a4 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -52,7 +52,7 @@ struct xt_af { struct mutex mutex; struct list_head match; struct list_head target; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct mutex compat_mutex; struct compat_delta *compat_tab; unsigned int number; /* number of slots in compat_tab[] */ @@ -647,7 +647,7 @@ static bool error_tg_ok(unsigned int usersize, unsigned int kernsize, return usersize == kernsize && strnlen(msg, msglen) < msglen; } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { struct xt_af *xp = &xt[af]; @@ -850,7 +850,7 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems, __alignof__(struct compat_xt_entry_match)); } EXPORT_SYMBOL(xt_compat_check_entry_offsets); -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ /** * xt_check_entry_offsets - validate arp/ip/ip6t_entry @@ -868,7 +868,7 @@ EXPORT_SYMBOL(xt_compat_check_entry_offsets); * match structures are aligned, and that the last structure ends where * the target structure begins. * - * Also see xt_compat_check_entry_offsets for CONFIG_COMPAT version. + * Also see xt_compat_check_entry_offsets for CONFIG_NETFILTER_XTABLES_COMPAT version. 
* * The arp/ip/ip6t_entry structure @base must have passed following tests: * - it must point to a valid memory location @@ -1059,7 +1059,7 @@ void *xt_copy_counters(sockptr_t arg, unsigned int len, void *mem; u64 size; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT if (in_compat_syscall()) { /* structures only differ in size due to alignment */ struct compat_xt_counters_info compat_tmp; @@ -1106,7 +1106,7 @@ void *xt_copy_counters(sockptr_t arg, unsigned int len, } EXPORT_SYMBOL_GPL(xt_copy_counters); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT int xt_compat_target_offset(const struct xt_target *target) { u_int16_t csize = target->compatsize ? : target->targetsize; @@ -1293,7 +1293,7 @@ void xt_table_unlock(struct xt_table *table) } EXPORT_SYMBOL_GPL(xt_table_unlock); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT void xt_compat_lock(u_int8_t af) { mutex_lock(&xt[af].compat_mutex); @@ -1931,7 +1931,7 @@ static int __init xt_init(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) { mutex_init(&xt[i].mutex); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT mutex_init(&xt[i].compat_mutex); xt[i].compat_tab = NULL; #endif diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index bd1dea9c7b88..24d4afb9988d 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -134,7 +134,7 @@ static void limit_mt_destroy(const struct xt_mtdtor_param *par) kfree(info->master); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_xt_rateinfo { u_int32_t avg; u_int32_t burst; @@ -176,7 +176,7 @@ static int limit_mt_compat_to_user(void __user *dst, const void *src) }; return copy_to_user(dst, &cm, sizeof(cm)) ? 
-EFAULT : 0; } -#endif /* CONFIG_COMPAT */ +#endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ static struct xt_match limit_mt_reg __read_mostly = { .name = "limit", @@ -186,7 +186,7 @@ static struct xt_match limit_mt_reg __read_mostly = { .checkentry = limit_mt_check, .destroy = limit_mt_destroy, .matchsize = sizeof(struct xt_rateinfo), -#ifdef CONFIG_COMPAT +#ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_xt_rateinfo), .compat_from_user = limit_mt_compat_from_user, .compat_to_user = limit_mt_compat_to_user, -- cgit v1.2.3 From 63fa73e2151848ed5930dfe0040c823ffe1f2cc4 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 26 Apr 2021 09:44:47 -0300 Subject: net: Fix typo in comment about ancillary data Ingo sent typo fixes for tools/ and this resulted in a warning when building the perf/core branch that will be sent upstream in the next merge window: Warning: Kernel ABI header at 'tools/perf/trace/beauty/include/linux/socket.h' differs from latest version at 'include/linux/socket.h' diff -u tools/perf/trace/beauty/include/linux/socket.h include/linux/socket.h Fix the typo on the kernel file to address this. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- include/linux/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 385894b4a8bb..b8fc5c53ba6f 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -85,7 +85,7 @@ struct mmsghdr { /* * POSIX 1003.1g - ancillary data object information - * Ancillary data consits of a sequence of pairs of + * Ancillary data consists of a sequence of pairs of * (cmsghdr, cmsg_data[]) */ -- cgit v1.2.3 From aaa31047a6d25da0fa101da1ed544e1247949b40 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 27 Apr 2021 18:05:55 +0200 Subject: netfilter: nftables: add catch-all set element support This patch extends the set infrastructure to add a special catch-all set element. If the lookup fails to find an element (or range) in the set, then the catch-all element is selected. Users can specify a mapping, expression(s) and timeout to be attached to the catch-all element. This patch adds a catchall list to the set, this list might contain more than one single catch-all element (e.g. in case that the catch-all element is removed and a new one is added in the same transaction). However, most of the time, there will be either one element or no elements at all in this list. The catch-all element is identified via NFT_SET_ELEM_CATCHALL flag and such special element has no NFTA_SET_ELEM_KEY attribute. There is a new nft_set_elem_catchall object that stores a reference to the dummy catch-all element (catchall->elem) whose layout is the same as the set element type to reuse the existing set element codebase. The set size does not apply to the catch-all element, users can define a catch-all element even if the set is full. The check for valid set element flags has been updated to report EOPNOTSUPP in case userspace requests flags that are not supported when using new userspace nftables and old kernel.
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 + include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nf_tables_api.c | 480 +++++++++++++++++++++++++++---- net/netfilter/nft_lookup.c | 12 +- net/netfilter/nft_objref.c | 11 +- net/netfilter/nft_set_hash.c | 6 + net/netfilter/nft_set_pipapo.c | 6 +- net/netfilter/nft_set_rbtree.c | 6 + 8 files changed, 465 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index eb708b77c4a5..27eeb613bb4e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -497,6 +497,7 @@ struct nft_set { u8 dlen; u8 num_exprs; struct nft_expr *exprs[NFT_SET_EXPR_MAX]; + struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; @@ -522,6 +523,10 @@ struct nft_set *nft_set_lookup_global(const struct net *net, const struct nlattr *nla_set_id, u8 genmask); +struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, + const struct nft_set *set); +void *nft_set_catchall_gc(const struct nft_set *set); + static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { return set->gc_int ? 
msecs_to_jiffies(set->gc_int) : HZ; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 467365ed59a7..1fb4ca18ffbb 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -398,9 +398,11 @@ enum nft_set_attributes { * enum nft_set_elem_flags - nf_tables set element flags * * @NFT_SET_ELEM_INTERVAL_END: element ends the previous interval + * @NFT_SET_ELEM_CATCHALL: special catch-all element */ enum nft_set_elem_flags { NFT_SET_ELEM_INTERVAL_END = 0x1, + NFT_SET_ELEM_CATCHALL = 0x2, }; /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index faf0424375e8..0b7fe0a902ff 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4389,6 +4389,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, } INIT_LIST_HEAD(&set->bindings); + INIT_LIST_HEAD(&set->catchall_list); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -4434,6 +4435,24 @@ err_set_name: return err; } +struct nft_set_elem_catchall { + struct list_head list; + struct rcu_head rcu; + void *elem; +}; + +static void nft_set_catchall_destroy(const struct nft_ctx *ctx, + struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_del_rcu(&catchall->list); + nft_set_elem_destroy(set, catchall->elem, true); + kfree_rcu(catchall); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { int i; @@ -4445,6 +4464,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) nft_expr_destroy(ctx, set->exprs[i]); set->ops->destroy(set); + nft_set_catchall_destroy(ctx, set); kfree(set->name); kvfree(set); } @@ -4521,6 +4541,29 @@ static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx, return nft_setelem_data_validate(ctx, set, elem); } +static int nft_set_catchall_bind_check(const struct 
nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; + ret = nft_setelem_data_validate(ctx, set, &elem); + if (ret < 0) + break; + } + + return ret; +} + int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { @@ -4550,6 +4593,9 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, iter.fn = nf_tables_bind_check_setelem; set->ops->walk(ctx, set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_bind_check(ctx, set); + if (iter.err < 0) return iter.err; } @@ -4736,7 +4782,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nest == NULL) goto nla_put_failure; - if (nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext), + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) && + nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext), NFT_DATA_VALUE, set->klen) < 0) goto nla_put_failure; @@ -4825,6 +4872,29 @@ struct nft_set_dump_ctx { struct nft_ctx ctx; }; +static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb, + const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_cur(net); + struct nft_set_elem elem; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask) || + nft_set_elem_expired(ext)) + continue; + + elem.priv = catchall->elem; + ret = nf_tables_fill_setelem(skb, set, &elem); + break; + } + + return ret; +} + static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { struct nft_set_dump_ctx *dump_ctx = cb->data; @@ 
-4889,6 +4959,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) args.iter.err = 0; args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&dump_ctx->ctx, set, &args.iter); + + if (!args.iter.err && args.iter.count == cb->args[0]) + args.iter.err = nft_set_catchall_dump(net, skb, set); rcu_read_unlock(); nla_nest_end(skb, nest); @@ -4968,8 +5041,8 @@ static int nft_setelem_parse_flags(const struct nft_set *set, return 0; *flags = ntohl(nla_get_be32(attr)); - if (*flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; + if (*flags & ~(NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL)) + return -EOPNOTSUPP; if (!(set->flags & NFT_SET_INTERVAL) && *flags & NFT_SET_ELEM_INTERVAL_END) return -EINVAL; @@ -5014,6 +5087,46 @@ static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, return 0; } +static void *nft_setelem_catchall_get(const struct net *net, + const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_cur(net); + struct nft_set_ext *ext; + void *priv = NULL; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask) || + nft_set_elem_expired(ext)) + continue; + + priv = catchall->elem; + break; + } + + return priv; +} + +static int nft_setelem_get(struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_elem *elem, u32 flags) +{ + void *priv; + + if (!(flags & NFT_SET_ELEM_CATCHALL)) { + priv = set->ops->get(ctx->net, set, elem, flags); + if (IS_ERR(priv)) + return PTR_ERR(priv); + } else { + priv = nft_setelem_catchall_get(ctx->net, set); + if (!priv) + return -ENOENT; + } + elem->priv = priv; + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -5021,7 +5134,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_elem elem; struct sk_buff *skb; uint32_t flags = 0; - void 
*priv; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, @@ -5029,17 +5141,19 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (!nla[NFTA_SET_ELEM_KEY]) - return -EINVAL; - err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; - err = nft_setelem_parse_key(ctx, set, &elem.key.val, - nla[NFTA_SET_ELEM_KEY]); - if (err < 0) - return err; + if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + return -EINVAL; + + if (nla[NFTA_SET_ELEM_KEY]) { + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + return err; + } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, @@ -5048,11 +5162,9 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, return err; } - priv = set->ops->get(ctx->net, set, &elem, flags); - if (IS_ERR(priv)) - return PTR_ERR(priv); - - elem.priv = priv; + err = nft_setelem_get(ctx, set, &elem, flags); + if (err < 0) + return err; err = -ENOMEM; skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); @@ -5212,7 +5324,8 @@ void *nft_set_elem_init(const struct nft_set *set, ext = nft_set_elem_ext(set, elem); nft_set_ext_init(ext, tmpl); - memcpy(nft_set_ext_key(ext), key, set->klen); + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY)) + memcpy(nft_set_ext_key(ext), key, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) memcpy(nft_set_ext_key_end(ext), key_end, set->klen); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) @@ -5343,6 +5456,192 @@ err_elem_expr_setup: return -ENOMEM; } +struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, + const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_cur(net); + struct nft_set_ext *ext; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (nft_set_elem_active(ext, genmask) 
&& + !nft_set_elem_expired(ext)) + return ext; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); + +void *nft_set_catchall_gc(const struct nft_set *set) +{ + struct nft_set_elem_catchall *catchall, *next; + struct nft_set_ext *ext; + void *elem = NULL; + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext) || + nft_set_elem_mark_busy(ext)) + continue; + + elem = catchall->elem; + list_del_rcu(&catchall->list); + kfree_rcu(catchall, rcu); + break; + } + + return elem; +} +EXPORT_SYMBOL_GPL(nft_set_catchall_gc); + +static int nft_setelem_catchall_insert(const struct net *net, + struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **pext) +{ + struct nft_set_elem_catchall *catchall; + u8 genmask = nft_genmask_next(net); + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (nft_set_elem_active(ext, genmask)) { + *pext = ext; + return -EEXIST; + } + } + + catchall = kmalloc(sizeof(*catchall), GFP_KERNEL); + if (!catchall) + return -ENOMEM; + + catchall->elem = elem->priv; + list_add_tail_rcu(&catchall->list, &set->catchall_list); + + return 0; +} + +static int nft_setelem_insert(const struct net *net, + struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **ext, unsigned int flags) +{ + int ret; + + if (flags & NFT_SET_ELEM_CATCHALL) + ret = nft_setelem_catchall_insert(net, set, elem, ext); + else + ret = set->ops->insert(net, set, elem, ext); + + return ret; +} + +static bool nft_setelem_is_catchall(const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL) + return true; + + return false; +} + +static void nft_setelem_activate(struct net *net, 
struct nft_set *set, + struct nft_set_elem *elem) +{ + struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + if (nft_setelem_is_catchall(set, elem)) { + nft_set_elem_change_active(net, set, ext); + nft_set_elem_clear_busy(ext); + } else { + set->ops->activate(net, set, elem); + } +} + +static int nft_setelem_catchall_deactivate(const struct net *net, + struct nft_set *set, + struct nft_set_elem *elem) +{ + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_is_active(net, ext) || + nft_set_elem_mark_busy(ext)) + continue; + + kfree(elem->priv); + elem->priv = catchall->elem; + nft_set_elem_change_active(net, set, ext); + return 0; + } + + return -ENOENT; +} + +static int __nft_setelem_deactivate(const struct net *net, + struct nft_set *set, + struct nft_set_elem *elem) +{ + void *priv; + + priv = set->ops->deactivate(net, set, elem); + if (!priv) + return -ENOENT; + + kfree(elem->priv); + elem->priv = priv; + set->ndeact++; + + return 0; +} + +static int nft_setelem_deactivate(const struct net *net, + struct nft_set *set, + struct nft_set_elem *elem, u32 flags) +{ + int ret; + + if (flags & NFT_SET_ELEM_CATCHALL) + ret = nft_setelem_catchall_deactivate(net, set, elem); + else + ret = __nft_setelem_deactivate(net, set, elem); + + return ret; +} + +static void nft_setelem_catchall_remove(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_set_elem_catchall *catchall, *next; + + list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { + if (catchall->elem == elem->priv) { + list_del_rcu(&catchall->list); + kfree_rcu(catchall); + break; + } + } +} + +static void nft_setelem_remove(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + if (nft_setelem_is_catchall(set, elem)) + nft_setelem_catchall_remove(net, set, elem); + 
else + set->ops->remove(net, set, elem); +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { @@ -5369,14 +5668,15 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (nla[NFTA_SET_ELEM_KEY] == NULL) - return -EINVAL; - nft_set_ext_prepare(&tmpl); err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); if (err < 0) return err; + + if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) + return -EINVAL; + if (flags != 0) nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); @@ -5481,12 +5781,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, num_exprs = set->num_exprs; } - err = nft_setelem_parse_key(ctx, set, &elem.key.val, - nla[NFTA_SET_ELEM_KEY]); - if (err < 0) - goto err_set_elem_expr; + if (nla[NFTA_SET_ELEM_KEY]) { + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + goto err_set_elem_expr; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, @@ -5603,7 +5905,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; - err = set->ops->insert(ctx->net, set, &elem, &ext2); + + err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags); if (err) { if (err == -EEXIST) { if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ @@ -5630,7 +5933,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_element_clash; } - if (set->size && + if (!(flags & NFT_SET_ELEM_CATCHALL) && set->size && !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) { err = -ENFILE; goto err_set_full; @@ -5641,7 +5944,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, return 0; err_set_full: - 
set->ops->remove(ctx->net, set, &elem); + nft_setelem_remove(ctx->net, set, &elem); err_element_clash: kfree(trans); err_elem_expr: @@ -5773,7 +6076,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, struct nft_set_ext *ext; struct nft_trans *trans; u32 flags = 0; - void *priv; int err; err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr, @@ -5781,23 +6083,26 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (err < 0) return err; - if (nla[NFTA_SET_ELEM_KEY] == NULL) + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + + if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL)) return -EINVAL; nft_set_ext_prepare(&tmpl); - err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); - if (err < 0) - return err; if (flags != 0) nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - err = nft_setelem_parse_key(ctx, set, &elem.key.val, - nla[NFTA_SET_ELEM_KEY]); - if (err < 0) - return err; + if (nla[NFTA_SET_ELEM_KEY]) { + err = nft_setelem_parse_key(ctx, set, &elem.key.val, + nla[NFTA_SET_ELEM_KEY]); + if (err < 0) + return err; - nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen); + } if (nla[NFTA_SET_ELEM_KEY_END]) { err = nft_setelem_parse_key(ctx, set, &elem.key_end.val, @@ -5823,13 +6128,9 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (trans == NULL) goto fail_trans; - priv = set->ops->deactivate(ctx->net, set, &elem); - if (priv == NULL) { - err = -ENOENT; + err = nft_setelem_deactivate(ctx->net, set, &elem, flags); + if (err < 0) goto fail_ops; - } - kfree(elem.priv); - elem.priv = priv; nft_setelem_data_deactivate(ctx->net, set, &elem); @@ -5876,6 +6177,49 @@ err1: return err; } +static int __nft_set_catchall_flush(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_elem *elem) +{ + struct nft_trans *trans; + + trans = 
nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, + sizeof(struct nft_trans_elem), GFP_KERNEL); + if (!trans) + return -ENOMEM; + + nft_setelem_data_deactivate(ctx->net, set, elem); + nft_trans_elem_set(trans) = set; + nft_trans_elem(trans) = *elem; + nft_trans_commit_list_add_tail(ctx->net, trans); + + return 0; +} + +static int nft_set_catchall_flush(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_elem elem; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask) || + nft_set_elem_mark_busy(ext)) + continue; + + elem.priv = catchall->elem; + ret = __nft_set_catchall_flush(ctx, set, &elem); + if (ret < 0) + break; + } + + return ret; +} + static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) { struct nft_set_iter iter = { @@ -5884,6 +6228,8 @@ static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask) }; set->ops->walk(ctx, set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_flush(ctx, set); return iter.err; } @@ -5918,8 +6264,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb, err = nft_del_setelem(&ctx, set, attr); if (err < 0) break; - - set->ndeact++; } return err; } @@ -8270,7 +8614,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) case NFT_MSG_NEWSETELEM: te = (struct nft_trans_elem *)trans->data; - te->set->ops->activate(net, te->set, &te->elem); + nft_setelem_activate(net, te->set, &te->elem); nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_NEWSETELEM, 0); @@ -8282,9 +8626,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, NFT_MSG_DELSETELEM, 0); - te->set->ops->remove(net, te->set, &te->elem); - atomic_dec(&te->set->nelems); - 
te->set->ndeact--; + nft_setelem_remove(net, te->set, &te->elem); + if (!nft_setelem_is_catchall(te->set, &te->elem)) { + atomic_dec(&te->set->nelems); + te->set->ndeact--; + } break; case NFT_MSG_NEWOBJ: if (nft_trans_obj_update(trans)) { @@ -8485,15 +8831,17 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; } te = (struct nft_trans_elem *)trans->data; - te->set->ops->remove(net, te->set, &te->elem); - atomic_dec(&te->set->nelems); + nft_setelem_remove(net, te->set, &te->elem); + if (!nft_setelem_is_catchall(te->set, &te->elem)) + atomic_dec(&te->set->nelems); break; case NFT_MSG_DELSETELEM: te = (struct nft_trans_elem *)trans->data; nft_setelem_data_activate(net, te->set, &te->elem); - te->set->ops->activate(net, te->set, &te->elem); - te->set->ndeact--; + nft_setelem_activate(net, te->set, &te->elem); + if (!nft_setelem_is_catchall(te->set, &te->elem)) + te->set->ndeact--; nft_trans_destroy(trans); break; @@ -8672,6 +9020,27 @@ static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, return nft_check_loops(ctx, ext); } +static int nft_set_catchall_loops(const struct nft_ctx *ctx, + struct nft_set *set) +{ + u8 genmask = nft_genmask_next(ctx->net); + struct nft_set_elem_catchall *catchall; + struct nft_set_ext *ext; + int ret = 0; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (!nft_set_elem_active(ext, genmask)) + continue; + + ret = nft_check_loops(ctx, ext); + if (ret < 0) + return ret; + } + + return ret; +} + static int nf_tables_check_loops(const struct nft_ctx *ctx, const struct nft_chain *chain) { @@ -8731,6 +9100,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, iter.fn = nf_tables_loop_check_setelem; set->ops->walk(ctx, set, &iter); + if (!iter.err) + iter.err = nft_set_catchall_loops(ctx, set); + if (iter.err < 0) return iter.err; } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 
b0f558b4fea5..a479f8a1270c 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -30,13 +30,17 @@ void nft_lookup_eval(const struct nft_expr *expr, const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; const struct nft_set_ext *ext = NULL; + const struct net *net = nft_net(pkt); bool found; - found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg], - &ext) ^ priv->invert; + found = set->ops->lookup(net, set, &regs->data[priv->sreg], &ext) ^ + priv->invert; if (!found) { - regs->verdict.code = NFT_BREAK; - return; + ext = nft_set_catchall_lookup(net, set); + if (!ext) { + regs->verdict.code = NFT_BREAK; + return; + } } if (ext) { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index bc104d36d3bb..7e47edee88ee 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -105,15 +105,18 @@ static void nft_objref_map_eval(const struct nft_expr *expr, { struct nft_objref_map *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; + struct net *net = nft_net(pkt); const struct nft_set_ext *ext; struct nft_object *obj; bool found; - found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg], - &ext); + found = set->ops->lookup(net, set, &regs->data[priv->sreg], &ext); if (!found) { - regs->verdict.code = NFT_BREAK; - return; + ext = nft_set_catchall_lookup(net, set); + if (!ext) { + regs->verdict.code = NFT_BREAK; + return; + } } obj = *nft_set_ext_obj(ext); obj->ops->eval(obj, regs, pkt); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index bf618b7ec1ae..58f576abcd4a 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -350,6 +350,12 @@ needs_gc_run: rhashtable_walk_stop(&hti); rhashtable_walk_exit(&hti); + he = nft_set_catchall_gc(set); + if (he) { + gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); + if (gcb) + nft_set_gc_batch_add(gcb, he); + } nft_set_gc_batch_complete(gcb);
queue_delayed_work(system_power_efficient_wq, &priv->gc_work, nft_set_gc_interval(set)); diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 9944523f5c2c..528a2d7ca991 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1529,11 +1529,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) { struct nft_pipapo *priv = nft_set_priv(set); int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; struct nft_pipapo_field *f; - struct nft_pipapo_elem *e; int i, start, rules_fx; start = first_rule; @@ -1569,6 +1569,10 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) } } + e = nft_set_catchall_gc(set); + if (e) + nft_set_elem_destroy(set, e, true); + priv->last_gc = jiffies; } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 217ab3644c25..9e36eb4a7429 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -541,6 +541,12 @@ static void nft_rbtree_gc(struct work_struct *work) write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); + rbe = nft_set_catchall_gc(set); + if (rbe) { + gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); + if (gcb) + nft_set_gc_batch_add(gcb, rbe); + } nft_set_gc_batch_complete(gcb); queue_delayed_work(system_power_efficient_wq, &priv->gc_work, -- cgit v1.2.3 From 99014088156cd78867d19514a0bc771c4b86b93b Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sun, 25 Apr 2021 17:27:35 +0200 Subject: net: bridge: mcast: fix broken length + header check for MRDv6 Adv. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The IPv6 Multicast Router Advertisements parsing has the following two issues: For one thing, ICMPv6 MRD Advertisements are smaller than ICMPv6 MLD messages (ICMPv6 MRD Adv.: 8 bytes vs. 
ICMPv6 MLDv1/2: >= 24 bytes, assuming MLDv2 Reports with at least one multicast address entry). When ipv6_mc_check_mld_msg() tries to parse an Multicast Router Advertisement its MLD length check will fail - and it will wrongly return -EINVAL, even if we have a valid MRD Advertisement. With the returned -EINVAL the bridge code will assume a broken packet and will wrongly discard it, potentially leading to multicast packet loss towards multicast routers. The second issue is the MRD header parsing in br_ip6_multicast_mrd_rcv(): It wrongly checks for an ICMPv6 header immediately after the IPv6 header (IPv6 next header type). However according to RFC4286, section 2 all MRD messages contain a Router Alert option (just like MLD). So instead there is an IPv6 Hop-by-Hop option for the Router Alert between the IPv6 and ICMPv6 header, again leading to the bridge wrongly discarding Multicast Router Advertisements. To fix these two issues, introduce a new return value -ENODATA to ipv6_mc_check_mld() to indicate a valid ICMPv6 packet with a hop-by-hop option which is not an MLD but potentially an MRD packet. This also simplifies further parsing in the bridge code, as ipv6_mc_check_mld() already fully checks the ICMPv6 header and hop-by-hop option. These issues were found and fixed with the help of the mrdisc tool (https://github.com/troglobit/mrdisc). Fixes: 4b3087c7e37f ("bridge: Snoop Multicast Router Advertisements") Signed-off-by: Linus Lüssing Signed-off-by: David S. 
Miller --- include/net/addrconf.h | 1 - net/bridge/br_multicast.c | 33 ++++++++------------------------- net/ipv6/mcast_snoop.c | 12 +++++++----- 3 files changed, 15 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 18f783dcd55f..78ea3e332688 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -233,7 +233,6 @@ void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); -int ipv6_mc_check_icmpv6(struct sk_buff *skb); int ipv6_mc_check_mld(struct sk_buff *skb); void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 2883601d5c8b..226bb05c3b42 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -3159,25 +3159,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_mrd_rcv(struct net_bridge *br, - struct net_bridge_port *port, - struct sk_buff *skb) +static void br_ip6_multicast_mrd_rcv(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) { - int ret; - - if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) - return -ENOMSG; - - ret = ipv6_mc_check_icmpv6(skb); - if (ret < 0) - return ret; - if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) - return -ENOMSG; + return; br_multicast_mark_router(br, port); - - return 0; } static int br_multicast_ipv6_rcv(struct net_bridge *br, @@ -3191,18 +3180,12 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, err = ipv6_mc_check_mld(skb); - if (err == -ENOMSG) { + if (err == -ENOMSG || err == -ENODATA) { if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - - if (ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) { - err = br_ip6_multicast_mrd_rcv(br, port, skb); - - if (err < 0 
&& err != -ENOMSG) { - br_multicast_err_count(br, port, skb->protocol); - return err; - } - } + if (err == -ENODATA && + ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) + br_ip6_multicast_mrd_rcv(br, port, skb); return 0; } else if (err < 0) { diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c index d3d6b6a66e5f..04d5fcdfa6e0 100644 --- a/net/ipv6/mcast_snoop.c +++ b/net/ipv6/mcast_snoop.c @@ -109,7 +109,7 @@ static int ipv6_mc_check_mld_msg(struct sk_buff *skb) struct mld_msg *mld; if (!ipv6_mc_may_pull(skb, len)) - return -EINVAL; + return -ENODATA; mld = (struct mld_msg *)skb_transport_header(skb); @@ -122,7 +122,7 @@ static int ipv6_mc_check_mld_msg(struct sk_buff *skb) case ICMPV6_MGM_QUERY: return ipv6_mc_check_mld_query(skb); default: - return -ENOMSG; + return -ENODATA; } } @@ -131,7 +131,7 @@ static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); } -int ipv6_mc_check_icmpv6(struct sk_buff *skb) +static int ipv6_mc_check_icmpv6(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct icmp6hdr); unsigned int transport_len = ipv6_transport_len(skb); @@ -150,7 +150,6 @@ int ipv6_mc_check_icmpv6(struct sk_buff *skb) return 0; } -EXPORT_SYMBOL(ipv6_mc_check_icmpv6); /** * ipv6_mc_check_mld - checks whether this is a sane MLD packet @@ -161,7 +160,10 @@ EXPORT_SYMBOL(ipv6_mc_check_icmpv6); * * -EINVAL: A broken packet was detected, i.e. it violates some internet * standard - * -ENOMSG: IP header validation succeeded but it is not an MLD packet. + * -ENOMSG: IP header validation succeeded but it is not an ICMPv6 packet + * with a hop-by-hop option. + * -ENODATA: IP+ICMPv6 header with hop-by-hop option validation succeeded + * but it is not an MLD packet. * -ENOMEM: A memory allocation failure happened. 
* * Caller needs to set the skb network header and free any returned skb if it -- cgit v1.2.3 From cf536ea3c7eefb26082836eb7f930b293dd38345 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 27 Apr 2021 12:21:58 +0800 Subject: net: dsa: no longer identify PTP packet in core driver Move ptp_classify_raw out of dsa core driver for handling tx timestamp request. Let device drivers do this if they want. Not all drivers want to limit tx timestamping for only PTP packet. Signed-off-by: Yangbo Lu Tested-by: Kurt Kanzenbach Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c | 7 ++++++- drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h | 2 +- drivers/net/dsa/mv88e6xxx/hwtstamp.c | 7 ++++++- drivers/net/dsa/mv88e6xxx/hwtstamp.h | 5 ++--- drivers/net/dsa/ocelot/felix.c | 2 +- drivers/net/dsa/sja1105/sja1105_ptp.c | 3 +-- drivers/net/dsa/sja1105/sja1105_ptp.h | 2 +- include/net/dsa.h | 2 +- net/dsa/slave.c | 12 ++---------- 9 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c index 6ba5e2333066..5b2e023468fe 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c +++ b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c @@ -374,14 +374,19 @@ long hellcreek_hwtstamp_work(struct ptp_clock_info *ptp) } bool hellcreek_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone, unsigned int type) + struct sk_buff *clone) { struct hellcreek *hellcreek = ds->priv; struct hellcreek_port_hwtstamp *ps; struct ptp_header *hdr; + unsigned int type; ps = &hellcreek->ports[port].port_hwtstamp; + type = ptp_classify_raw(clone); + if (type == PTP_CLASS_NONE) + return false; + /* Make sure the message is a PTP message that needs to be timestamped * and the interaction with the HW timestamping is enabled. 
If not, stop * here diff --git a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h index c0745ffa1ebb..728cd5dc650f 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h +++ b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h @@ -45,7 +45,7 @@ int hellcreek_port_hwtstamp_get(struct dsa_switch *ds, int port, bool hellcreek_port_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone, unsigned int type); bool hellcreek_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone, unsigned int type); + struct sk_buff *clone); int hellcreek_get_ts_info(struct dsa_switch *ds, int port, struct ethtool_ts_info *info); diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.c b/drivers/net/dsa/mv88e6xxx/hwtstamp.c index 05ca1d3c6498..79514a54d903 100644 --- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c +++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c @@ -469,11 +469,16 @@ long mv88e6xxx_hwtstamp_work(struct ptp_clock_info *ptp) } bool mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone, unsigned int type) + struct sk_buff *clone) { struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_port_hwtstamp *ps = &chip->port_hwtstamp[port]; struct ptp_header *hdr; + unsigned int type; + + type = ptp_classify_raw(clone); + if (type == PTP_CLASS_NONE) + return false; hdr = mv88e6xxx_should_tstamp(chip, port, clone, type); if (!hdr) diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.h b/drivers/net/dsa/mv88e6xxx/hwtstamp.h index 9da9f197ba02..91fbc7838fc8 100644 --- a/drivers/net/dsa/mv88e6xxx/hwtstamp.h +++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.h @@ -118,7 +118,7 @@ int mv88e6xxx_port_hwtstamp_get(struct dsa_switch *ds, int port, bool mv88e6xxx_port_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone, unsigned int type); bool mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone, unsigned int type); + struct sk_buff *clone); int mv88e6xxx_get_ts_info(struct dsa_switch 
*ds, int port, struct ethtool_ts_info *info); @@ -152,8 +152,7 @@ static inline bool mv88e6xxx_port_rxtstamp(struct dsa_switch *ds, int port, } static inline bool mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone, - unsigned int type) + struct sk_buff *clone) { return false; } diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index 1379f86d71ec..d679f023dc00 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1396,7 +1396,7 @@ static bool felix_rxtstamp(struct dsa_switch *ds, int port, } static bool felix_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone, unsigned int type) + struct sk_buff *clone) { struct ocelot *ocelot = ds->priv; struct ocelot_port *ocelot_port = ocelot->ports[port]; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 1b90570b257b..72d052de82d8 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -435,8 +435,7 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, * the skb and have it available in DSA_SKB_CB in the .port_deferred_xmit * callback, where we will timestamp it synchronously. 
*/ -bool sja1105_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *skb, unsigned int type) +bool sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) { struct sja1105_private *priv = ds->priv; struct sja1105_port *sp = &priv->ports[port]; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h index 3daa33e98e77..c70c4729a06d 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.h +++ b/drivers/net/dsa/sja1105/sja1105_ptp.h @@ -105,7 +105,7 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); bool sja1105_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *skb, unsigned int type); + struct sk_buff *skb); int sja1105_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr); diff --git a/include/net/dsa.h b/include/net/dsa.h index 507082959aa4..905066055b08 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -741,7 +741,7 @@ struct dsa_switch_ops { int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct ifreq *ifr); bool (*port_txtstamp)(struct dsa_switch *ds, int port, - struct sk_buff *clone, unsigned int type); + struct sk_buff *clone); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index b2a802e9330e..acaa52e60d7f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -20,7 +20,6 @@ #include #include #include -#include #include "dsa_priv.h" @@ -557,15 +556,10 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, { struct dsa_switch *ds = p->dp->ds; struct sk_buff *clone; - unsigned int type; if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) return; - type = ptp_classify_raw(skb); - if (type == PTP_CLASS_NONE) - return; - if (!ds->ops->port_txtstamp) return; @@ -573,7 +567,7 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, if (!clone) return; - if (ds->ops->port_txtstamp(ds, p->dp->index, clone, 
type)) { + if (ds->ops->port_txtstamp(ds, p->dp->index, clone)) { DSA_SKB_CB(skb)->clone = clone; return; } @@ -632,9 +626,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) DSA_SKB_CB(skb)->clone = NULL; - /* Identify PTP protocol packets, clone them, and pass them to the - * switch driver - */ + /* Handle tx timestamp if any */ dsa_skb_tx_timestamp(p, skb); if (dsa_realloc_skb(skb, dev)) { -- cgit v1.2.3 From 5c5416f5d4c75fe6aba56f6c2c45a070b5e7cc78 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 27 Apr 2021 12:21:59 +0800 Subject: net: dsa: no longer clone skb in core driver It was a waste to clone skb directly in dsa_skb_tx_timestamp(). For one-step timestamping, a clone was not needed. For any failure of port_txtstamp (this may usually happen), the skb clone had to be freed. So this patch moves skb cloning for tx timestamp out of dsa core, and let drivers clone skb in port_txtstamp if they really need. Signed-off-by: Yangbo Lu Tested-by: Kurt Kanzenbach Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c | 25 +++++++++++++++---------- drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h | 4 ++-- drivers/net/dsa/mv88e6xxx/hwtstamp.c | 24 +++++++++++++++--------- drivers/net/dsa/mv88e6xxx/hwtstamp.h | 9 ++++----- drivers/net/dsa/ocelot/felix.c | 13 ++++++++----- drivers/net/dsa/sja1105/sja1105_ptp.c | 13 +++++++++---- drivers/net/dsa/sja1105/sja1105_ptp.h | 2 +- include/net/dsa.h | 4 ++-- net/dsa/slave.c | 12 +----------- 9 files changed, 57 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c index 5b2e023468fe..40b41c794dfa 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c +++ b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.c @@ -373,31 +373,38 @@ long hellcreek_hwtstamp_work(struct ptp_clock_info *ptp) return restart ? 
1 : -1; } -bool hellcreek_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone) +void hellcreek_port_txtstamp(struct dsa_switch *ds, int port, + struct sk_buff *skb) { struct hellcreek *hellcreek = ds->priv; struct hellcreek_port_hwtstamp *ps; struct ptp_header *hdr; + struct sk_buff *clone; unsigned int type; ps = &hellcreek->ports[port].port_hwtstamp; - type = ptp_classify_raw(clone); + type = ptp_classify_raw(skb); if (type == PTP_CLASS_NONE) - return false; + return; /* Make sure the message is a PTP message that needs to be timestamped * and the interaction with the HW timestamping is enabled. If not, stop * here */ - hdr = hellcreek_should_tstamp(hellcreek, port, clone, type); + hdr = hellcreek_should_tstamp(hellcreek, port, skb, type); if (!hdr) - return false; + return; + + clone = skb_clone_sk(skb); + if (!clone) + return; if (test_and_set_bit_lock(HELLCREEK_HWTSTAMP_TX_IN_PROGRESS, - &ps->state)) - return false; + &ps->state)) { + kfree_skb(clone); + return; + } ps->tx_skb = clone; @@ -407,8 +414,6 @@ bool hellcreek_port_txtstamp(struct dsa_switch *ds, int port, ps->tx_tstamp_start = jiffies; ptp_schedule_worker(hellcreek->ptp_clock, 0); - - return true; } bool hellcreek_port_rxtstamp(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h index 728cd5dc650f..71af77efb28b 100644 --- a/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h +++ b/drivers/net/dsa/hirschmann/hellcreek_hwtstamp.h @@ -44,8 +44,8 @@ int hellcreek_port_hwtstamp_get(struct dsa_switch *ds, int port, bool hellcreek_port_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone, unsigned int type); -bool hellcreek_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone); +void hellcreek_port_txtstamp(struct dsa_switch *ds, int port, + struct sk_buff *skb); int hellcreek_get_ts_info(struct dsa_switch *ds, int port, struct ethtool_ts_info *info); diff --git 
a/drivers/net/dsa/mv88e6xxx/hwtstamp.c b/drivers/net/dsa/mv88e6xxx/hwtstamp.c index 79514a54d903..8f74ffc7a279 100644 --- a/drivers/net/dsa/mv88e6xxx/hwtstamp.c +++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.c @@ -468,32 +468,38 @@ long mv88e6xxx_hwtstamp_work(struct ptp_clock_info *ptp) return restart ? 1 : -1; } -bool mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone) +void mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port, + struct sk_buff *skb) { struct mv88e6xxx_chip *chip = ds->priv; struct mv88e6xxx_port_hwtstamp *ps = &chip->port_hwtstamp[port]; struct ptp_header *hdr; + struct sk_buff *clone; unsigned int type; - type = ptp_classify_raw(clone); + type = ptp_classify_raw(skb); if (type == PTP_CLASS_NONE) - return false; + return; - hdr = mv88e6xxx_should_tstamp(chip, port, clone, type); + hdr = mv88e6xxx_should_tstamp(chip, port, skb, type); if (!hdr) - return false; + return; + + clone = skb_clone_sk(skb); + if (!clone) + return; if (test_and_set_bit_lock(MV88E6XXX_HWTSTAMP_TX_IN_PROGRESS, - &ps->state)) - return false; + &ps->state)) { + kfree_skb(clone); + return; + } ps->tx_skb = clone; ps->tx_tstamp_start = jiffies; ps->tx_seq_id = be16_to_cpu(hdr->sequence_id); ptp_schedule_worker(chip->ptp_clock, 0); - return true; } int mv88e6165_global_disable(struct mv88e6xxx_chip *chip) diff --git a/drivers/net/dsa/mv88e6xxx/hwtstamp.h b/drivers/net/dsa/mv88e6xxx/hwtstamp.h index 91fbc7838fc8..cf7fb6d660b1 100644 --- a/drivers/net/dsa/mv88e6xxx/hwtstamp.h +++ b/drivers/net/dsa/mv88e6xxx/hwtstamp.h @@ -117,8 +117,8 @@ int mv88e6xxx_port_hwtstamp_get(struct dsa_switch *ds, int port, bool mv88e6xxx_port_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *clone, unsigned int type); -bool mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone); +void mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port, + struct sk_buff *skb); int mv88e6xxx_get_ts_info(struct dsa_switch *ds, int port, struct 
ethtool_ts_info *info); @@ -151,10 +151,9 @@ static inline bool mv88e6xxx_port_rxtstamp(struct dsa_switch *ds, int port, return false; } -static inline bool mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone) +static inline void mv88e6xxx_port_txtstamp(struct dsa_switch *ds, int port, + struct sk_buff *skb) { - return false; } static inline int mv88e6xxx_get_ts_info(struct dsa_switch *ds, int port, diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index d679f023dc00..fe7e8bad90df 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1395,18 +1395,21 @@ static bool felix_rxtstamp(struct dsa_switch *ds, int port, return false; } -static bool felix_txtstamp(struct dsa_switch *ds, int port, - struct sk_buff *clone) +static void felix_txtstamp(struct dsa_switch *ds, int port, + struct sk_buff *skb) { struct ocelot *ocelot = ds->priv; struct ocelot_port *ocelot_port = ocelot->ports[port]; + struct sk_buff *clone; if (ocelot->ptp && ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { + clone = skb_clone_sk(skb); + if (!clone) + return; + ocelot_port_add_txtstamp_skb(ocelot, port, clone); - return true; + DSA_SKB_CB(skb)->clone = clone; } - - return false; } static int felix_change_mtu(struct dsa_switch *ds, int port, int new_mtu) diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 72d052de82d8..a5140084000d 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -431,19 +431,24 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, return true; } -/* Called from dsa_skb_tx_timestamp. This callback is just to make DSA clone +/* Called from dsa_skb_tx_timestamp. This callback is just to clone * the skb and have it available in DSA_SKB_CB in the .port_deferred_xmit * callback, where we will timestamp it synchronously. 
*/ -bool sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) +void sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) { struct sja1105_private *priv = ds->priv; struct sja1105_port *sp = &priv->ports[port]; + struct sk_buff *clone; if (!sp->hwts_tx_en) - return false; + return; - return true; + clone = skb_clone_sk(skb); + if (!clone) + return; + + DSA_SKB_CB(skb)->clone = clone; } static int sja1105_ptp_reset(struct dsa_switch *ds) diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.h b/drivers/net/dsa/sja1105/sja1105_ptp.h index c70c4729a06d..34f97f58a355 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.h +++ b/drivers/net/dsa/sja1105/sja1105_ptp.h @@ -104,7 +104,7 @@ void sja1105_ptp_txtstamp_skb(struct dsa_switch *ds, int slot, bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); -bool sja1105_port_txtstamp(struct dsa_switch *ds, int port, +void sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb); int sja1105_hwtstamp_get(struct dsa_switch *ds, int port, struct ifreq *ifr); diff --git a/include/net/dsa.h b/include/net/dsa.h index 905066055b08..73ce6ce38aa1 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -740,8 +740,8 @@ struct dsa_switch_ops { struct ifreq *ifr); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, struct ifreq *ifr); - bool (*port_txtstamp)(struct dsa_switch *ds, int port, - struct sk_buff *clone); + void (*port_txtstamp)(struct dsa_switch *ds, int port, + struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index acaa52e60d7f..85e51f46a9d5 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -555,7 +555,6 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, struct sk_buff *skb) { struct dsa_switch *ds = p->dp->ds; - struct sk_buff *clone; if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) 
return; @@ -563,16 +562,7 @@ static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, if (!ds->ops->port_txtstamp) return; - clone = skb_clone_sk(skb); - if (!clone) - return; - - if (ds->ops->port_txtstamp(ds, p->dp->index, clone)) { - DSA_SKB_CB(skb)->clone = clone; - return; - } - - kfree_skb(clone); + ds->ops->port_txtstamp(ds, p->dp->index, skb); } netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From c4b364ce1270d689ee5010001344b8eae3685f32 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 27 Apr 2021 12:22:00 +0800 Subject: net: dsa: free skb->cb usage in core driver Free skb->cb usage in core driver and let device drivers decide to use or not. The reason having a DSA_SKB_CB(skb)->clone was because dsa_skb_tx_timestamp() which may set the clone pointer was called before p->xmit() which would use the clone if any, and the device driver has no way to initialize the clone pointer. This patch just put memset(skb->cb, 0, sizeof(skb->cb)) at beginning of dsa_slave_xmit(). Some new features in the future, like one-step timestamp may need more bytes of skb->cb to use in dsa_skb_tx_timestamp(), and p->xmit(). Signed-off-by: Yangbo Lu Acked-by: Richard Cochran Signed-off-by: David S. 
Miller --- drivers/net/dsa/ocelot/felix.c | 2 +- drivers/net/dsa/sja1105/sja1105_main.c | 2 +- drivers/net/dsa/sja1105/sja1105_ptp.c | 4 ++-- drivers/net/ethernet/mscc/ocelot.c | 6 +++--- drivers/net/ethernet/mscc/ocelot_net.c | 2 +- include/linux/dsa/sja1105.h | 3 ++- include/net/dsa.h | 14 -------------- include/soc/mscc/ocelot.h | 8 ++++++++ net/dsa/slave.c | 2 +- net/dsa/tag_ocelot.c | 8 ++++---- net/dsa/tag_ocelot_8021q.c | 8 ++++---- 11 files changed, 27 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index fe7e8bad90df..b28280b6e91a 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1408,7 +1408,7 @@ static void felix_txtstamp(struct dsa_switch *ds, int port, return; ocelot_port_add_txtstamp_skb(ocelot, port, clone); - DSA_SKB_CB(skb)->clone = clone; + OCELOT_SKB_CB(skb)->clone = clone; } } diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index d9c198ca0197..405024b637d6 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -3137,7 +3137,7 @@ static void sja1105_port_deferred_xmit(struct kthread_work *work) struct sk_buff *skb; while ((skb = skb_dequeue(&sp->xmit_queue)) != NULL) { - struct sk_buff *clone = DSA_SKB_CB(skb)->clone; + struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone; mutex_lock(&priv->mgmt_lock); diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index a5140084000d..0bc566b9e958 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -432,7 +432,7 @@ bool sja1105_port_rxtstamp(struct dsa_switch *ds, int port, } /* Called from dsa_skb_tx_timestamp. 
This callback is just to clone - * the skb and have it available in DSA_SKB_CB in the .port_deferred_xmit + * the skb and have it available in SJA1105_SKB_CB in the .port_deferred_xmit * callback, where we will timestamp it synchronously. */ void sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) @@ -448,7 +448,7 @@ void sja1105_port_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) if (!clone) return; - DSA_SKB_CB(skb)->clone = clone; + SJA1105_SKB_CB(skb)->clone = clone; } static int sja1105_ptp_reset(struct dsa_switch *ds) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 8d06ffaf318a..7da2dd1632b1 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -538,8 +538,8 @@ void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, spin_lock(&ocelot_port->ts_id_lock); skb_shinfo(clone)->tx_flags |= SKBTX_IN_PROGRESS; - /* Store timestamp ID in cb[0] of sk_buff */ - clone->cb[0] = ocelot_port->ts_id; + /* Store timestamp ID in OCELOT_SKB_CB(clone)->ts_id */ + OCELOT_SKB_CB(clone)->ts_id = ocelot_port->ts_id; ocelot_port->ts_id = (ocelot_port->ts_id + 1) % 4; skb_queue_tail(&ocelot_port->tx_skbs, clone); @@ -604,7 +604,7 @@ void ocelot_get_txtstamp(struct ocelot *ocelot) spin_lock_irqsave(&port->tx_skbs.lock, flags); skb_queue_walk_safe(&port->tx_skbs, skb, skb_tmp) { - if (skb->cb[0] != id) + if (OCELOT_SKB_CB(skb)->ts_id != id) continue; __skb_unlink(skb, &port->tx_skbs); skb_match = skb; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 36f32a4d9b0f..789a5fba146c 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -520,7 +520,7 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) ocelot_port_add_txtstamp_skb(ocelot, port, clone); - rew_op |= clone->cb[0] << 3; + rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; } } diff 
--git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index dd93735ae228..1eb84562b311 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -47,11 +47,12 @@ struct sja1105_tagger_data { }; struct sja1105_skb_cb { + struct sk_buff *clone; u32 meta_tstamp; }; #define SJA1105_SKB_CB(skb) \ - ((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb)) + ((struct sja1105_skb_cb *)((skb)->cb)) struct sja1105_port { u16 subvlan_map[DSA_8021Q_N_SUBVLAN]; diff --git a/include/net/dsa.h b/include/net/dsa.h index 73ce6ce38aa1..e1a2610a0e06 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -117,20 +117,6 @@ struct dsa_netdevice_ops { #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) -struct dsa_skb_cb { - struct sk_buff *clone; -}; - -struct __dsa_skb_cb { - struct dsa_skb_cb cb; - u8 priv[48 - sizeof(struct dsa_skb_cb)]; -}; - -#define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb)) - -#define DSA_SKB_CB_PRIV(skb) \ - ((void *)(skb)->cb + offsetof(struct __dsa_skb_cb, priv)) - struct dsa_switch_tree { struct list_head list; diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 68cdc7ceaf4d..f075aaf70eee 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -689,6 +689,14 @@ struct ocelot_policer { u32 burst; /* bytes */ }; +struct ocelot_skb_cb { + struct sk_buff *clone; + u8 ts_id; +}; + +#define OCELOT_SKB_CB(skb) \ + ((struct ocelot_skb_cb *)((skb)->cb)) + #define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) #define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi)) #define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri)) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 85e51f46a9d5..8c0f3c6ab365 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -614,7 +614,7 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff 
*skb, struct net_device *dev) dev_sw_netstats_tx_add(dev, 1, skb->len); - DSA_SKB_CB(skb)->clone = NULL; + memset(skb->cb, 0, sizeof(skb->cb)); /* Handle tx timestamp if any */ dsa_skb_tx_timestamp(p, skb); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index f9df9cac81c5..1100a16f1032 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -15,11 +15,11 @@ static void ocelot_xmit_ptp(struct dsa_port *dp, void *injection, ocelot_port = ocelot->ports[dp->index]; rew_op = ocelot_port->ptp_cmd; - /* Retrieve timestamp ID populated inside skb->cb[0] of the - * clone by ocelot_port_add_txtstamp_skb + /* Retrieve timestamp ID populated inside OCELOT_SKB_CB(clone)->ts_id + * by ocelot_port_add_txtstamp_skb */ if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) - rew_op |= clone->cb[0] << 3; + rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; ocelot_ifh_set_rew_op(injection, rew_op); } @@ -28,7 +28,7 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, __be32 ifh_prefix, void **ifh) { struct dsa_port *dp = dsa_slave_to_port(netdev); - struct sk_buff *clone = DSA_SKB_CB(skb)->clone; + struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; struct dsa_switch *ds = dp->ds; void *injection; __be32 *prefix; diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 5f3e8e124a82..a001a7e3f575 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -28,11 +28,11 @@ static struct sk_buff *ocelot_xmit_ptp(struct dsa_port *dp, ocelot_port = ocelot->ports[port]; rew_op = ocelot_port->ptp_cmd; - /* Retrieve timestamp ID populated inside skb->cb[0] of the - * clone by ocelot_port_add_txtstamp_skb + /* Retrieve timestamp ID populated inside OCELOT_SKB_CB(clone)->ts_id + * by ocelot_port_add_txtstamp_skb */ if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) - rew_op |= clone->cb[0] << 3; + rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); @@ -46,7 +46,7 @@ static 
struct sk_buff *ocelot_xmit(struct sk_buff *skb, u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - struct sk_buff *clone = DSA_SKB_CB(skb)->clone; + struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; /* TX timestamping was requested, so inject through MMIO */ if (clone) -- cgit v1.2.3 From 682eaad93e8cfaaa439af39861ab8610eae5ff33 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 27 Apr 2021 12:22:02 +0800 Subject: net: mscc: ocelot: convert to ocelot_port_txtstamp_request() Convert to a common ocelot_port_txtstamp_request() for TX timestamp request handling. Signed-off-by: Yangbo Lu Reviewed-by: Vladimir Oltean Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/dsa/ocelot/felix.c | 15 +++++++-------- drivers/net/ethernet/mscc/ocelot.c | 24 +++++++++++++++++++++--- drivers/net/ethernet/mscc/ocelot_net.c | 18 +++++++----------- include/soc/mscc/ocelot.h | 5 +++-- 4 files changed, 38 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c index b28280b6e91a..ce607fbaaa3a 100644 --- a/drivers/net/dsa/ocelot/felix.c +++ b/drivers/net/dsa/ocelot/felix.c @@ -1399,17 +1399,16 @@ static void felix_txtstamp(struct dsa_switch *ds, int port, struct sk_buff *skb) { struct ocelot *ocelot = ds->priv; - struct ocelot_port *ocelot_port = ocelot->ports[port]; - struct sk_buff *clone; + struct sk_buff *clone = NULL; - if (ocelot->ptp && ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { - clone = skb_clone_sk(skb); - if (!clone) - return; + if (!ocelot->ptp) + return; - ocelot_port_add_txtstamp_skb(ocelot, port, clone); + if (ocelot_port_txtstamp_request(ocelot, port, skb, &clone)) + return; + + if (clone) OCELOT_SKB_CB(skb)->clone = clone; - } } static int felix_change_mtu(struct dsa_switch *ds, int port, int new_mtu) diff --git a/drivers/net/ethernet/mscc/ocelot.c 
b/drivers/net/ethernet/mscc/ocelot.c index 7da2dd1632b1..3ff4cce1ce7d 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -530,8 +530,8 @@ void ocelot_port_disable(struct ocelot *ocelot, int port) } EXPORT_SYMBOL(ocelot_port_disable); -void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, - struct sk_buff *clone) +static void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, + struct sk_buff *clone) { struct ocelot_port *ocelot_port = ocelot->ports[port]; @@ -545,7 +545,25 @@ void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, spin_unlock(&ocelot_port->ts_id_lock); } -EXPORT_SYMBOL(ocelot_port_add_txtstamp_skb); + +int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, + struct sk_buff *skb, + struct sk_buff **clone) +{ + struct ocelot_port *ocelot_port = ocelot->ports[port]; + u8 ptp_cmd = ocelot_port->ptp_cmd; + + if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { + *clone = skb_clone_sk(skb); + if (!(*clone)) + return -ENOMEM; + + ocelot_port_add_txtstamp_skb(ocelot, port, *clone); + } + + return 0; +} +EXPORT_SYMBOL(ocelot_port_txtstamp_request); static void ocelot_get_hwtimestamp(struct ocelot *ocelot, struct timespec64 *ts) diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 789a5fba146c..e99c8fb3cb15 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -507,19 +507,15 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) /* Check if timestamping is needed */ if (ocelot->ptp && (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { - rew_op = ocelot_port->ptp_cmd; + struct sk_buff *clone = NULL; - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { - struct sk_buff *clone; - - clone = skb_clone_sk(skb); - if (!clone) { - kfree_skb(skb); - return NETDEV_TX_OK; - } - - ocelot_port_add_txtstamp_skb(ocelot, port, clone); + if (ocelot_port_txtstamp_request(ocelot, 
port, skb, &clone)) { + kfree_skb(skb); + return NETDEV_TX_OK; + } + if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { + rew_op = ocelot_port->ptp_cmd; rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; } } diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index f075aaf70eee..f7632519cb9c 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -828,8 +828,9 @@ int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); -void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, - struct sk_buff *clone); +int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, + struct sk_buff *skb, + struct sk_buff **clone); void ocelot_get_txtstamp(struct ocelot *ocelot); void ocelot_port_set_maxlen(struct ocelot *ocelot, int port, size_t sdu); int ocelot_get_max_mtu(struct ocelot *ocelot, int port); -- cgit v1.2.3 From 39e5308b3250666cc92c5ca33a667698ac645bd2 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 27 Apr 2021 12:22:03 +0800 Subject: net: mscc: ocelot: support PTP Sync one-step timestamping Although HWTSTAMP_TX_ONESTEP_SYNC existed in ioctl for hardware timestamp configuration, the PTP Sync one-step timestamping had never been supported. This patch is to truly support it. - ocelot_port_txtstamp_request() This function handles tx timestamp request by storing ptp_cmd(tx timestamp type) in OCELOT_SKB_CB(skb)->ptp_cmd, and additionally for two-step timestamp storing ts_id in OCELOT_SKB_CB(clone)->ts_id. - ocelot_ptp_rew_op() During xmit, this function is called to get rew_op (rewriter option) by checking skb->cb for tx timestamp request, and configure to transmitting. Non-onestep-Sync packet with one-step timestamp request falls back to use two-step timestamp.
Signed-off-by: Yangbo Lu Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 53 ++++++++++++++++++++++++++++++++++ drivers/net/ethernet/mscc/ocelot_net.c | 8 ++--- include/soc/mscc/ocelot.h | 8 ++++- net/dsa/Kconfig | 2 ++ net/dsa/tag_ocelot.c | 27 +++-------------- net/dsa/tag_ocelot_8021q.c | 41 +++++++------------------- 6 files changed, 81 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 3ff4cce1ce7d..0c4283319d7f 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include "ocelot.h" #include "ocelot_vcap.h" @@ -546,6 +547,46 @@ static void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, spin_unlock(&ocelot_port->ts_id_lock); } +u32 ocelot_ptp_rew_op(struct sk_buff *skb) +{ + struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; + u8 ptp_cmd = OCELOT_SKB_CB(skb)->ptp_cmd; + u32 rew_op = 0; + + if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP && clone) { + rew_op = ptp_cmd; + rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; + } else if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) { + rew_op = ptp_cmd; + } + + return rew_op; +} +EXPORT_SYMBOL(ocelot_ptp_rew_op); + +static bool ocelot_ptp_is_onestep_sync(struct sk_buff *skb) +{ + struct ptp_header *hdr; + unsigned int ptp_class; + u8 msgtype, twostep; + + ptp_class = ptp_classify_raw(skb); + if (ptp_class == PTP_CLASS_NONE) + return false; + + hdr = ptp_parse_header(skb, ptp_class); + if (!hdr) + return false; + + msgtype = ptp_get_msgtype(hdr, ptp_class); + twostep = hdr->flag_field[0] & 0x2; + + if (msgtype == PTP_MSGTYPE_SYNC && twostep == 0) + return true; + + return false; +} + int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, struct sk_buff *skb, struct sk_buff **clone) @@ -553,12 +594,24 @@ int ocelot_port_txtstamp_request(struct ocelot *ocelot, int port, struct ocelot_port 
*ocelot_port = ocelot->ports[port]; u8 ptp_cmd = ocelot_port->ptp_cmd; + /* Store ptp_cmd in OCELOT_SKB_CB(skb)->ptp_cmd */ + if (ptp_cmd == IFH_REW_OP_ORIGIN_PTP) { + if (ocelot_ptp_is_onestep_sync(skb)) { + OCELOT_SKB_CB(skb)->ptp_cmd = ptp_cmd; + return 0; + } + + /* Fall back to two-step timestamping */ + ptp_cmd = IFH_REW_OP_TWO_STEP_PTP; + } + if (ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { *clone = skb_clone_sk(skb); if (!(*clone)) return -ENOMEM; ocelot_port_add_txtstamp_skb(ocelot, port, *clone); + OCELOT_SKB_CB(skb)->ptp_cmd = ptp_cmd; } return 0; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index e99c8fb3cb15..aad33d22c33f 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -514,10 +514,10 @@ static netdev_tx_t ocelot_port_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) { - rew_op = ocelot_port->ptp_cmd; - rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; - } + if (clone) + OCELOT_SKB_CB(skb)->clone = clone; + + rew_op = ocelot_ptp_rew_op(skb); } ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index f7632519cb9c..2f5ce4d4fdbf 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -691,6 +691,7 @@ struct ocelot_policer { struct ocelot_skb_cb { struct sk_buff *clone; + u8 ptp_cmd; u8 ts_id; }; @@ -748,15 +749,16 @@ u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target, void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target, u32 val, u32 reg, u32 offset); -/* Packet I/O */ #if IS_ENABLED(CONFIG_MSCC_OCELOT_SWITCH_LIB) +/* Packet I/O */ bool ocelot_can_inject(struct ocelot *ocelot, int grp); void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp, u32 rew_op, struct sk_buff *skb); int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct 
sk_buff **skb); void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp); +u32 ocelot_ptp_rew_op(struct sk_buff *skb); #else static inline bool ocelot_can_inject(struct ocelot *ocelot, int grp) @@ -780,6 +782,10 @@ static inline void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp) { } +static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb) +{ + return 0; +} #endif /* Hardware initialization */ diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index cbc2bd643ab2..5baba7021427 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -111,6 +111,8 @@ config NET_DSA_TAG_RTL4_A config NET_DSA_TAG_OCELOT tristate "Tag driver for Ocelot family of switches, using NPI port" + depends on MSCC_OCELOT_SWITCH_LIB || \ + (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) select PACKING help Say Y or M if you want to enable NPI tagging for the Ocelot switches diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 1100a16f1032..91f0fd1242cd 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -5,33 +5,14 @@ #include #include "dsa_priv.h" -static void ocelot_xmit_ptp(struct dsa_port *dp, void *injection, - struct sk_buff *clone) -{ - struct ocelot *ocelot = dp->ds->priv; - struct ocelot_port *ocelot_port; - u64 rew_op; - - ocelot_port = ocelot->ports[dp->index]; - rew_op = ocelot_port->ptp_cmd; - - /* Retrieve timestamp ID populated inside OCELOT_SKB_CB(clone)->ts_id - * by ocelot_port_add_txtstamp_skb - */ - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) - rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; - - ocelot_ifh_set_rew_op(injection, rew_op); -} - static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, __be32 ifh_prefix, void **ifh) { struct dsa_port *dp = dsa_slave_to_port(netdev); - struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; struct dsa_switch *ds = dp->ds; void *injection; __be32 *prefix; + u32 rew_op = 0; injection = skb_push(skb, OCELOT_TAG_LEN); prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN); @@ -42,9 +23,9 @@ static 
void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, ocelot_ifh_set_src(injection, ds->num_ports); ocelot_ifh_set_qos_class(injection, skb->priority); - /* TX timestamping was requested */ - if (clone) - ocelot_xmit_ptp(dp, injection, clone); + rew_op = ocelot_ptp_rew_op(skb); + if (rew_op) + ocelot_ifh_set_rew_op(injection, rew_op); *ifh = injection; } diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index a001a7e3f575..62a93303bd63 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -13,32 +13,6 @@ #include #include "dsa_priv.h" -static struct sk_buff *ocelot_xmit_ptp(struct dsa_port *dp, - struct sk_buff *skb, - struct sk_buff *clone) -{ - struct ocelot *ocelot = dp->ds->priv; - struct ocelot_port *ocelot_port; - int port = dp->index; - u32 rew_op; - - if (!ocelot_can_inject(ocelot, 0)) - return NULL; - - ocelot_port = ocelot->ports[port]; - rew_op = ocelot_port->ptp_cmd; - - /* Retrieve timestamp ID populated inside OCELOT_SKB_CB(clone)->ts_id - * by ocelot_port_add_txtstamp_skb - */ - if (ocelot_port->ptp_cmd == IFH_REW_OP_TWO_STEP_PTP) - rew_op |= OCELOT_SKB_CB(clone)->ts_id << 3; - - ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); - - return NULL; -} - static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -46,11 +20,18 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - struct sk_buff *clone = OCELOT_SKB_CB(skb)->clone; + struct ocelot *ocelot = dp->ds->priv; + int port = dp->index; + u32 rew_op = 0; + + rew_op = ocelot_ptp_rew_op(skb); + if (rew_op) { + if (!ocelot_can_inject(ocelot, 0)) + return NULL; - /* TX timestamping was requested, so inject through MMIO */ - if (clone) - return ocelot_xmit_ptp(dp, skb, clone); + ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); + return NULL; + } return 
dsa_8021q_xmit(skb, netdev, ETH_P_8021Q, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); -- cgit v1.2.3 From 800fcab8230f622544a12403977b5b7259a076f8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 27 Apr 2021 09:09:07 +0200 Subject: net: phy: Add support for microchip SMI0 MDIO bus SMI0 is a mangled version of MDIO. The main low level difference is the MDIO C22 OP code is always 0, not 0x2 or 0x1 for Read/Write. The read/write information is instead encoded in the PHY address. Extend the bit-bang code to allow the op code to be overridden, but default to normal C22 values. Add an extra compatible to the mdio-gpio driver, and when this compatible is present, set the op codes to 0. A higher level driver, sitting on top of the basic MDIO bus driver can then implement the rest of the microchip SMI0 oddities. Signed-off-by: Andrew Lunn Signed-off-by: Michael Grzeschik Signed-off-by: Oleksij Rempel Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/mdio/mdio-bitbang.c | 8 ++++++-- drivers/net/mdio/mdio-gpio.c | 8 ++++++++ include/linux/mdio-bitbang.h | 3 +++ 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/mdio/mdio-bitbang.c b/drivers/net/mdio/mdio-bitbang.c index 0f457c436335..07609114a26b 100644 --- a/drivers/net/mdio/mdio-bitbang.c +++ b/drivers/net/mdio/mdio-bitbang.c @@ -158,7 +158,7 @@ int mdiobb_read(struct mii_bus *bus, int phy, int reg) reg = mdiobb_cmd_addr(ctrl, phy, reg); mdiobb_cmd(ctrl, MDIO_C45_READ, phy, reg); } else - mdiobb_cmd(ctrl, MDIO_READ, phy, reg); + mdiobb_cmd(ctrl, ctrl->op_c22_read, phy, reg); ctrl->ops->set_mdio_dir(ctrl, 0); @@ -190,7 +190,7 @@ int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) reg = mdiobb_cmd_addr(ctrl, phy, reg); mdiobb_cmd(ctrl, MDIO_C45_WRITE, phy, reg); } else - mdiobb_cmd(ctrl, MDIO_WRITE, phy, reg); + mdiobb_cmd(ctrl, ctrl->op_c22_write, phy, reg); /* send the turnaround (10) */ mdiobb_send_bit(ctrl, 1); @@ -217,6 +217,10 @@
struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl) bus->read = mdiobb_read; bus->write = mdiobb_write; bus->priv = ctrl; + if (!ctrl->override_op_c22) { + ctrl->op_c22_read = MDIO_READ; + ctrl->op_c22_write = MDIO_WRITE; + } return bus; } diff --git a/drivers/net/mdio/mdio-gpio.c b/drivers/net/mdio/mdio-gpio.c index 56c8f914f893..0fb3c2de0845 100644 --- a/drivers/net/mdio/mdio-gpio.c +++ b/drivers/net/mdio/mdio-gpio.c @@ -132,6 +132,13 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, new_bus->phy_ignore_ta_mask = pdata->phy_ignore_ta_mask; } + if (dev->of_node && + of_device_is_compatible(dev->of_node, "microchip,mdio-smi0")) { + bitbang->ctrl.op_c22_read = 0; + bitbang->ctrl.op_c22_write = 0; + bitbang->ctrl.override_op_c22 = 1; + } + dev_set_drvdata(dev, new_bus); return new_bus; @@ -196,6 +203,7 @@ static int mdio_gpio_remove(struct platform_device *pdev) static const struct of_device_id mdio_gpio_of_match[] = { { .compatible = "virtual,mdio-gpio", }, + { .compatible = "microchip,mdio-smi0" }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, mdio_gpio_of_match); diff --git a/include/linux/mdio-bitbang.h b/include/linux/mdio-bitbang.h index aca4dc037b70..373630fe5c28 100644 --- a/include/linux/mdio-bitbang.h +++ b/include/linux/mdio-bitbang.h @@ -33,6 +33,9 @@ struct mdiobb_ops { struct mdiobb_ctrl { const struct mdiobb_ops *ops; + unsigned int override_op_c22; + u8 op_c22_read; + u8 op_c22_write; }; int mdiobb_read(struct mii_bus *bus, int phy, int reg); -- cgit v1.2.3 From 76d6a13383b8e3ff20a9cf52aa9c3de39e485632 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Tue, 27 Apr 2021 19:43:12 +0200 Subject: seq_file: Add a seq_bprintf function Similarly to seq_buf_bprintf in lib/seq_buf.c, this function writes a printf formatted string with arguments provided in a "binary representation" built by functions such as vbin_printf. 
Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210427174313.860948-2-revest@chromium.org --- fs/seq_file.c | 18 ++++++++++++++++++ include/linux/seq_file.h | 4 ++++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/fs/seq_file.c b/fs/seq_file.c index cb11a34fb871..5059248f2d64 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -412,6 +412,24 @@ void seq_printf(struct seq_file *m, const char *f, ...) } EXPORT_SYMBOL(seq_printf); +#ifdef CONFIG_BINARY_PRINTF +void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary) +{ + int len; + + if (m->count < m->size) { + len = bstr_printf(m->buf + m->count, m->size - m->count, f, + binary); + if (m->count + len < m->size) { + m->count += len; + return; + } + } + seq_set_overflow(m); +} +EXPORT_SYMBOL(seq_bprintf); +#endif /* CONFIG_BINARY_PRINTF */ + /** * mangle_path - mangle and copy path to buffer beginning * @s: buffer start diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index b83b3ae3c877..723b1fa1177e 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -146,6 +146,10 @@ void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); +#ifdef CONFIG_BINARY_PRINTF +void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary); +#endif + #define DEFINE_SEQ_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ -- cgit v1.2.3 From 48cac3f4a96ddf08df8e53809ed066de0dc93915 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Tue, 27 Apr 2021 19:43:13 +0200 Subject: bpf: Implement formatted output helpers with bstr_printf BPF has three formatted output helpers: bpf_trace_printk, bpf_seq_printf and bpf_snprintf. Their signatures specify that all arguments are provided from the BPF world as u64s (in an array or as registers). 
All of these helpers are currently implemented by calling functions such as snprintf() whose signatures take a variable number of arguments, then placed in a va_list by the compiler to call vsnprintf(). "d9c9e4db bpf: Factorize bpf_trace_printk and bpf_seq_printf" introduced a bpf_printf_prepare function that fills an array of u64 sanitized arguments with an array of "modifiers" which indicate what the "real" size of each argument should be (given by the format specifier). The BPF_CAST_FMT_ARG macro consumes these arrays and casts each argument to its real size. However, the C promotion rules implicitly cast them all back to u64s. Therefore, the arguments given to snprintf are u64s and the va_list constructed by the compiler will use 64 bits for each argument. On 64 bit machines, this happens to work well because 32 bit arguments in va_lists need to occupy 64 bits anyway, but on 32 bit architectures this breaks the layout of the va_list expected by the called function and mangles values. In "88a5c690b6 bpf: fix bpf_trace_printk on 32 bit archs", this problem had been solved for bpf_trace_printk only with a "horrid workaround" that emitted multiple calls to trace_printk where each call had different argument types and generated different va_list layouts. One of the calls would be dynamically chosen at runtime. This was ok with the 3 arguments that bpf_trace_printk takes but bpf_seq_printf and bpf_snprintf accept up to 12 arguments. Because this approach scales code exponentially, it is not a viable option anymore. Because the promotion rules are part of the language and because the construction of a va_list is an arch-specific ABI, it's best to just avoid variadic arguments and va_lists altogether. Thankfully the kernel's snprintf() has an alternative in the form of bstr_printf() that accepts arguments in a "binary buffer representation". 
These binary buffers are currently created by vbin_printf and used in the tracing subsystem to split the cost of printing into two parts: a fast one that only dereferences and remembers values, and a slower one, called later, that does the pretty-printing. This patch refactors bpf_printf_prepare to construct binary buffers of arguments consumable by bstr_printf() instead of arrays of arguments and modifiers. This gets rid of BPF_CAST_FMT_ARG and greatly simplifies the bpf_printf_prepare usage but there are a few gotchas that change how bpf_printf_prepare needs to do things. Currently, bpf_printf_prepare uses a per cpu temporary buffer as generic storage for strings and IP addresses. With this refactoring, the temporary buffer now holds all the arguments in a structured binary format. To comply with the format expected by bstr_printf, certain format specifiers also need to be pre-formatted: %pB and %pi6/%pi4/%pI4/%pI6. Because vsnprintf subroutines for these specifiers are hard to expose, we pre-format these arguments with calls to snprintf(). 
Reported-by: Rasmus Villemoes Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210427174313.860948-3-revest@chromium.org --- include/linux/bpf.h | 22 +----- init/Kconfig | 1 + kernel/bpf/helpers.c | 188 +++++++++++++++++++++++++---------------------- kernel/bpf/verifier.c | 2 +- kernel/trace/bpf_trace.c | 34 +++------ 5 files changed, 115 insertions(+), 132 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ad4bcf1cadbb..b33f199c4cc2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2081,24 +2081,8 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); -enum bpf_printf_mod_type { - BPF_PRINTF_INT, - BPF_PRINTF_LONG, - BPF_PRINTF_LONG_LONG, -}; - -/* Workaround for getting va_list handling working with different argument type - * combinations generically for 32 and 64 bit archs. - */ -#define BPF_CAST_FMT_ARG(arg_nb, args, mod) \ - (mod[arg_nb] == BPF_PRINTF_LONG_LONG || \ - (mod[arg_nb] == BPF_PRINTF_LONG && __BITS_PER_LONG == 64) \ - ? 
(u64)args[arg_nb] \ - : (u32)args[arg_nb]) - -int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, - u64 *final_args, enum bpf_printf_mod_type *mod, - u32 num_args); -void bpf_printf_cleanup(void); +int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u32 **bin_buf, u32 num_args); +void bpf_bprintf_cleanup(void); #endif /* _LINUX_BPF_H */ diff --git a/init/Kconfig b/init/Kconfig index 5deae45b8d81..0d82a1f838cc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1708,6 +1708,7 @@ config BPF_SYSCALL select BPF select IRQ_WORK select TASKS_TRACE_RCU + select BINARY_PRINTF select NET_SOCK_MSG if INET default n help diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 85b26ca5aacd..544773970dbc 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -707,9 +707,6 @@ static int try_get_fmt_tmp_buf(char **tmp_buf) struct bpf_printf_buf *bufs; int used; - if (*tmp_buf) - return 0; - preempt_disable(); used = this_cpu_inc_return(bpf_printf_buf_used); if (WARN_ON_ONCE(used > 1)) { @@ -723,7 +720,7 @@ static int try_get_fmt_tmp_buf(char **tmp_buf) return 0; } -void bpf_printf_cleanup(void) +void bpf_bprintf_cleanup(void) { if (this_cpu_read(bpf_printf_buf_used)) { this_cpu_dec(bpf_printf_buf_used); @@ -732,43 +729,45 @@ void bpf_printf_cleanup(void) } /* - * bpf_parse_fmt_str - Generic pass on format strings for printf-like helpers + * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * * Returns a negative value if fmt is an invalid format string or 0 otherwise. * * This can be used in two ways: - * - Format string verification only: when final_args and mod are NULL + * - Format string verification only: when bin_args is NULL * - Arguments preparation: in addition to the above verification, it writes in - * final_args a copy of raw_args where pointers from BPF have been sanitized - * into pointers safe to use by snprintf. 
This also writes in the mod array - * the size requirement of each argument, usable by BPF_CAST_FMT_ARG for ex. + * bin_args a binary representation of arguments usable by bstr_printf where + * pointers from BPF have been sanitized. * * In argument preparation mode, if 0 is returned, safe temporary buffers are - * allocated and bpf_printf_cleanup should be called to free them after use. + * allocated and bpf_bprintf_cleanup should be called to free them after use. */ -int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, - u64 *final_args, enum bpf_printf_mod_type *mod, - u32 num_args) +int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, + u32 **bin_args, u32 num_args) { - char *unsafe_ptr = NULL, *tmp_buf = NULL, *fmt_end; - size_t tmp_buf_len = MAX_PRINTF_BUF_LEN; - int err, i, num_spec = 0, copy_size; - enum bpf_printf_mod_type cur_mod; + char *unsafe_ptr = NULL, *tmp_buf = NULL, *tmp_buf_end, *fmt_end; + size_t sizeof_cur_arg, sizeof_cur_ip; + int err, i, num_spec = 0; u64 cur_arg; - char fmt_ptype; - - if (!!final_args != !!mod) - return -EINVAL; + char fmt_ptype, cur_ip[16], ip_spec[] = "%pXX"; fmt_end = strnchr(fmt, fmt_size, 0); if (!fmt_end) return -EINVAL; fmt_size = fmt_end - fmt; + if (bin_args) { + if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) + return -EBUSY; + + tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN; + *bin_args = (u32 *)tmp_buf; + } + for (i = 0; i < fmt_size; i++) { if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { err = -EINVAL; - goto cleanup; + goto out; } if (fmt[i] != '%') @@ -781,7 +780,7 @@ int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (num_spec >= num_args) { err = -EINVAL; - goto cleanup; + goto out; } /* The string is zero-terminated so if fmt[i] != 0, we can @@ -800,7 +799,7 @@ int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, } if (fmt[i] == 'p') { - cur_mod = BPF_PRINTF_LONG; + sizeof_cur_arg = sizeof(long); if ((fmt[i + 1] == 'k' || 
fmt[i + 1] == 'u') && fmt[i + 2] == 's') { @@ -811,117 +810,140 @@ int bpf_printf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) || ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' || - fmt[i + 1] == 'x' || fmt[i + 1] == 'B' || - fmt[i + 1] == 's' || fmt[i + 1] == 'S') { + fmt[i + 1] == 'x' || fmt[i + 1] == 's' || + fmt[i + 1] == 'S') { /* just kernel pointers */ - if (final_args) + if (tmp_buf) cur_arg = raw_args[num_spec]; - goto fmt_next; + i++; + goto nocopy_fmt; + } + + if (fmt[i + 1] == 'B') { + if (tmp_buf) { + err = snprintf(tmp_buf, + (tmp_buf_end - tmp_buf), + "%pB", + (void *)(long)raw_args[num_spec]); + tmp_buf += (err + 1); + } + + i++; + num_spec++; + continue; } /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ if ((fmt[i + 1] != 'i' && fmt[i + 1] != 'I') || (fmt[i + 2] != '4' && fmt[i + 2] != '6')) { err = -EINVAL; - goto cleanup; + goto out; } i += 2; - if (!final_args) - goto fmt_next; + if (!tmp_buf) + goto nocopy_fmt; - if (try_get_fmt_tmp_buf(&tmp_buf)) { - err = -EBUSY; - goto out; - } - - copy_size = (fmt[i + 2] == '4') ? 4 : 16; - if (tmp_buf_len < copy_size) { + sizeof_cur_ip = (fmt[i] == '4') ? 4 : 16; + if (tmp_buf_end - tmp_buf < sizeof_cur_ip) { err = -ENOSPC; - goto cleanup; + goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; - err = copy_from_kernel_nofault(tmp_buf, unsafe_ptr, - copy_size); + err = copy_from_kernel_nofault(cur_ip, unsafe_ptr, + sizeof_cur_ip); if (err < 0) - memset(tmp_buf, 0, copy_size); - cur_arg = (u64)(long)tmp_buf; - tmp_buf += copy_size; - tmp_buf_len -= copy_size; + memset(cur_ip, 0, sizeof_cur_ip); + + /* hack: bstr_printf expects IP addresses to be + * pre-formatted as strings, ironically, the easiest way + * to do that is to call snprintf. 
+ */ + ip_spec[2] = fmt[i - 1]; + ip_spec[3] = fmt[i]; + err = snprintf(tmp_buf, tmp_buf_end - tmp_buf, + ip_spec, &cur_ip); - goto fmt_next; + tmp_buf += err + 1; + num_spec++; + + continue; } else if (fmt[i] == 's') { - cur_mod = BPF_PRINTF_LONG; fmt_ptype = fmt[i]; fmt_str: if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && !ispunct(fmt[i + 1])) { err = -EINVAL; - goto cleanup; - } - - if (!final_args) - goto fmt_next; - - if (try_get_fmt_tmp_buf(&tmp_buf)) { - err = -EBUSY; goto out; } - if (!tmp_buf_len) { + if (!tmp_buf) + goto nocopy_fmt; + + if (tmp_buf_end == tmp_buf) { err = -ENOSPC; - goto cleanup; + goto out; } unsafe_ptr = (char *)(long)raw_args[num_spec]; err = bpf_trace_copy_string(tmp_buf, unsafe_ptr, - fmt_ptype, tmp_buf_len); + fmt_ptype, + tmp_buf_end - tmp_buf); if (err < 0) { tmp_buf[0] = '\0'; err = 1; } - cur_arg = (u64)(long)tmp_buf; tmp_buf += err; - tmp_buf_len -= err; + num_spec++; - goto fmt_next; + continue; } - cur_mod = BPF_PRINTF_INT; + sizeof_cur_arg = sizeof(int); if (fmt[i] == 'l') { - cur_mod = BPF_PRINTF_LONG; + sizeof_cur_arg = sizeof(long); i++; } if (fmt[i] == 'l') { - cur_mod = BPF_PRINTF_LONG_LONG; + sizeof_cur_arg = sizeof(long long); i++; } if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x' && fmt[i] != 'X') { err = -EINVAL; - goto cleanup; + goto out; } - if (final_args) + if (tmp_buf) cur_arg = raw_args[num_spec]; -fmt_next: - if (final_args) { - mod[num_spec] = cur_mod; - final_args[num_spec] = cur_arg; +nocopy_fmt: + if (tmp_buf) { + tmp_buf = PTR_ALIGN(tmp_buf, sizeof(u32)); + if (tmp_buf_end - tmp_buf < sizeof_cur_arg) { + err = -ENOSPC; + goto out; + } + + if (sizeof_cur_arg == 8) { + *(u32 *)tmp_buf = *(u32 *)&cur_arg; + *(u32 *)(tmp_buf + 4) = *((u32 *)&cur_arg + 1); + } else { + *(u32 *)tmp_buf = (u32)(long)cur_arg; + } + tmp_buf += sizeof_cur_arg; } num_spec++; } err = 0; -cleanup: - if (err) - bpf_printf_cleanup(); out: + if (err) + bpf_bprintf_cleanup(); return err; } @@ -930,9 +952,8 @@ out: 
BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, const void *, data, u32, data_len) { - enum bpf_printf_mod_type mod[MAX_SNPRINTF_VARARGS]; - u64 args[MAX_SNPRINTF_VARARGS]; int err, num_args; + u32 *bin_args; if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 || (data_len && !data)) @@ -942,22 +963,13 @@ BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt, /* ARG_PTR_TO_CONST_STR guarantees that fmt is zero-terminated so we * can safely give an unbounded size. */ - err = bpf_printf_prepare(fmt, UINT_MAX, data, args, mod, num_args); + err = bpf_bprintf_prepare(fmt, UINT_MAX, data, &bin_args, num_args); if (err < 0) return err; - /* Maximumly we can have MAX_SNPRINTF_VARARGS parameters, just give - * all of them to snprintf(). - */ - err = snprintf(str, str_size, fmt, BPF_CAST_FMT_ARG(0, args, mod), - BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), - BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), - BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), - BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), - BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), - BPF_CAST_FMT_ARG(11, args, mod)); - - bpf_printf_cleanup(); + err = bstr_printf(str, str_size, fmt, bin_args); + + bpf_bprintf_cleanup(); return err + 1; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9145f88b2a0a..8fd552c16763 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5946,7 +5946,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, /* We are also guaranteed that fmt+fmt_map_off is NULL terminated, we * can focus on validating the format specifiers. 
*/ - err = bpf_printf_prepare(fmt, UINT_MAX, NULL, NULL, NULL, num_args); + err = bpf_bprintf_prepare(fmt, UINT_MAX, NULL, NULL, num_args); if (err < 0) verbose(env, "Invalid format string\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0e67d12a8f40..d2d7cf6cfe83 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -381,27 +381,23 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) { u64 args[MAX_TRACE_PRINTK_VARARGS] = { arg1, arg2, arg3 }; - enum bpf_printf_mod_type mod[MAX_TRACE_PRINTK_VARARGS]; + u32 *bin_args; static char buf[BPF_TRACE_PRINTK_SIZE]; unsigned long flags; int ret; - ret = bpf_printf_prepare(fmt, fmt_size, args, args, mod, - MAX_TRACE_PRINTK_VARARGS); + ret = bpf_bprintf_prepare(fmt, fmt_size, args, &bin_args, + MAX_TRACE_PRINTK_VARARGS); if (ret < 0) return ret; raw_spin_lock_irqsave(&trace_printk_lock, flags); - ret = snprintf(buf, sizeof(buf), fmt, BPF_CAST_FMT_ARG(0, args, mod), - BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod)); - /* snprintf() will not append null for zero-length strings */ - if (ret == 0) - buf[0] = '\0'; + ret = bstr_printf(buf, sizeof(buf), fmt, bin_args); trace_bpf_trace_printk(buf); raw_spin_unlock_irqrestore(&trace_printk_lock, flags); - bpf_printf_cleanup(); + bpf_bprintf_cleanup(); return ret; } @@ -435,31 +431,21 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, const void *, data, u32, data_len) { - enum bpf_printf_mod_type mod[MAX_SEQ_PRINTF_VARARGS]; - u64 args[MAX_SEQ_PRINTF_VARARGS]; int err, num_args; + u32 *bin_args; if (data_len & 7 || data_len > MAX_SEQ_PRINTF_VARARGS * 8 || (data_len && !data)) return -EINVAL; num_args = data_len / 8; - err = bpf_printf_prepare(fmt, fmt_size, data, args, mod, num_args); + err = bpf_bprintf_prepare(fmt, fmt_size, data, &bin_args, num_args); if (err < 0) return err; - /* Maximumly 
we can have MAX_SEQ_PRINTF_VARARGS parameter, just give - * all of them to seq_printf(). - */ - seq_printf(m, fmt, BPF_CAST_FMT_ARG(0, args, mod), - BPF_CAST_FMT_ARG(1, args, mod), BPF_CAST_FMT_ARG(2, args, mod), - BPF_CAST_FMT_ARG(3, args, mod), BPF_CAST_FMT_ARG(4, args, mod), - BPF_CAST_FMT_ARG(5, args, mod), BPF_CAST_FMT_ARG(6, args, mod), - BPF_CAST_FMT_ARG(7, args, mod), BPF_CAST_FMT_ARG(8, args, mod), - BPF_CAST_FMT_ARG(9, args, mod), BPF_CAST_FMT_ARG(10, args, mod), - BPF_CAST_FMT_ARG(11, args, mod)); - - bpf_printf_cleanup(); + seq_bprintf(m, fmt, bin_args); + + bpf_bprintf_cleanup(); return seq_has_overflowed(m) ? -EOVERFLOW : 0; } -- cgit v1.2.3 From e542d29ca81d005651680a0a697b72ca13ddc4cc Mon Sep 17 00:00:00 2001 From: Andreas Roeseler Date: Tue, 27 Apr 2021 10:36:35 -0500 Subject: icmp: standardize naming of RFC 8335 PROBE constants The current definitions of constants for PROBE, currently defined only in the net-next kernel branch, are inconsistent, with some beginning with ICMP and others with simply EXT. This patch attempts to standardize the naming conventions of the constants for PROBE before their release into a stable Kernel, and to update the relevant definitions in net/ipv4/icmp.c. Similarly, the definitions for the code field (previously ICMP_EXT_MAL_QUERY, etc) use the same prefixes as the type field. This patch adds _CODE_ to the prefix to clarify the distinction of these constants. 
Signed-off-by: Andreas Roeseler Acked-by: David Ahern Link: https://lore.kernel.org/r/20210427153635.2591-1-andreas.a.roeseler@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/icmp.h | 28 ++++++++++++++-------------- net/ipv4/icmp.c | 16 ++++++++-------- 2 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/icmp.h b/include/uapi/linux/icmp.h index 222325d1d80e..c1da8244c5e1 100644 --- a/include/uapi/linux/icmp.h +++ b/include/uapi/linux/icmp.h @@ -70,22 +70,22 @@ #define ICMP_EXC_FRAGTIME 1 /* Fragment Reass time exceeded */ /* Codes for EXT_ECHO (PROBE) */ -#define ICMP_EXT_ECHO 42 -#define ICMP_EXT_ECHOREPLY 43 -#define ICMP_EXT_MAL_QUERY 1 /* Malformed Query */ -#define ICMP_EXT_NO_IF 2 /* No such Interface */ -#define ICMP_EXT_NO_TABLE_ENT 3 /* No such Table Entry */ -#define ICMP_EXT_MULT_IFS 4 /* Multiple Interfaces Satisfy Query */ +#define ICMP_EXT_ECHO 42 +#define ICMP_EXT_ECHOREPLY 43 +#define ICMP_EXT_CODE_MAL_QUERY 1 /* Malformed Query */ +#define ICMP_EXT_CODE_NO_IF 2 /* No such Interface */ +#define ICMP_EXT_CODE_NO_TABLE_ENT 3 /* No such Table Entry */ +#define ICMP_EXT_CODE_MULT_IFS 4 /* Multiple Interfaces Satisfy Query */ /* Constants for EXT_ECHO (PROBE) */ -#define EXT_ECHOREPLY_ACTIVE (1 << 2)/* active bit in reply message */ -#define EXT_ECHOREPLY_IPV4 (1 << 1)/* ipv4 bit in reply message */ -#define EXT_ECHOREPLY_IPV6 1 /* ipv6 bit in reply message */ -#define EXT_ECHO_CTYPE_NAME 1 -#define EXT_ECHO_CTYPE_INDEX 2 -#define EXT_ECHO_CTYPE_ADDR 3 -#define ICMP_AFI_IP 1 /* Address Family Identifier for ipv4 */ -#define ICMP_AFI_IP6 2 /* Address Family Identifier for ipv6 */ +#define ICMP_EXT_ECHOREPLY_ACTIVE (1 << 2)/* active bit in reply message */ +#define ICMP_EXT_ECHOREPLY_IPV4 (1 << 1)/* ipv4 bit in reply message */ +#define ICMP_EXT_ECHOREPLY_IPV6 1 /* ipv6 bit in reply message */ +#define ICMP_EXT_ECHO_CTYPE_NAME 1 +#define ICMP_EXT_ECHO_CTYPE_INDEX 2 +#define 
ICMP_EXT_ECHO_CTYPE_ADDR 3 +#define ICMP_AFI_IP 1 /* Address Family Identifier for ipv4 */ +#define ICMP_AFI_IP6 2 /* Address Family Identifier for ipv6 */ struct icmphdr { __u8 type; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 8bd988fbcb31..7b6931a4d775 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1033,7 +1033,7 @@ static bool icmp_echo(struct sk_buff *skb) status = 0; dev = NULL; switch (iio->extobj_hdr.class_type) { - case EXT_ECHO_CTYPE_NAME: + case ICMP_EXT_ECHO_CTYPE_NAME: iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); if (ident_len >= IFNAMSIZ) goto send_mal_query; @@ -1041,14 +1041,14 @@ static bool icmp_echo(struct sk_buff *skb) memcpy(buff, &iio->ident.name, ident_len); dev = dev_get_by_name(net, buff); break; - case EXT_ECHO_CTYPE_INDEX: + case ICMP_EXT_ECHO_CTYPE_INDEX: iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + sizeof(iio->ident.ifindex), &_iio); if (ident_len != sizeof(iio->ident.ifindex)) goto send_mal_query; dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); break; - case EXT_ECHO_CTYPE_ADDR: + case ICMP_EXT_ECHO_CTYPE_ADDR: if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + iio->ident.addr.ctype3_hdr.addrlen) goto send_mal_query; @@ -1080,23 +1080,23 @@ static bool icmp_echo(struct sk_buff *skb) goto send_mal_query; } if (!dev) { - icmp_param.data.icmph.code = ICMP_EXT_NO_IF; + icmp_param.data.icmph.code = ICMP_EXT_CODE_NO_IF; goto send_reply; } /* Fill bits in reply message */ if (dev->flags & IFF_UP) - status |= EXT_ECHOREPLY_ACTIVE; + status |= ICMP_EXT_ECHOREPLY_ACTIVE; if (__in_dev_get_rcu(dev) && __in_dev_get_rcu(dev)->ifa_list) - status |= EXT_ECHOREPLY_IPV4; + status |= ICMP_EXT_ECHOREPLY_IPV4; if (!list_empty(&rcu_dereference(dev->ip6_ptr)->addr_list)) - status |= EXT_ECHOREPLY_IPV6; + status |= ICMP_EXT_ECHOREPLY_IPV6; dev_put(dev); icmp_param.data.icmph.un.echo.sequence |= htons(status); send_reply: icmp_reply(&icmp_param, skb); return true; send_mal_query: 
- icmp_param.data.icmph.code = ICMP_EXT_MAL_QUERY; + icmp_param.data.icmph.code = ICMP_EXT_CODE_MAL_QUERY; goto send_reply; } -- cgit v1.2.3 From 4a52dd8fefb45626dace70a63c0738dbd83b7edb Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 28 Apr 2021 15:09:46 +0200 Subject: net: selftest: fix build issue if INET is disabled In case ethernet driver is enabled and INET is disabled, selftest will fail to build. Reported-by: Randy Dunlap Fixes: 3e1e58d64c3d ("net: add generic selftest support") Signed-off-by: Oleksij Rempel Acked-by: Randy Dunlap # build-tested Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20210428130947.29649-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/atheros/Kconfig | 2 +- drivers/net/ethernet/freescale/Kconfig | 2 +- include/net/selftests.h | 19 +++++++++++++++++++ net/Kconfig | 2 +- net/core/Makefile | 2 +- net/dsa/Kconfig | 2 +- 6 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/atheros/Kconfig b/drivers/net/ethernet/atheros/Kconfig index 6842b74b0696..482c58c4c584 100644 --- a/drivers/net/ethernet/atheros/Kconfig +++ b/drivers/net/ethernet/atheros/Kconfig @@ -20,8 +20,8 @@ if NET_VENDOR_ATHEROS config AG71XX tristate "Atheros AR7XXX/AR9XXX built-in ethernet mac support" depends on ATH79 - select NET_SELFTESTS select PHYLINK + imply NET_SELFTESTS help If you wish to compile a kernel for AR7XXX/91XXX and enable ethernet support, then you should always answer Y to this. 
diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig index 3d937b4650b2..2d1abdd58fab 100644 --- a/drivers/net/ethernet/freescale/Kconfig +++ b/drivers/net/ethernet/freescale/Kconfig @@ -26,8 +26,8 @@ config FEC ARCH_MXC || SOC_IMX28 || COMPILE_TEST) default ARCH_MXC || SOC_IMX28 if ARM select CRC32 - select NET_SELFTESTS select PHYLIB + imply NET_SELFTESTS imply PTP_1588_CLOCK help Say Y here if you want to use the built-in 10/100 Fast ethernet diff --git a/include/net/selftests.h b/include/net/selftests.h index 9993b9498cf3..e65e8d230d33 100644 --- a/include/net/selftests.h +++ b/include/net/selftests.h @@ -4,9 +4,28 @@ #include +#if IS_ENABLED(CONFIG_NET_SELFTESTS) + void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf); int net_selftest_get_count(void); void net_selftest_get_strings(u8 *data); +#else + +static inline void net_selftest(struct net_device *ndev, struct ethtool_test *etest, + u64 *buf) +{ +} + +static inline int net_selftest_get_count(void) +{ + return 0; +} + +static inline void net_selftest_get_strings(u8 *data) +{ +} + +#endif #endif /* _NET_SELFTESTS */ diff --git a/net/Kconfig b/net/Kconfig index 8d955195c069..f5ee7c65e6b4 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -431,7 +431,7 @@ config SOCK_VALIDATE_XMIT config NET_SELFTESTS def_tristate PHYLIB - depends on PHYLIB + depends on PHYLIB && INET config NET_SOCK_MSG bool diff --git a/net/core/Makefile b/net/core/Makefile index 1a6168d8f23b..f7f16650fe9e 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o @@ -33,7 +34,6 @@ 
obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o ifeq ($(CONFIG_INET),y) -obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o endif diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 5baba7021427..00bb89b2d86f 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,7 +9,7 @@ menuconfig NET_DSA select NET_SWITCHDEV select PHYLINK select NET_DEVLINK - select NET_SELFTESTS + imply NET_SELFTESTS help Say Y if you want to enable support for the hardware switches supported by the Distributed Switch Architecture. -- cgit v1.2.3