From 92658916e21f12c4e8c81484f7f3f9ee56b42bb1 Mon Sep 17 00:00:00 2001 From: Amitay Isaacs Date: Fri, 21 Jan 2022 16:08:16 +1030 Subject: fsi: sbefifo: Implement FSI_SBEFIFO_READ_TIMEOUT_SECONDS ioctl FSI_SBEFIFO_READ_TIMEOUT_SECONDS ioctl sets the read timeout (in seconds) for the response received by sbefifo device from sbe. The timeout affects only the read operation on current sbefifo device fd. Certain SBE operations can take long time to complete and the default timeout of 10 seconds might not be sufficient to start receiving response from SBE. In such cases, allow the timeout to be set to the maximum of 120 seconds. The kernel does not contain the definition of the various SBE operations, so we must expose an interface to userspace to set the timeout for the given operation. OpenBMC-Staging-Count: 1 Signed-off-by: Amitay Isaacs Signed-off-by: Joel Stanley Reviewed-by: Eddie James Link: https://lore.kernel.org/r/20220121053816.82253-3-joel@jms.id.au Signed-off-by: Joel Stanley --- include/uapi/linux/fsi.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/fsi.h b/include/uapi/linux/fsi.h index da577ecd90e7..b2f1977378c7 100644 --- a/include/uapi/linux/fsi.h +++ b/include/uapi/linux/fsi.h @@ -55,4 +55,18 @@ struct scom_access { #define FSI_SCOM_WRITE _IOWR('s', 0x02, struct scom_access) #define FSI_SCOM_RESET _IOW('s', 0x03, __u32) +/* + * /dev/sbefifo* ioctl interface + */ + +/** + * FSI_SBEFIFO_READ_TIMEOUT sets the read timeout for response from SBE. + * + * The read timeout is specified in seconds. The minimum value of read + * timeout is 10 seconds (default) and the maximum value of read timeout is + * 120 seconds. A read timeout of 0 will reset the value to the default of + * (10 seconds). + */ +#define FSI_SBEFIFO_READ_TIMEOUT_SECONDS _IOW('s', 0x00, __u32) + #endif /* _UAPI_LINUX_FSI_H */ -- cgit v1.2.3 From ab53623d24de80aebad2b5fe734dc7789691f411 Mon Sep 17 00:00:00 2001 From: Eddie James Date: Mon, 7 Feb 2022 10:16:40 -0600 Subject: fsi: Add trace events in initialization path Add definitions for trace events to show the scanning flow. OpenBMC-Staging-Count: 1 Signed-off-by: Eddie James Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20220207161640.35605-1-eajames@linux.ibm.com Signed-off-by: Joel Stanley --- drivers/fsi/fsi-core.c | 11 ++-- drivers/fsi/fsi-master-aspeed.c | 2 + include/trace/events/fsi.h | 86 ++++++++++++++++++++++++++++++++ include/trace/events/fsi_master_aspeed.h | 12 +++++ 4 files changed, 108 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/fsi/fsi-core.c b/drivers/fsi/fsi-core.c index 92e6eebd1851..5858e6339a10 100644 --- a/drivers/fsi/fsi-core.c +++ b/drivers/fsi/fsi-core.c @@ -24,9 +24,6 @@ #include "fsi-master.h" -#define CREATE_TRACE_POINTS -#include - #define FSI_SLAVE_CONF_NEXT_MASK GENMASK(31, 31) #define FSI_SLAVE_CONF_SLOTS_MASK GENMASK(23, 16) #define FSI_SLAVE_CONF_SLOTS_SHIFT 16 @@ -95,6 +92,9 @@ struct fsi_slave { u8 t_echo_delay; }; +#define CREATE_TRACE_POINTS +#include + #define to_fsi_master(d) container_of(d, struct fsi_master, dev) #define to_fsi_slave(d) container_of(d, struct fsi_slave, dev) @@ -524,6 +524,8 @@ static int fsi_slave_scan(struct fsi_slave *slave) dev->addr = engine_addr; dev->size = slots * engine_page_size; + trace_fsi_dev_init(dev); + dev_dbg(&slave->dev, "engine[%i]: type %x, version %x, addr %x size %x\n", dev->unit, dev->engine_type, version, @@ -1006,6 +1008,7 @@ static int fsi_slave_init(struct fsi_master *master, int link, uint8_t id) crc = crc4(0, cfam_id, 32); if (crc) { + trace_fsi_slave_invalid_cfam(master, link, cfam_id); dev_warn(&master->dev, "slave %02x:%02x invalid cfam id CRC!\n", link, id); return -EIO; @@ -1080,6 +1083,8 @@ static int fsi_slave_init(struct fsi_master *master, int link, uint8_t id) if (rc) goto err_free; + trace_fsi_slave_init(slave); + /* Create chardev for userspace access */ cdev_init(&slave->cdev, &cfam_fops); rc = cdev_device_add(&slave->cdev, &slave->dev); diff --git a/drivers/fsi/fsi-master-aspeed.c b/drivers/fsi/fsi-master-aspeed.c index 8606e55c1721..04fec1aab23c 100644 --- a/drivers/fsi/fsi-master-aspeed.c +++ b/drivers/fsi/fsi-master-aspeed.c @@ -449,11 +449,13 @@ static ssize_t cfam_reset_store(struct device *dev, struct device_attribute *att { struct fsi_master_aspeed *aspeed = dev_get_drvdata(dev); + trace_fsi_master_aspeed_cfam_reset(true); mutex_lock(&aspeed->lock); gpiod_set_value(aspeed->cfam_reset_gpio, 1); usleep_range(900, 1000); gpiod_set_value(aspeed->cfam_reset_gpio, 0); mutex_unlock(&aspeed->lock); + trace_fsi_master_aspeed_cfam_reset(false); return count; } diff --git a/include/trace/events/fsi.h b/include/trace/events/fsi.h index 9832cb8e0eb0..c9a72e8432b8 100644 --- a/include/trace/events/fsi.h +++ b/include/trace/events/fsi.h @@ -122,6 +122,92 @@ TRACE_EVENT(fsi_master_break, ) ); +TRACE_EVENT(fsi_slave_init, + TP_PROTO(const struct fsi_slave *slave), + TP_ARGS(slave), + TP_STRUCT__entry( + __field(int, master_idx) + __field(int, master_n_links) + __field(int, idx) + __field(int, link) + __field(int, chip_id) + __field(__u32, cfam_id) + __field(__u32, size) + ), + TP_fast_assign( + __entry->master_idx = slave->master->idx; + __entry->master_n_links = slave->master->n_links; + __entry->idx = slave->cdev_idx; + __entry->link = slave->link; + __entry->chip_id = slave->chip_id; + __entry->cfam_id = slave->cfam_id; + __entry->size = slave->size; + ), + TP_printk("fsi%d: idx:%d link:%d/%d cid:%d cfam:%08x %08x", + __entry->master_idx, + __entry->idx, + __entry->link, + __entry->master_n_links, + __entry->chip_id, + __entry->cfam_id, + __entry->size + ) +); + +TRACE_EVENT(fsi_slave_invalid_cfam, + TP_PROTO(const struct fsi_master *master, int link, uint32_t cfam_id), + TP_ARGS(master, link, cfam_id), + TP_STRUCT__entry( + __field(int, master_idx) + __field(int, master_n_links) + __field(int, link) + __field(__u32, cfam_id) + ), + TP_fast_assign( + __entry->master_idx = master->idx; + __entry->master_n_links = master->n_links; + __entry->link = link; + __entry->cfam_id = cfam_id; + ), + TP_printk("fsi%d: cfam:%08x link:%d/%d", + __entry->master_idx, + __entry->cfam_id, + __entry->link, + __entry->master_n_links + ) +); + +TRACE_EVENT(fsi_dev_init, + TP_PROTO(const struct fsi_device *dev), + TP_ARGS(dev), + TP_STRUCT__entry( + __field(int, master_idx) + __field(int, link) + __field(int, type) + __field(int, unit) + __field(int, version) + __field(__u32, addr) + __field(__u32, size) + ), + TP_fast_assign( + __entry->master_idx = dev->slave->master->idx; + __entry->link = dev->slave->link; + __entry->type = dev->engine_type; + __entry->unit = dev->unit; + __entry->version = dev->version; + __entry->addr = dev->addr; + __entry->size = dev->size; + ), + TP_printk("fsi%d: slv%d: t:%02x u:%02x v:%02x %08x@%08x", + __entry->master_idx, + __entry->link, + __entry->type, + __entry->unit, + __entry->version, + __entry->size, + __entry->addr + ) +); #endif /* _TRACE_FSI_H */ diff --git a/include/trace/events/fsi_master_aspeed.h b/include/trace/events/fsi_master_aspeed.h index a355ceacc33f..0fff873775f1 100644 --- a/include/trace/events/fsi_master_aspeed.h +++ b/include/trace/events/fsi_master_aspeed.h @@ -72,6 +72,18 @@ TRACE_EVENT(fsi_master_aspeed_opb_error, ) ); +TRACE_EVENT(fsi_master_aspeed_cfam_reset, + TP_PROTO(bool start), + TP_ARGS(start), + TP_STRUCT__entry( + __field(bool, start) + ), + TP_fast_assign( + __entry->start = start; + ), + TP_printk("%s", __entry->start ? "start" : "end") +); + #endif #include -- cgit v1.2.3 From 0a01326fddf6fb9ae75440ab4912ecbd5e4583f3 Mon Sep 17 00:00:00 2001 From: Igor Pylypiv Date: Thu, 27 Jan 2022 15:39:53 -0800 Subject: Revert "module, async: async_synchronize_full() on module init iff async is used" [ Upstream commit 67d6212afda218d564890d1674bab28e8612170f ] This reverts commit 774a1221e862b343388347bac9b318767336b20b. We need to finish all async code before the module init sequence is done. In the reverted commit the PF_USED_ASYNC flag was added to mark a thread that called async_schedule(). Then the PF_USED_ASYNC flag was used to determine whether or not async_synchronize_full() needs to be invoked. This works when modprobe thread is calling async_schedule(), but it does not work if module dispatches init code to a worker thread which then calls async_schedule(). For example, PCI driver probing is invoked from a worker thread based on a node where device is attached: if (cpu < nr_cpu_ids) error = work_on_cpu(cpu, local_pci_probe, &ddi); else error = local_pci_probe(&ddi); We end up in a situation where a worker thread gets the PF_USED_ASYNC flag set instead of the modprobe thread. As a result, async_synchronize_full() is not invoked and modprobe completes without waiting for the async code to finish. The issue was discovered while loading the pm80xx driver: (scsi_mod.scan=async) modprobe pm80xx worker ... do_init_module() ... pci_call_probe() work_on_cpu(local_pci_probe) local_pci_probe() pm8001_pci_probe() scsi_scan_host() async_schedule() worker->flags |= PF_USED_ASYNC; ... < return from worker > ... if (current->flags & PF_USED_ASYNC) <--- false async_synchronize_full(); Commit 21c3c5d28007 ("block: don't request module during elevator init") fixed the deadlock issue which the reverted commit 774a1221e862 ("module, async: async_synchronize_full() on module init iff async is used") tried to fix. Since commit 0fdff3ec6d87 ("async, kmod: warn on synchronous request_module() from async workers") synchronous module loading from async is not allowed. Given that the original deadlock issue is fixed and it is no longer allowed to call synchronous request_module() from async we can remove PF_USED_ASYNC flag to make module init consistently invoke async_synchronize_full() unless async module probe is requested. Signed-off-by: Igor Pylypiv Reviewed-by: Changyuan Lyu Reviewed-by: Luis Chamberlain Acked-by: Tejun Heo Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- include/linux/sched.h | 1 - kernel/async.c | 3 --- kernel/module.c | 25 +++++-------------------- 3 files changed, 5 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index c1a927ddec64..76e869550646 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1675,7 +1675,6 @@ extern struct pid *cad_pid; #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ -#define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ diff --git a/kernel/async.c b/kernel/async.c index b8d7a663497f..b2c4ba5686ee 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work_node(node, system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 5c26a76e800b..83991c2d5af9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3683,12 +3683,6 @@ static noinline int do_init_module(struct module *mod) } freeinit->module_init = mod->init_layout.base; - /* - * We want to find out whether @mod uses async during init. Clear - * PF_USED_ASYNC. async_schedule*() will set it. - */ - current->flags &= ~PF_USED_ASYNC; - do_mod_ctors(mod); /* Start the module */ if (mod->init != NULL) @@ -3714,22 +3708,13 @@ static noinline int do_init_module(struct module *mod) /* * We need to finish all async code before the module init sequence - * is done. This has potential to deadlock. For example, a newly - * detected block device can trigger request_module() of the - * default iosched from async probing task. Once userland helper - * reaches here, async_synchronize_full() will wait on the async - * task waiting on request_module() and deadlock. - * - * This deadlock is avoided by perfomring async_synchronize_full() - * iff module init queued any async jobs. This isn't a full - * solution as it will deadlock the same if module loading from - * async jobs nests more than once; however, due to the various - * constraints, this hack seems to be the best option for now. - * Please refer to the following thread for details. + * is done. This has potential to deadlock if synchronous module + * loading is requested from async (which is not allowed!). * - * http://thread.gmane.org/gmane.linux.kernel/1420814 + * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous + * request_module() from async workers") for more details. */ - if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC)) + if (!mod->async_probe_requested) async_synchronize_full(); ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base + -- cgit v1.2.3 From 9ed25183e6d47a71f5385dbc52419c473646c768 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Feb 2022 09:32:17 -0800 Subject: ipv6: fix data-race in fib6_info_hw_flags_set / fib6_purge_rt commit d95d6320ba7a51d61c097ffc3bcafcf70283414e upstream. Because fib6_info_hw_flags_set() is called without any synchronization, all accesses to gi6->offload, fi->trap and fi->offload_failed need some basic protection like READ_ONCE()/WRITE_ONCE(). BUG: KCSAN: data-race in fib6_info_hw_flags_set / fib6_purge_rt read to 0xffff8881087d5886 of 1 bytes by task 13953 on cpu 0: fib6_drop_pcpu_from net/ipv6/ip6_fib.c:1007 [inline] fib6_purge_rt+0x4f/0x580 net/ipv6/ip6_fib.c:1033 fib6_del_route net/ipv6/ip6_fib.c:1983 [inline] fib6_del+0x696/0x890 net/ipv6/ip6_fib.c:2028 __ip6_del_rt net/ipv6/route.c:3876 [inline] ip6_del_rt+0x83/0x140 net/ipv6/route.c:3891 __ipv6_dev_ac_dec+0x2b5/0x370 net/ipv6/anycast.c:374 ipv6_dev_ac_dec net/ipv6/anycast.c:387 [inline] __ipv6_sock_ac_close+0x141/0x200 net/ipv6/anycast.c:207 ipv6_sock_ac_close+0x79/0x90 net/ipv6/anycast.c:220 inet6_release+0x32/0x50 net/ipv6/af_inet6.c:476 __sock_release net/socket.c:650 [inline] sock_close+0x6c/0x150 net/socket.c:1318 __fput+0x295/0x520 fs/file_table.c:280 ____fput+0x11/0x20 fs/file_table.c:313 task_work_run+0x8e/0x110 kernel/task_work.c:164 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:175 [inline] exit_to_user_mode_prepare+0x160/0x190 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881087d5886 of 1 bytes by task 1912 on cpu 1: fib6_info_hw_flags_set+0x155/0x3b0 net/ipv6/route.c:6230 nsim_fib6_rt_hw_flags_set drivers/net/netdevsim/fib.c:668 [inline] nsim_fib6_rt_add drivers/net/netdevsim/fib.c:691 [inline] nsim_fib6_rt_insert drivers/net/netdevsim/fib.c:756 [inline] nsim_fib6_event drivers/net/netdevsim/fib.c:853 [inline] nsim_fib_event drivers/net/netdevsim/fib.c:886 [inline] nsim_fib_event_work+0x284f/0x2cf0 drivers/net/netdevsim/fib.c:1477 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x22 -> 0x2a Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 1912 Comm: kworker/1:3 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events nsim_fib_event_work Fixes: 0c5fcf9e249e ("IPv6: Add "offload failed" indication to routes") Fixes: bb3c4ab93e44 ("ipv6: Add "offload" and "trap" indications to routes") Signed-off-by: Eric Dumazet Cc: Amit Cohen Cc: Ido Schimmel Reported-by: syzbot Link: https://lore.kernel.org/r/20220216173217.3792411-2-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/netdevsim/fib.c | 4 ++-- include/net/ip6_fib.h | 10 ++++++---- net/ipv6/route.c | 19 ++++++++++--------- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index 4300261e2f9e..378ee779061c 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -623,14 +623,14 @@ static int nsim_fib6_rt_append(struct nsim_fib_data *data, if (err) goto err_fib6_rt_nh_del; - fib6_event->rt_arr[i]->trap = true; + WRITE_ONCE(fib6_event->rt_arr[i]->trap, true); } return 0; err_fib6_rt_nh_del: for (i--; i >= 0; i--) { - fib6_event->rt_arr[i]->trap = false; + WRITE_ONCE(fib6_event->rt_arr[i]->trap, false); nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]); } return err; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c85b040728d7..bbb27639f293 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -189,14 +189,16 @@ struct fib6_info { u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; + + u8 offload; + u8 trap; + u8 offload_failed; + u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, fib6_destroying:1, - offload:1, - trap:1, - offload_failed:1, - unused:1; + unused:4; struct rcu_head rcu; struct nexthop *nh; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3c5bb4969220..e0766bdf20e7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5767,11 +5767,11 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, } if (!dst) { - if (rt->offload) + if (READ_ONCE(rt->offload)) rtm->rtm_flags |= RTM_F_OFFLOAD; - if (rt->trap) + if (READ_ONCE(rt->trap)) rtm->rtm_flags |= RTM_F_TRAP; - if (rt->offload_failed) + if (READ_ONCE(rt->offload_failed)) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; } @@ -6229,19 +6229,20 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, struct sk_buff *skb; int err; - if (f6i->offload == offload && f6i->trap == trap && - f6i->offload_failed == offload_failed) + if (READ_ONCE(f6i->offload) == offload && + READ_ONCE(f6i->trap) == trap && + READ_ONCE(f6i->offload_failed) == offload_failed) return; - f6i->offload = offload; - f6i->trap = trap; + WRITE_ONCE(f6i->offload, offload); + WRITE_ONCE(f6i->trap, trap); /* 2 means send notifications only if offload_failed was changed. */ if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && - f6i->offload_failed == offload_failed) + READ_ONCE(f6i->offload_failed) == offload_failed) return; - f6i->offload_failed = offload_failed; + WRITE_ONCE(f6i->offload_failed, offload_failed); if (!rcu_access_pointer(f6i->fib6_node)) /* The route was removed from the tree, do not send -- cgit v1.2.3 From 3e11ef1903cf6c2fba35594b193a3570854d9e9e Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Fri, 11 Feb 2022 17:30:42 +0000 Subject: ipv6: mcast: use rcu-safe version of ipv6_get_lladdr() commit 26394fc118d6115390bd5b3a0fb17096271da227 upstream. Some time ago 8965779d2c0e ("ipv6,mcast: always hold idev->lock before mca_lock") switched ipv6_get_lladdr() to __ipv6_get_lladdr(), which is rcu-unsafe version. That was OK, because idev->lock was held for these codepaths. In 88e2ca308094 ("mld: convert ifmcaddr6 to RCU") these external locks were removed, so we probably need to restore the original rcu-safe call. Otherwise, we occasionally get a machine crashed/stalled with the following in dmesg: [ 3405.966610][T230589] general protection fault, probably for non-canonical address 0xdead00000000008c: 0000 [#1] SMP NOPTI [ 3405.982083][T230589] CPU: 44 PID: 230589 Comm: kworker/44:3 Tainted: G O 5.15.19-cloudflare-2022.2.1 #1 [ 3405.998061][T230589] Hardware name: SUPA-COOL-SERV [ 3406.009552][T230589] Workqueue: mld mld_ifc_work [ 3406.017224][T230589] RIP: 0010:__ipv6_get_lladdr+0x34/0x60 [ 3406.025780][T230589] Code: 57 10 48 83 c7 08 48 89 e5 48 39 d7 74 3e 48 8d 82 38 ff ff ff eb 13 48 8b 90 d0 00 00 00 48 8d 82 38 ff ff ff 48 39 d7 74 22 <66> 83 78 32 20 77 1b 75 e4 89 ca 23 50 2c 75 dd 48 8b 50 08 48 8b [ 3406.055748][T230589] RSP: 0018:ffff94e4b3fc3d10 EFLAGS: 00010202 [ 3406.065617][T230589] RAX: dead00000000005a RBX: ffff94e4b3fc3d30 RCX: 0000000000000040 [ 3406.077477][T230589] RDX: dead000000000122 RSI: ffff94e4b3fc3d30 RDI: ffff8c3a31431008 [ 3406.089389][T230589] RBP: ffff94e4b3fc3d10 R08: 0000000000000000 R09: 0000000000000000 [ 3406.101445][T230589] R10: ffff8c3a31430000 R11: 000000000000000b R12: ffff8c2c37887100 [ 3406.113553][T230589] R13: ffff8c3a39537000 R14: 00000000000005dc R15: ffff8c3a31431000 [ 3406.125730][T230589] FS: 0000000000000000(0000) GS:ffff8c3b9fc80000(0000) knlGS:0000000000000000 [ 3406.138992][T230589] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3406.149895][T230589] CR2: 00007f0dfea1db60 CR3: 000000387b5f2000 CR4: 0000000000350ee0 [ 3406.162421][T230589] Call Trace: [ 3406.170235][T230589] [ 3406.177736][T230589] mld_newpack+0xfe/0x1a0 [ 3406.186686][T230589] add_grhead+0x87/0xa0 [ 3406.195498][T230589] add_grec+0x485/0x4e0 [ 3406.204310][T230589] ? newidle_balance+0x126/0x3f0 [ 3406.214024][T230589] mld_ifc_work+0x15d/0x450 [ 3406.223279][T230589] process_one_work+0x1e6/0x380 [ 3406.232982][T230589] worker_thread+0x50/0x3a0 [ 3406.242371][T230589] ? rescuer_thread+0x360/0x360 [ 3406.252175][T230589] kthread+0x127/0x150 [ 3406.261197][T230589] ? set_kthread_struct+0x40/0x40 [ 3406.271287][T230589] ret_from_fork+0x22/0x30 [ 3406.280812][T230589] [ 3406.288937][T230589] Modules linked in: ... [last unloaded: kheaders] [ 3406.476714][T230589] ---[ end trace 3525a7655f2f3b9e ]--- Fixes: 88e2ca308094 ("mld: convert ifmcaddr6 to RCU") Reported-by: David Pinilla Caparros Signed-off-by: Ignat Korchagin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/addrconf.h | 2 -- net/ipv6/addrconf.c | 4 ++-- net/ipv6/mcast.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index e7ce719838b5..59940e230b78 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -109,8 +109,6 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); -int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, - u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index bf1386542634..c6e1989ab2ed 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1837,8 +1837,8 @@ out: } EXPORT_SYMBOL(ipv6_dev_get_saddr); -int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, - u32 banned_flags) +static int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, + u32 banned_flags) { struct inet6_ifaddr *ifp; int err = -EADDRNOTAVAIL; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index bed8155508c8..a8861db52c18 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1759,7 +1759,7 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); - if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* : * use unspecified address as the source address * when a valid link-local address is not available. -- cgit v1.2.3 From d51cd648dbcab2b23cb22483d3511cfe85fdb100 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 15 Feb 2022 11:00:37 -0500 Subject: ipv6: per-netns exclusive flowlabel checks commit 0b0dff5b3b98c5c7ce848151df9da0b3cdf0cc8b upstream. Ipv6 flowlabels historically require a reservation before use. Optionally in exclusive mode (e.g., user-private). Commit 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist") introduced a fastpath that avoids this check when no exclusive leases exist in the system, and thus any flowlabel use will be granted. That allows skipping the control operation to reserve a flowlabel entirely. Though with a warning if the fast path fails: This is an optimization. Robust applications still have to revert to requesting leases if the fast path fails due to an exclusive lease. Still, this is subtle. Better isolate network namespaces from each other. Flowlabels are per-netns. Also record per-netns whether exclusive leases are in use. Then behavior does not change based on activity in other netns. Changes v2 - wrap in IS_ENABLED(CONFIG_IPV6) to avoid breakage if disabled Fixes: 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist") Link: https://lore.kernel.org/netdev/MWHPR2201MB1072BCCCFCE779E4094837ACD0329@MWHPR2201MB1072.namprd22.prod.outlook.com/ Reported-by: Congyu Liu Signed-off-by: Willem de Bruijn Tested-by: Congyu Liu Link: https://lore.kernel.org/r/20220215160037.1976072-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/net/ipv6.h | 5 ++++- include/net/netns/ipv6.h | 3 ++- net/ipv6/ip6_flowlabel.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f2d0ecc257bb..359540dfc033 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -391,17 +391,20 @@ static inline void txopt_put(struct ipv6_txoptions *opt) kfree_rcu(opt, rcu); } +#if IS_ENABLED(CONFIG_IPV6) struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { - if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key)) + if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key) && + READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl)) return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); return NULL; } +#endif struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index a4b550380316..6bd7e5a85ce7 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -77,9 +77,10 @@ struct netns_ipv6 { spinlock_t fib6_gc_lock; unsigned int ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; + unsigned char flowlabel_has_excl; #ifdef CONFIG_IPV6_MULTIPLE_TABLES - unsigned int fib6_rules_require_fldissect; bool fib6_has_custom_rules; + unsigned int fib6_rules_require_fldissect; #ifdef CONFIG_IPV6_SUBTREES unsigned int fib6_routes_require_src; #endif diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index aa673a6a7e43..ceb85c67ce39 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -450,8 +450,10 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, err = -EINVAL; goto done; } - if (fl_shared_exclusive(fl) || fl->opt) + if (fl_shared_exclusive(fl) || fl->opt) { + WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1); static_branch_deferred_inc(&ipv6_flowlabel_exclusive); + } return fl; done: -- cgit v1.2.3 From 2566a89b9e163b2fcd104d6005e0149f197b8a48 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Feb 2022 19:45:06 +0200 Subject: net: dsa: mv88e6xxx: flush switchdev FDB workqueue before removing VLAN commit a2614140dc0f467a83aa3bb4b6ee2d6480a76202 upstream. mv88e6xxx is special among DSA drivers in that it requires the VTU to contain the VID of the FDB entry it modifies in mv88e6xxx_port_db_load_purge(), otherwise it will return -EOPNOTSUPP. Sometimes due to races this is not always satisfied even if external code does everything right (first deletes the FDB entries, then the VLAN), because DSA commits to hardware FDB entries asynchronously since commit c9eb3e0f8701 ("net: dsa: Add support for learning FDB through notification"). Therefore, the mv88e6xxx driver must close this race condition by itself, by asking DSA to flush the switchdev workqueue of any FDB deletions in progress, prior to exiting a VLAN. Fixes: c9eb3e0f8701 ("net: dsa: Add support for learning FDB through notification") Reported-by: Rafael Richter Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/mv88e6xxx/chip.c | 7 +++++++ include/net/dsa.h | 1 + net/dsa/dsa.c | 1 + net/dsa/dsa_priv.h | 1 - 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 056e3b65cd27..263da7e2d6be 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2291,6 +2291,13 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, if (!mv88e6xxx_max_vid(chip)) return -EOPNOTSUPP; + /* The ATU removal procedure needs the FID to be mapped in the VTU, + * but FDB deletion runs concurrently with VLAN deletion. Flush the DSA + * switchdev workqueue to ensure that all FDB entries are deleted + * before we remove the VLAN. + */ + dsa_flush_workqueue(); + mv88e6xxx_reg_lock(chip); err = mv88e6xxx_port_get_pvid(chip, port, &pvid); diff --git a/include/net/dsa.h b/include/net/dsa.h index d784e76113b8..49e5ece9361c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1056,6 +1056,7 @@ void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); +void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 41f36ad8b0ec..4ff03fb262e0 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -349,6 +349,7 @@ void dsa_flush_workqueue(void) { flush_workqueue(dsa_owq); } +EXPORT_SYMBOL_GPL(dsa_flush_workqueue); int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index a5c9bc7b66c6..33ab7d7af9eb 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -170,7 +170,6 @@ void dsa_tag_driver_put(const struct dsa_device_ops *ops); const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); bool dsa_schedule_work(struct work_struct *work); -void dsa_flush_workqueue(void); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) -- cgit v1.2.3 From 9bb363ba014ff7d2823baf2a0cf7a91e39fb63f2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Feb 2022 12:06:23 -0800 Subject: net_sched: add __rcu annotation to netdev->qdisc commit 5891cd5ec46c2c2eb6427cb54d214b149635dd0e upstream. syzbot found a data-race [1] which lead me to add __rcu annotations to netdev->qdisc, and proper accessors to get LOCKDEP support. [1] BUG: KCSAN: data-race in dev_activate / qdisc_lookup_rcu write to 0xffff888168ad6410 of 8 bytes by task 13559 on cpu 1: attach_default_qdiscs net/sched/sch_generic.c:1167 [inline] dev_activate+0x2ed/0x8f0 net/sched/sch_generic.c:1221 __dev_open+0x2e9/0x3a0 net/core/dev.c:1416 __dev_change_flags+0x167/0x3f0 net/core/dev.c:8139 rtnl_configure_link+0xc2/0x150 net/core/rtnetlink.c:3150 __rtnl_newlink net/core/rtnetlink.c:3489 [inline] rtnl_newlink+0xf4d/0x13e0 net/core/rtnetlink.c:3529 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5594 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888168ad6410 of 8 bytes by task 13560 on cpu 0: qdisc_lookup_rcu+0x30/0x2e0 net/sched/sch_api.c:323 __tcf_qdisc_find+0x74/0x3a0 net/sched/cls_api.c:1050 tc_del_tfilter+0x1c7/0x1350 net/sched/cls_api.c:2211 rtnetlink_rcv_msg+0x5ba/0x7e0 net/core/rtnetlink.c:5585 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xffffffff85dee080 -> 0xffff88815d96ec00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 13560 Comm: syz-executor.2 Not tainted 5.17.0-rc3-syzkaller-00116-gf1baf68e1383-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 470502de5bdb ("net: sched: unlock rules update API") Signed-off-by: Eric Dumazet Cc: Vlad Buslov Reported-by: syzbot Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/netdevice.h | 2 +- net/core/rtnetlink.c | 6 ++++-- net/sched/cls_api.c | 6 +++--- net/sched/sch_api.c | 22 ++++++++++++---------- net/sched/sch_generic.c | 29 ++++++++++++++++------------- 5 files changed, 36 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fba54624191a..62ff09467776 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2149,7 +2149,7 @@ struct net_device { struct netdev_queue *_tx ____cacheline_aligned_in_smp; unsigned int num_tx_queues; unsigned int real_num_tx_queues; - struct Qdisc *qdisc; + struct Qdisc __rcu *qdisc; unsigned int tx_queue_len; spinlock_t tx_global_lock; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 198cc8b74dc3..91d7a5a5a08d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1698,6 +1698,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; + struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1715,6 +1716,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; + qdisc = rtnl_dereference(dev->qdisc); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, @@ -1733,8 +1735,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (dev->qdisc && - nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || + (qdisc && + nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 56dba8519d7c..cd44cac7fbcf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1044,7 +1044,7 @@ static int __tcf_qdisc_find(struct net *net, struct Qdisc **q, /* Find qdisc */ if (!*parent) { - *q = dev->qdisc; + *q = rcu_dereference(dev->qdisc); *parent = (*q)->handle; } else { *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); @@ -2587,7 +2587,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) parent = tcm->tcm_parent; if (!parent) - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) @@ -2962,7 +2962,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; if (!tcm->tcm_parent) - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 8e629c356e69..0fb387c9d706 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -301,7 +301,7 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); if (q) goto out; @@ -320,7 +320,7 @@ struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); if (q) goto out; @@ -1082,10 +1082,10 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, skip: if (!ingress) { notify_and_destroy(net, skb, n, classid, - dev->qdisc, new); + rtnl_dereference(dev->qdisc), new); if (new && !new->ops->attach) qdisc_refcount_inc(new); - dev->qdisc = new ? : &noop_qdisc; + rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); if (new && new->ops->attach) new->ops->attach(new); @@ -1460,7 +1460,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } if (!q) { NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); @@ -1549,7 +1549,7 @@ replay: q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } /* It may be default qdisc, ignore it */ @@ -1771,7 +1771,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; - if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, + if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), + skb, cb, &q_idx, s_q_idx, true, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; @@ -2042,7 +2043,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, } else if (qid1) { qid = qid1; } else if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; /* Now qid is genuine qdisc handle consistent * both with parent and child. @@ -2053,7 +2054,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; } /* OK. Locate qdisc */ @@ -2214,7 +2215,8 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; t = 0; - if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0) + if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc), + skb, tcm, cb, &t, s_t, true) < 0) goto done; dev_queue = dev_ingress_queue(dev); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 47ca76ba7ffa..30c29a9a2efd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1114,30 +1114,33 @@ static void attach_default_qdiscs(struct net_device *dev) if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - dev->qdisc = txq->qdisc_sleeping; - qdisc_refcount_inc(dev->qdisc); + qdisc = txq->qdisc_sleeping; + rcu_assign_pointer(dev->qdisc, qdisc); + qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { - dev->qdisc = qdisc; + rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } + qdisc = rtnl_dereference(dev->qdisc); /* Detect default qdisc setup/init failed and fallback to "noqueue" */ - if (dev->qdisc == &noop_qdisc) { + if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - dev->qdisc = txq->qdisc_sleeping; - qdisc_refcount_inc(dev->qdisc); + qdisc = txq->qdisc_sleeping; + rcu_assign_pointer(dev->qdisc, qdisc); + qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED - if (dev->qdisc != &noop_qdisc) - qdisc_hash_add(dev->qdisc, false); + if (qdisc != &noop_qdisc) + qdisc_hash_add(qdisc, false); #endif } @@ -1167,7 +1170,7 @@ void dev_activate(struct net_device *dev) * and noqueue_qdisc for virtual interfaces */ - if (dev->qdisc == &noop_qdisc) + if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) @@ -1333,7 +1336,7 @@ static int qdisc_change_tx_queue_len(struct net_device *dev, void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { - struct Qdisc *qdisc = dev->qdisc; + struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); @@ -1373,7 +1376,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, void dev_init_scheduler(struct net_device *dev) { - dev->qdisc = &noop_qdisc; + rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); @@ -1401,8 +1404,8 @@ void dev_shutdown(struct net_device *dev) netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - qdisc_put(dev->qdisc); - dev->qdisc = &noop_qdisc; + qdisc_put(rtnl_dereference(dev->qdisc)); + rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } -- cgit v1.2.3 From 4c9f207810b7507bc582ef3b2b333ef55993dbe6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Feb 2022 11:15:53 -0800 Subject: bonding: fix data-races around agg_select_timer commit 9ceaf6f76b203682bb6100e14b3d7da4c0bedde8 upstream. syzbot reported that two threads might write over agg_select_timer at the same time. Make agg_select_timer atomic to fix the races. BUG: KCSAN: data-race in bond_3ad_initiate_agg_selection / bond_3ad_state_machine_handler read to 0xffff8881242aea90 of 4 bytes by task 1846 on cpu 1: bond_3ad_state_machine_handler+0x99/0x2810 drivers/net/bonding/bond_3ad.c:2317 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 write to 0xffff8881242aea90 of 4 bytes by task 25910 on cpu 0: bond_3ad_initiate_agg_selection+0x18/0x30 drivers/net/bonding/bond_3ad.c:1998 bond_open+0x658/0x6f0 drivers/net/bonding/bond_main.c:3967 __dev_open+0x274/0x3a0 net/core/dev.c:1407 dev_open+0x54/0x190 net/core/dev.c:1443 bond_enslave+0xcef/0x3000 drivers/net/bonding/bond_main.c:1937 do_set_master net/core/rtnetlink.c:2532 [inline] do_setlink+0x94f/0x2500 net/core/rtnetlink.c:2736 __rtnl_newlink net/core/rtnetlink.c:3414 [inline] rtnl_newlink+0xfeb/0x13e0 net/core/rtnetlink.c:3529 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5594 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000050 -> 0x0000004f Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 25910 Comm: syz-executor.1 Tainted: G W 5.17.0-rc4-syzkaller-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jay Vosburgh Cc: Veaceslav Falico Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/bonding/bond_3ad.c | 30 +++++++++++++++++++++++++----- include/net/bond_3ad.h | 2 +- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 9fd1d6cba3cd..a86b1f71762e 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -225,7 +225,7 @@ static inline int __check_agg_selection_timer(struct port *port) if (bond == NULL) return 0; - return BOND_AD_INFO(bond).agg_select_timer ? 1 : 0; + return atomic_read(&BOND_AD_INFO(bond).agg_select_timer) ? 1 : 0; } /** @@ -1995,7 +1995,7 @@ static void ad_marker_response_received(struct bond_marker *marker, */ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) { - BOND_AD_INFO(bond).agg_select_timer = timeout; + atomic_set(&BOND_AD_INFO(bond).agg_select_timer, timeout); } /** @@ -2278,6 +2278,28 @@ void bond_3ad_update_ad_actor_settings(struct bonding *bond) spin_unlock_bh(&bond->mode_lock); } +/** + * bond_agg_timer_advance - advance agg_select_timer + * @bond: bonding structure + * + * Return true when agg_select_timer reaches 0. + */ +static bool bond_agg_timer_advance(struct bonding *bond) +{ + int val, nval; + + while (1) { + val = atomic_read(&BOND_AD_INFO(bond).agg_select_timer); + if (!val) + return false; + nval = val - 1; + if (atomic_cmpxchg(&BOND_AD_INFO(bond).agg_select_timer, + val, nval) == val) + break; + } + return nval == 0; +} + /** * bond_3ad_state_machine_handler - handle state machines timeout * @work: work context to fetch bonding struct to work on from @@ -2313,9 +2335,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) if (!bond_has_slaves(bond)) goto re_arm; - /* check if agg_select_timer timer after initialize is timed out */ - if (BOND_AD_INFO(bond).agg_select_timer && - !(--BOND_AD_INFO(bond).agg_select_timer)) { + if (bond_agg_timer_advance(bond)) { slave = bond_first_slave_rcu(bond); port = slave ? &(SLAVE_AD_INFO(slave)->port) : NULL; diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index 38785d48baff..184105d68294 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -262,7 +262,7 @@ struct ad_system { struct ad_bond_info { struct ad_system system; /* 802.3ad system structure */ struct bond_3ad_stats stats; - u32 agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ + atomic_t agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ u16 aggregator_identifier; }; -- cgit v1.2.3 From e65450a12cf4f3773b2607e305807f917e0bf239 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Feb 2022 08:52:31 +0100 Subject: block: fix surprise removal for drivers calling blk_set_queue_dying MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7a5428dcb7902700b830e912feee4e845df7c019 upstream. Various block drivers call blk_set_queue_dying to mark a disk as dead due to surprise removal events, but since commit 8e141f9eb803 that doesn't work given that the GD_DEAD flag needs to be set to stop I/O. Replace the driver calls to blk_set_queue_dying with a new (and properly documented) blk_mark_disk_dead API, and fold blk_set_queue_dying into the only remaining caller. Fixes: 8e141f9eb803 ("block: drain file system I/O on del_gendisk") Reported-by: Markus Blöchl Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20220217075231.1140-1-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-core.c | 10 ++-------- block/genhd.c | 14 ++++++++++++++ drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/rbd.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/md/dm.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 2 +- include/linux/blkdev.h | 3 ++- 9 files changed, 24 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index d42a0f3ff736..42ac3a985c2d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -350,13 +350,6 @@ void blk_queue_start_drain(struct request_queue *q) wake_up_all(&q->mq_freeze_wq); } -void blk_set_queue_dying(struct request_queue *q) -{ - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - blk_queue_start_drain(q); -} -EXPORT_SYMBOL_GPL(blk_set_queue_dying); - /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -374,7 +367,8 @@ void blk_cleanup_queue(struct request_queue *q) WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ - blk_set_queue_dying(q); + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + blk_queue_start_drain(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); diff --git a/block/genhd.c b/block/genhd.c index de789d1a1e3d..2dcedbe4ef04 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -544,6 +544,20 @@ out_free_ext_minor: } EXPORT_SYMBOL(device_add_disk); +/** + * blk_mark_disk_dead - mark a disk as dead + * @disk: disk to mark as dead + * + * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O + * to this disk. + */ +void blk_mark_disk_dead(struct gendisk *disk) +{ + set_bit(GD_DEAD, &disk->state); + blk_queue_start_drain(disk->queue); +} +EXPORT_SYMBOL_GPL(blk_mark_disk_dead); + /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 901855717cb5..ba61e72741ea 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4112,7 +4112,7 @@ static void mtip_pci_remove(struct pci_dev *pdev) "Completion workers still active!\n"); } - blk_set_queue_dying(dd->queue); + blk_mark_disk_dead(dd->disk); set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); /* Clean up the block layer. */ diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e65c9d706f6f..c4a52f33604d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7182,7 +7182,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus, * IO to complete/fail. */ blk_mq_freeze_queue(rbd_dev->disk->queue); - blk_set_queue_dying(rbd_dev->disk->queue); + blk_mark_disk_dead(rbd_dev->disk); } del_gendisk(rbd_dev->disk); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 4dbb71230d6e..3efd34195983 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2128,7 +2128,7 @@ static void blkfront_closing(struct blkfront_info *info) /* No more blkif_request(). */ blk_mq_stop_hw_queues(info->rq); - blk_set_queue_dying(info->rq); + blk_mark_disk_dead(info->gd); set_capacity(info->gd, 0); for_each_rinfo(info, rinfo, i) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b75ff6b2b952..5f33700d1247 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2156,7 +2156,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); - blk_set_queue_dying(md->queue); + blk_mark_disk_dead(md->disk); /* * Take suspend_lock so that presuspend and postsuspend methods diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8aa92ebb8b7c..a480e1af48e8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -131,7 +131,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; - blk_set_queue_dying(ns->queue); + blk_mark_disk_dead(ns->disk); blk_mq_unquiesce_queue(ns->queue); set_capacity_and_notify(ns->disk, 0); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 2f76969408b2..727520c39710 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -792,7 +792,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - blk_set_queue_dying(head->disk->queue); + blk_mark_disk_dead(head->disk); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); flush_work(&head->requeue_work); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index be8e7a55d803..413c0148c0ce 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1184,7 +1184,8 @@ extern void blk_dump_rq_flags(struct request *, char *); bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); -extern void blk_set_queue_dying(struct request_queue *); + +void blk_mark_disk_dead(struct gendisk *disk); #ifdef CONFIG_BLOCK /* -- cgit v1.2.3 From f7b95b396700a43a44a7e541c1cbf14031b42604 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 2 Feb 2022 12:55:53 -0800 Subject: x86/bug: Merge annotate_reachable() into _BUG_FLAGS() asm [ Upstream commit bfb1a7c91fb7758273b4a8d735313d9cc388b502 ] In __WARN_FLAGS(), we had two asm statements (abbreviated): asm volatile("ud2"); asm volatile(".pushsection .discard.reachable"); These pair of statements are used to trigger an exception, but then help objtool understand that for warnings, control flow will be restored immediately afterwards. The problem is that volatile is not a compiler barrier. GCC explicitly documents this: > Note that the compiler can move even volatile asm instructions > relative to other code, including across jump instructions. Also, no clobbers are specified to prevent instructions from subsequent statements from being scheduled by compiler before the second asm statement. This can lead to instructions from subsequent statements being emitted by the compiler before the second asm statement. Providing a scheduling model such as via -march= options enables the compiler to better schedule instructions with known latencies to hide latencies from data hazards compared to inline asm statements in which latencies are not estimated. If an instruction gets scheduled by the compiler between the two asm statements, then objtool will think that it is not reachable, producing a warning. To prevent instructions from being scheduled in between the two asm statements, merge them. Also remove an unnecessary unreachable() asm annotation from BUG() in favor of __builtin_unreachable(). objtool is able to track that the ud2 from BUG() terminates control flow within the function. Link: https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Volatile Link: https://github.com/ClangBuiltLinux/linux/issues/1483 Signed-off-by: Nick Desaulniers Signed-off-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20220202205557.2260694-1-ndesaulniers@google.com Signed-off-by: Sasha Levin --- arch/x86/include/asm/bug.h | 20 +++++++++++--------- include/linux/compiler.h | 21 +++++---------------- 2 files changed, 16 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 84b87538a15d..bab883c0b6fe 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -22,7 +22,7 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE -#define _BUG_FLAGS(ins, flags) \ +#define _BUG_FLAGS(ins, flags, extra) \ do { \ asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ @@ -31,7 +31,8 @@ do { \ "\t.word %c1" "\t# bug_entry::line\n" \ "\t.word %c2" "\t# bug_entry::flags\n" \ "\t.org 2b+%c3\n" \ - ".popsection" \ + ".popsection\n" \ + extra \ : : "i" (__FILE__), "i" (__LINE__), \ "i" (flags), \ "i" (sizeof(struct bug_entry))); \ @@ -39,14 +40,15 @@ do { \ #else /* !CONFIG_DEBUG_BUGVERBOSE */ -#define _BUG_FLAGS(ins, flags) \ +#define _BUG_FLAGS(ins, flags, extra) \ do { \ asm_inline volatile("1:\t" ins "\n" \ ".pushsection __bug_table,\"aw\"\n" \ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ "\t.word %c0" "\t# bug_entry::flags\n" \ "\t.org 2b+%c1\n" \ - ".popsection" \ + ".popsection\n" \ + extra \ : : "i" (flags), \ "i" (sizeof(struct bug_entry))); \ } while (0) @@ -55,7 +57,7 @@ do { \ #else -#define _BUG_FLAGS(ins, flags) asm volatile(ins) +#define _BUG_FLAGS(ins, flags, extra) asm volatile(ins) #endif /* CONFIG_GENERIC_BUG */ @@ -63,8 +65,8 @@ do { \ #define BUG() \ do { \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, 0); \ - unreachable(); \ + _BUG_FLAGS(ASM_UD2, 0, ""); \ + __builtin_unreachable(); \ } while (0) /* @@ -75,9 +77,9 @@ do { \ */ #define __WARN_FLAGS(flags) \ do { \ + __auto_type f = BUGFLAG_WARNING|(flags); \ instrumentation_begin(); \ - _BUG_FLAGS(ASM_UD2, BUGFLAG_WARNING|(flags)); \ - annotate_reachable(); \ + _BUG_FLAGS(ASM_UD2, f, ASM_REACHABLE); \ instrumentation_end(); \ } while (0) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 429dcebe2b99..0f7fd205ab7e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -117,14 +117,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, */ #define __stringify_label(n) #n -#define __annotate_reachable(c) ({ \ - asm volatile(__stringify_label(c) ":\n\t" \ - ".pushsection .discard.reachable\n\t" \ - ".long " __stringify_label(c) "b - .\n\t" \ - ".popsection\n\t" : : "i" (c)); \ -}) -#define annotate_reachable() __annotate_reachable(__COUNTER__) - #define __annotate_unreachable(c) ({ \ asm volatile(__stringify_label(c) ":\n\t" \ ".pushsection .discard.unreachable\n\t" \ @@ -133,24 +125,21 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, }) #define annotate_unreachable() __annotate_unreachable(__COUNTER__) -#define ASM_UNREACHABLE \ - "999:\n\t" \ - ".pushsection .discard.unreachable\n\t" \ - ".long 999b - .\n\t" \ +#define ASM_REACHABLE \ + "998:\n\t" \ + ".pushsection .discard.reachable\n\t" \ + ".long 998b - .\n\t" \ ".popsection\n\t" /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(".rodata..c_jump_table") #else -#define annotate_reachable() #define annotate_unreachable() +# define ASM_REACHABLE #define __annotate_jump_table #endif -#ifndef ASM_UNREACHABLE -# define ASM_UNREACHABLE -#endif #ifndef unreachable # define unreachable() do { \ annotate_unreachable(); \ -- cgit v1.2.3 From 6c5d780469d6c3590729940e2be8a3bd66ea4814 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 17 Feb 2022 23:41:20 +0100 Subject: netfilter: nf_tables_offload: incorrect flow offload action array size commit b1a5983f56e371046dcf164f90bfaf704d2b89f6 upstream. immediate verdict expression needs to allocate one slot in the flow offload action array, however, immediate data expression does not need to do so. fwd and dup expression need to allocate one slot, this is missing. Add a new offload_action interface to report if this expression needs to allocate one slot in the flow offload action array. Fixes: be2861dc36d7 ("netfilter: nft_{fwd,dup}_netdev: add offload support") Reported-and-tested-by: Nick Gregory Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- include/net/netfilter/nf_tables.h | 2 +- include/net/netfilter/nf_tables_offload.h | 2 -- net/netfilter/nf_tables_offload.c | 3 ++- net/netfilter/nft_dup_netdev.c | 6 ++++++ net/netfilter/nft_fwd_netdev.c | 6 ++++++ net/netfilter/nft_immediate.c | 12 +++++++++++- 6 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index a16171c5fd9e..d52a5d776e76 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -883,9 +883,9 @@ struct nft_expr_ops { int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); + bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); - u32 offload_flags; const struct nft_expr_type *type; void *data; }; diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index f9d95ff82df8..797147843958 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -67,8 +67,6 @@ struct nft_flow_rule { struct flow_rule *rule; }; -#define NFT_OFFLOAD_F_ACTION (1 << 0) - void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 9656c1646222..2d36952b1392 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -94,7 +94,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { - if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) + if (expr->ops->offload_action && + expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index bbf3fcba3df4..5b5c607fbf83 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -67,6 +67,11 @@ static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif); } +static bool nft_dup_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + static struct nft_expr_type nft_dup_netdev_type; static const struct nft_expr_ops nft_dup_netdev_ops = { .type = &nft_dup_netdev_type, @@ -75,6 +80,7 @@ static const struct nft_expr_ops nft_dup_netdev_ops = { .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, .offload = nft_dup_netdev_offload, + .offload_action = nft_dup_netdev_offload_action, }; static struct nft_expr_type nft_dup_netdev_type __read_mostly = { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index cd59afde5b2f..7730409f6f09 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -77,6 +77,11 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif); } +static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + struct nft_fwd_neigh { u8 sreg_dev; u8 sreg_addr; @@ -219,6 +224,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .dump = nft_fwd_netdev_dump, .validate = nft_fwd_validate, .offload = nft_fwd_netdev_offload, + .offload_action = nft_fwd_netdev_offload_action, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 90c64d27ae53..d0f67d325bdf 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -213,6 +213,16 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_immediate_offload_action(const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + return true; + + return false; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -224,7 +234,7 @@ static const struct nft_expr_ops nft_imm_ops = { .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, - .offload_flags = NFT_OFFLOAD_F_ACTION, + .offload_action = nft_immediate_offload_action, }; struct nft_expr_type nft_imm_type __read_mostly = { -- cgit v1.2.3 From 0efdc0360395ece0779e2f8475e64427b02d832d Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Mon, 4 Oct 2021 16:11:52 +0200 Subject: tee: export teedev_open() and teedev_close_context() commit 1e2c3ef0496e72ba9001da5fd1b7ed56ccb30597 upstream. Exports the two functions teedev_open() and teedev_close_context() in order to make it easier to create a driver internal struct tee_context. Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander Signed-off-by: Greg Kroah-Hartman --- drivers/tee/tee_core.c | 6 ++++-- include/linux/tee_drv.h | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 85102d12d716..3fc426dad2df 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(driver_lock); static struct class *tee_class; static dev_t tee_devt; -static struct tee_context *teedev_open(struct tee_device *teedev) +struct tee_context *teedev_open(struct tee_device *teedev) { int rc; struct tee_context *ctx; @@ -70,6 +70,7 @@ err: return ERR_PTR(rc); } +EXPORT_SYMBOL_GPL(teedev_open); void teedev_ctx_get(struct tee_context *ctx) { @@ -96,13 +97,14 @@ void teedev_ctx_put(struct tee_context *ctx) kref_put(&ctx->refcount, teedev_ctx_release); } -static void teedev_close_context(struct tee_context *ctx) +void teedev_close_context(struct tee_context *ctx) { struct tee_device *teedev = ctx->teedev; teedev_ctx_put(ctx); tee_device_put(teedev); } +EXPORT_SYMBOL_GPL(teedev_close_context); static int tee_open(struct inode *inode, struct file *filp) { diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index feda1dc7f98e..38b701b7af4c 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -582,4 +582,18 @@ struct tee_client_driver { #define to_tee_client_driver(d) \ container_of(d, struct tee_client_driver, driver) +/** + * teedev_open() - Open a struct tee_device + * @teedev: Device to open + * + * @return a pointer to struct tee_context on success or an ERR_PTR on failure. + */ +struct tee_context *teedev_open(struct tee_device *teedev); + +/** + * teedev_close_context() - closes a struct tee_context + * @ctx: The struct tee_context to close + */ +void teedev_close_context(struct tee_context *ctx); + #endif /*__TEE_DRV_H*/ -- cgit v1.2.3 From 719d1c2524c89ada78c4c9202641c1d9e942a322 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 9 Feb 2022 12:33:23 +0530 Subject: bpf: Fix crash due to incorrect copy_map_value commit a8abb0c3dc1e28454851a00f8b7333d9695d566c upstream. When both bpf_spin_lock and bpf_timer are present in a BPF map value, copy_map_value needs to skirt both objects when copying a value into and out of the map. However, the current code does not set both s_off and t_off in copy_map_value, which leads to a crash when e.g. bpf_spin_lock is placed in map value with bpf_timer, as bpf_map_update_elem call will be able to overwrite the other timer object. When the issue is not fixed, an overwriting can produce the following splat: [root@(none) bpf]# ./test_progs -t timer_crash [ 15.930339] bpf_testmod: loading out-of-tree module taints kernel. [ 16.037849] ================================================================== [ 16.038458] BUG: KASAN: user-memory-access in __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.038944] Write of size 8 at addr 0000000000043ec0 by task test_progs/325 [ 16.039399] [ 16.039514] CPU: 0 PID: 325 Comm: test_progs Tainted: G OE 5.16.0+ #278 [ 16.039983] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ArchLinux 1.15.0-1 04/01/2014 [ 16.040485] Call Trace: [ 16.040645] [ 16.040805] dump_stack_lvl+0x59/0x73 [ 16.041069] ? __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.041427] kasan_report.cold+0x116/0x11b [ 16.041673] ? __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.042040] __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.042328] ? memcpy+0x39/0x60 [ 16.042552] ? pv_hash+0xd0/0xd0 [ 16.042785] ? lockdep_hardirqs_off+0x95/0xd0 [ 16.043079] __bpf_spin_lock_irqsave+0xdf/0xf0 [ 16.043366] ? bpf_get_current_comm+0x50/0x50 [ 16.043608] ? jhash+0x11a/0x270 [ 16.043848] bpf_timer_cancel+0x34/0xe0 [ 16.044119] bpf_prog_c4ea1c0f7449940d_sys_enter+0x7c/0x81 [ 16.044500] bpf_trampoline_6442477838_0+0x36/0x1000 [ 16.044836] __x64_sys_nanosleep+0x5/0x140 [ 16.045119] do_syscall_64+0x59/0x80 [ 16.045377] ? lock_is_held_type+0xe4/0x140 [ 16.045670] ? irqentry_exit_to_user_mode+0xa/0x40 [ 16.046001] ? mark_held_locks+0x24/0x90 [ 16.046287] ? asm_exc_page_fault+0x1e/0x30 [ 16.046569] ? asm_exc_page_fault+0x8/0x30 [ 16.046851] ? lockdep_hardirqs_on+0x7e/0x100 [ 16.047137] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 16.047405] RIP: 0033:0x7f9e4831718d [ 16.047602] Code: b4 0c 00 0f 05 eb a9 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b3 6c 0c 00 f7 d8 64 89 01 48 [ 16.048764] RSP: 002b:00007fff488086b8 EFLAGS: 00000206 ORIG_RAX: 0000000000000023 [ 16.049275] RAX: ffffffffffffffda RBX: 00007f9e48683740 RCX: 00007f9e4831718d [ 16.049747] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007fff488086d0 [ 16.050225] RBP: 00007fff488086f0 R08: 00007fff488085d7 R09: 00007f9e4cb594a0 [ 16.050648] R10: 0000000000000000 R11: 0000000000000206 R12: 00007f9e484cde30 [ 16.051124] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 16.051608] [ 16.051762] ================================================================== Fixes: 68134668c17f ("bpf: Add map side support for bpf timers.") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220209070324.1093182-2-memxor@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6c4640526f74..855fad8ac0a8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -221,7 +221,8 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) if (unlikely(map_value_has_spin_lock(map))) { s_off = map->spin_lock_off; s_sz = sizeof(struct bpf_spin_lock); - } else if (unlikely(map_value_has_timer(map))) { + } + if (unlikely(map_value_has_timer(map))) { t_off = map->timer_off; t_sz = sizeof(struct bpf_timer); } -- cgit v1.2.3 From 976406c5cc00a51a8576ca6f0c05b601ebc72066 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 11 Feb 2022 11:49:53 -0800 Subject: bpf: Fix a bpf_timer initialization issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5eaed6eedbe9612f642ad2b880f961d1c6c8ec2b upstream. The patch in [1] intends to fix a bpf_timer related issue, but the fix caused existing 'timer' selftest to fail with hang or some random errors. After some debug, I found an issue with check_and_init_map_value() in the hashtab.c. More specifically, in hashtab.c, we have code l_new = bpf_map_kmalloc_node(&htab->map, ...) check_and_init_map_value(&htab->map, l_new...) Note that bpf_map_kmalloc_node() does not do initialization so l_new contains random value. The function check_and_init_map_value() intends to zero the bpf_spin_lock and bpf_timer if they exist in the map. But I found bpf_spin_lock is zero'ed but bpf_timer is not zero'ed. With [1], later copy_map_value() skips copying of bpf_spin_lock and bpf_timer. The non-zero bpf_timer caused random failures for 'timer' selftest. Without [1], for both bpf_spin_lock and bpf_timer case, bpf_timer will be zero'ed, so 'timer' self test is okay. For check_and_init_map_value(), why bpf_spin_lock is zero'ed properly while bpf_timer not. In bpf uapi header, we have struct bpf_spin_lock { __u32 val; }; struct bpf_timer { __u64 :64; __u64 :64; } __attribute__((aligned(8))); The initialization code: *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = (struct bpf_spin_lock){}; *(struct bpf_timer *)(dst + map->timer_off) = (struct bpf_timer){}; It appears the compiler has no obligation to initialize anonymous fields. For example, let us use clang with bpf target as below: $ cat t.c struct bpf_timer { unsigned long long :64; }; struct bpf_timer2 { unsigned long long a; }; void test(struct bpf_timer *t) { *t = (struct bpf_timer){}; } void test2(struct bpf_timer2 *t) { *t = (struct bpf_timer2){}; } $ clang -target bpf -O2 -c -g t.c $ llvm-objdump -d t.o ... 0000000000000000 : 0: 95 00 00 00 00 00 00 00 exit 0000000000000008 : 1: b7 02 00 00 00 00 00 00 r2 = 0 2: 7b 21 00 00 00 00 00 00 *(u64 *)(r1 + 0) = r2 3: 95 00 00 00 00 00 00 00 exit gcc11.2 does not have the above issue. But from INTERNATIONAL STANDARD ©ISO/IEC ISO/IEC 9899:201x Programming languages — C http://www.open-std.org/Jtc1/sc22/wg14/www/docs/n1547.pdf page 157: Except where explicitly stated otherwise, for the purposes of this subclause unnamed members of objects of structure and union type do not participate in initialization. Unnamed members of structure objects have indeterminate value even after initialization. To fix the problem, let use memset for bpf_timer case in check_and_init_map_value(). For consistency, memset is also used for bpf_spin_lock case. [1] https://lore.kernel.org/bpf/20220209070324.1093182-2-memxor@gmail.com/ Fixes: 68134668c17f3 ("bpf: Add map side support for bpf timers.") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220211194953.3142152-1-yhs@fb.com Signed-off-by: Greg Kroah-Hartman --- include/linux/bpf.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 855fad8ac0a8..d9049f2a78ca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -206,11 +206,9 @@ static inline bool map_value_has_timer(const struct bpf_map *map) static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) - *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = - (struct bpf_spin_lock){}; + memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) - *(struct bpf_timer *)(dst + map->timer_off) = - (struct bpf_timer){}; + memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); } /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ -- cgit v1.2.3 From f941104aa116aaa7d80e3b72d776494136f1c466 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 23 Feb 2022 18:34:16 +0200 Subject: openvswitch: Fix setting ipv6 fields causing hw csum failure commit d9b5ae5c1b241b91480aa30408be12fe91af834a upstream. Ipv6 ttl, label and tos fields are modified without first pulling/pushing the ipv6 header, which would have updated the hw csum (if available). This might cause csum validation when sending the packet to the stack, as can be seen in the trace below. Fix this by updating skb->csum if available. Trace resulted by ipv6 ttl dec and then sending packet to conntrack [actions: set(ipv6(hlimit=63)),ct(zone=99)]: [295241.900063] s_pf0vf2: hw csum failure [295241.923191] Call Trace: [295241.925728] [295241.927836] dump_stack+0x5c/0x80 [295241.931240] __skb_checksum_complete+0xac/0xc0 [295241.935778] nf_conntrack_tcp_packet+0x398/0xba0 [nf_conntrack] [295241.953030] nf_conntrack_in+0x498/0x5e0 [nf_conntrack] [295241.958344] __ovs_ct_lookup+0xac/0x860 [openvswitch] [295241.968532] ovs_ct_execute+0x4a7/0x7c0 [openvswitch] [295241.979167] do_execute_actions+0x54a/0xaa0 [openvswitch] [295242.001482] ovs_execute_actions+0x48/0x100 [openvswitch] [295242.006966] ovs_dp_process_packet+0x96/0x1d0 [openvswitch] [295242.012626] ovs_vport_receive+0x6c/0xc0 [openvswitch] [295242.028763] netdev_frame_hook+0xc0/0x180 [openvswitch] [295242.034074] __netif_receive_skb_core+0x2ca/0xcb0 [295242.047498] netif_receive_skb_internal+0x3e/0xc0 [295242.052291] napi_gro_receive+0xba/0xe0 [295242.056231] mlx5e_handle_rx_cqe_mpwrq_rep+0x12b/0x250 [mlx5_core] [295242.062513] mlx5e_poll_rx_cq+0xa0f/0xa30 [mlx5_core] [295242.067669] mlx5e_napi_poll+0xe1/0x6b0 [mlx5_core] [295242.077958] net_rx_action+0x149/0x3b0 [295242.086762] __do_softirq+0xd7/0x2d6 [295242.090427] irq_exit+0xf7/0x100 [295242.093748] do_IRQ+0x7f/0xd0 [295242.096806] common_interrupt+0xf/0xf [295242.100559] [295242.102750] RIP: 0033:0x7f9022e88cbd [295242.125246] RSP: 002b:00007f9022282b20 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffda [295242.132900] RAX: 0000000000000005 RBX: 0000000000000010 RCX: 0000000000000000 [295242.140120] RDX: 00007f9022282ba8 RSI: 00007f9022282a30 RDI: 00007f9014005c30 [295242.147337] RBP: 00007f9014014d60 R08: 0000000000000020 R09: 00007f90254a8340 [295242.154557] R10: 00007f9022282a28 R11: 0000000000000246 R12: 0000000000000000 [295242.161775] R13: 00007f902308c000 R14: 000000000000002b R15: 00007f9022b71f40 Fixes: 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action") Signed-off-by: Paul Blakey Link: https://lore.kernel.org/r/20220223163416.24096-1-paulb@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/net/checksum.h | 5 +++++ net/openvswitch/actions.c | 46 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 5b96d5bd6e54..3fda961f5e0f 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -141,6 +141,11 @@ static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } +static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) +{ + *csum = csum_add(csum_sub(*csum, old), new); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 076774034bb9..780d9e2246f3 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -423,12 +423,43 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, memcpy(addr, new_addr, sizeof(__be32[4])); } -static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) +static void set_ipv6_dsfield(struct sk_buff *skb, struct ipv6hdr *nh, u8 ipv6_tclass, u8 mask) { + u8 old_ipv6_tclass = ipv6_get_dsfield(nh); + + ipv6_tclass = OVS_MASKED(old_ipv6_tclass, ipv6_tclass, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(old_ipv6_tclass << 12), + (__force __wsum)(ipv6_tclass << 12)); + + ipv6_change_dsfield(nh, ~mask, ipv6_tclass); +} + +static void set_ipv6_fl(struct sk_buff *skb, struct ipv6hdr *nh, u32 fl, u32 mask) +{ + u32 ofl; + + ofl = nh->flow_lbl[0] << 16 | nh->flow_lbl[1] << 8 | nh->flow_lbl[2]; + fl = OVS_MASKED(ofl, fl, mask); + /* Bits 21-24 are always unmasked, so this retains their values. */ - OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); + nh->flow_lbl[0] = (u8)(fl >> 16); + nh->flow_lbl[1] = (u8)(fl >> 8); + nh->flow_lbl[2] = (u8)fl; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)htonl(ofl), (__force __wsum)htonl(fl)); +} + +static void set_ipv6_ttl(struct sk_buff *skb, struct ipv6hdr *nh, u8 new_ttl, u8 mask) +{ + new_ttl = OVS_MASKED(nh->hop_limit, new_ttl, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(nh->hop_limit << 8), + (__force __wsum)(new_ttl << 8)); + nh->hop_limit = new_ttl; } static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, @@ -546,18 +577,17 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, } } if (mask->ipv6_tclass) { - ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass); + set_ipv6_dsfield(skb, nh, key->ipv6_tclass, mask->ipv6_tclass); flow_key->ip.tos = ipv6_get_dsfield(nh); } if (mask->ipv6_label) { - set_ipv6_fl(nh, ntohl(key->ipv6_label), + set_ipv6_fl(skb, nh, ntohl(key->ipv6_label), ntohl(mask->ipv6_label)); flow_key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); } if (mask->ipv6_hlimit) { - OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, - mask->ipv6_hlimit); + set_ipv6_ttl(skb, nh, key->ipv6_hlimit, mask->ipv6_hlimit); flow_key->ip.ttl = nh->hop_limit; } return 0; -- cgit v1.2.3 From 2e15fa8091de35e2571c44867010531eb683e51f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Feb 2022 14:35:49 +0100 Subject: net: Force inlining of checksum functions in net/checksum.h commit 5486f5bf790b5c664913076c3194b8f916a5c7ad upstream. All functions defined as static inline in net/checksum.h are meant to be inlined for performance reason. But since commit ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") the compiler is allowed to uninline functions when it wants. Fair enough in the general case, but for tiny performance critical checksum helpers that's counter-productive. The problem mainly arises when selecting CONFIG_CC_OPTIMISE_FOR_SIZE, Those helpers being 'static inline' in header files you suddenly find them duplicated many times in the resulting vmlinux. Here is a typical exemple when building powerpc pmac32_defconfig with CONFIG_CC_OPTIMISE_FOR_SIZE. csum_sub() appears 4 times: c04a23cc : c04a23cc: 7c 84 20 f8 not r4,r4 c04a23d0: 7c 63 20 14 addc r3,r3,r4 c04a23d4: 7c 63 01 94 addze r3,r3 c04a23d8: 4e 80 00 20 blr ... c04a2ce8: 4b ff f6 e5 bl c04a23cc ... c04a2d2c: 4b ff f6 a1 bl c04a23cc ... c04a2d54: 4b ff f6 79 bl c04a23cc ... c04a754c : c04a754c: 7c 84 20 f8 not r4,r4 c04a7550: 7c 63 20 14 addc r3,r3,r4 c04a7554: 7c 63 01 94 addze r3,r3 c04a7558: 4e 80 00 20 blr ... c04ac930: 4b ff ac 1d bl c04a754c ... c04ad264: 4b ff a2 e9 bl c04a754c ... c04e3b08 : c04e3b08: 7c 84 20 f8 not r4,r4 c04e3b0c: 7c 63 20 14 addc r3,r3,r4 c04e3b10: 7c 63 01 94 addze r3,r3 c04e3b14: 4e 80 00 20 blr ... c04e5788: 4b ff e3 81 bl c04e3b08 ... c04e65c8: 4b ff d5 41 bl c04e3b08 ... c0512d34 : c0512d34: 7c 84 20 f8 not r4,r4 c0512d38: 7c 63 20 14 addc r3,r3,r4 c0512d3c: 7c 63 01 94 addze r3,r3 c0512d40: 4e 80 00 20 blr ... c0512dfc: 4b ff ff 39 bl c0512d34 ... c05138bc: 4b ff f4 79 bl c0512d34 ... Restore the expected behaviour by using __always_inline for all functions defined in net/checksum.h vmlinux size is even reduced by 256 bytes with this patch: text data bss dec hex filename 6980022 2515362 194384 9689768 93daa8 vmlinux.before 6979862 2515266 194384 9689512 93d9a8 vmlinux.now Fixes: ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") Cc: Masahiro Yamada Cc: Nick Desaulniers Cc: Andrew Morton Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/checksum.h | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 3fda961f5e0f..d3b5d368a0ca 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -22,7 +22,7 @@ #include #ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER -static inline +static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { @@ -33,7 +33,7 @@ __wsum csum_and_copy_from_user (const void __user *src, void *dst, #endif #ifndef HAVE_CSUM_COPY_USER -static __inline__ __wsum csum_and_copy_to_user +static __always_inline __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len) { __wsum sum = csum_partial(src, len, ~0U); @@ -45,7 +45,7 @@ static __inline__ __wsum csum_and_copy_to_user #endif #ifndef _HAVE_ARCH_CSUM_AND_COPY -static inline __wsum +static __always_inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { memcpy(dst, src, len); @@ -54,7 +54,7 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len) #endif #ifndef HAVE_ARCH_CSUM_ADD -static inline __wsum csum_add(__wsum csum, __wsum addend) +static __always_inline __wsum csum_add(__wsum csum, __wsum addend) { u32 res = (__force u32)csum; res += (__force u32)addend; @@ -62,12 +62,12 @@ static inline __wsum csum_add(__wsum csum, __wsum addend) } #endif -static inline __wsum csum_sub(__wsum csum, __wsum addend) +static __always_inline __wsum csum_sub(__wsum csum, __wsum addend) { return csum_add(csum, ~addend); } -static inline __sum16 csum16_add(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend) { u16 res = (__force u16)csum; @@ -75,12 +75,12 @@ static inline __sum16 csum16_add(__sum16 csum, __be16 addend) return (__force __sum16)(res + (res < (__force u16)addend)); } -static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } -static inline __wsum csum_shift(__wsum sum, int offset) +static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ if (offset & 1) @@ -88,42 +88,43 @@ static inline __wsum csum_shift(__wsum sum, int offset) return sum; } -static inline __wsum +static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { return csum_add(csum, csum_shift(csum2, offset)); } -static inline __wsum +static __always_inline __wsum csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) { return csum_block_add(csum, csum2, offset); } -static inline __wsum +static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); } -static inline __wsum csum_unfold(__sum16 n) +static __always_inline __wsum csum_unfold(__sum16 n) { return (__force __wsum)n; } -static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) +static __always_inline +__wsum csum_partial_ext(const void *buff, int len, __wsum sum) { return csum_partial(buff, len, sum); } #define CSUM_MANGLED_0 ((__force __sum16)0xffff) -static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); } -static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) +static __always_inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); @@ -136,7 +137,7 @@ static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) * m : old value of a 16bit field * m' : new value of a 16bit field */ -static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) +static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } @@ -155,16 +156,16 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr); -static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, - __be16 from, __be16 to, - bool pseudohdr) +static __always_inline +void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, + __be16 from, __be16 to, bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); } -static inline __wsum remcsum_adjust(void *ptr, __wsum csum, - int start, int offset) +static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum, + int start, int offset) { __sum16 *psum = (__sum16 *)(ptr + offset); __wsum delta; @@ -180,7 +181,7 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum, return delta; } -static inline void remcsum_unadjust(__sum16 *psum, __wsum delta) +static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta) { *psum = csum_fold(csum_sub(delta, (__force __wsum)*psum)); } -- cgit v1.2.3 From ff999198ec2149a6b83edb083293c91ec027f56f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Feb 2022 09:05:02 -0800 Subject: net-timestamp: convert sk->sk_tskey to atomic_t [ Upstream commit a1cdec57e03a1352e92fbbe7974039dda4efcec0 ] UDP sendmsg() can be lockless, this is causing all kinds of data races. This patch converts sk->sk_tskey to remove one of these races. BUG: KCSAN: data-race in __ip_append_data / __ip_append_data read to 0xffff8881035d4b6c of 4 bytes by task 8877 on cpu 1: __ip_append_data+0x1c1/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881035d4b6c of 4 bytes by task 8880 on cpu 0: __ip_append_data+0x1d8/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000054d -> 0x0000054e Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 8880 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00167-gdcb85f85fa6f-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 09c2d251b707 ("net-timestamp: add key to disambiguate concurrent datagrams") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/sock.h | 4 ++-- net/can/j1939/transport.c | 2 +- net/core/skbuff.c | 2 +- net/core/sock.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index dfb92f91d5be..7d49196a3880 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -506,7 +506,7 @@ struct sock { u16 sk_tsflags; int sk_bind_phc; u8 sk_shutdown; - u32 sk_tskey; + atomic_t sk_tskey; atomic_t sk_zckey; u8 sk_clockid; @@ -2598,7 +2598,7 @@ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = sk->sk_tskey++; + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index a271688780a2..307ee1174a6e 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2006,7 +2006,7 @@ struct j1939_session *j1939_tp_send(struct j1939_priv *priv, /* set the end-packet for broadcast */ session->pkt.last = session->pkt.total; - skcb->tskey = session->sk->sk_tskey++; + skcb->tskey = atomic_inc_return(&session->sk->sk_tskey) - 1; session->tskey = skcb->tskey; return session; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb4dce8fd040..449a96e358ad 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4844,7 +4844,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) - serr->ee.ee_data -= sk->sk_tskey; + serr->ee.ee_data -= atomic_read(&sk->sk_tskey); } err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 6ea317f84edd..deaed1b20682 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -866,9 +866,9 @@ int sock_set_timestamping(struct sock *sk, int optname, if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - sk->sk_tskey = tcp_sk(sk)->snd_una; + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { - sk->sk_tskey = 0; + atomic_set(&sk->sk_tskey, 0); } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a4d2eb691cbc..131066d0319a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -992,7 +992,7 @@ static int __ip_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ff4e83e2a506..22bf8fb61716 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1465,7 +1465,7 @@ static int __ip6_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); -- cgit v1.2.3 From fcd3f5906d64875371d923e1a82b12387fc6ab69 Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Sun, 20 Feb 2022 15:14:31 +0000 Subject: nvmem: core: Fix a conflict between MTD and NVMEM on wp-gpios property commit f6c052afe6f802d87c74153b7a57c43b2e9faf07 upstream. Wp-gpios property can be used on NVMEM nodes and the same property can be also used on MTD NAND nodes. In case of the wp-gpios property is defined at NAND level node, the GPIO management is done at NAND driver level. Write protect is disabled when the driver is probed or resumed and is enabled when the driver is released or suspended. When no partitions are defined in the NAND DT node, then the NAND DT node will be passed to NVMEM framework. If wp-gpios property is defined in this node, the GPIO resource is taken twice and the NAND controller driver fails to probe. It would be possible to set config->wp_gpio at MTD level before calling nvmem_register function but NVMEM framework will toggle this GPIO on each write when this GPIO should only be controlled at NAND level driver to ensure that the Write Protect has not been enabled. A way to fix this conflict is to add a new boolean flag in nvmem_config named ignore_wp. In case ignore_wp is set, the GPIO resource will be managed by the provider. Fixes: 2a127da461a9 ("nvmem: add support for the write-protect pin") Cc: stable@vger.kernel.org Signed-off-by: Christophe Kerello Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220220151432.16605-2-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 2 +- include/linux/nvmem-provider.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 9aecb83021a2..fb7840c73765 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -768,7 +768,7 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) if (config->wp_gpio) nvmem->wp_gpio = config->wp_gpio; - else + else if (!config->ignore_wp) nvmem->wp_gpio = gpiod_get_optional(config->dev, "wp", GPIOD_OUT_HIGH); if (IS_ERR(nvmem->wp_gpio)) { diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 104505e9028f..87932bdb25d7 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -66,7 +66,8 @@ struct nvmem_keepout { * @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride. * @priv: User context passed to read/write callbacks. - * @wp-gpio: Write protect pin + * @wp-gpio: Write protect pin + * @ignore_wp: Write Protect pin is managed by the provider. * * Note: A default "nvmem" name will be assigned to the device if * no name is specified in its configuration. In such case "" is @@ -88,6 +89,7 @@ struct nvmem_config { enum nvmem_type type; bool read_only; bool root_only; + bool ignore_wp; struct device_node *of_node; bool no_of_node; nvmem_reg_read_t reg_read; -- cgit v1.2.3 From 97abcfedc87cb501071e8947184ee0c5acca6874 Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Mon, 17 Jan 2022 17:36:54 +0800 Subject: ext4: fast commit may not fallback for ineligible commit [ Upstream commit e85c81ba8859a4c839bcd69c5d83b32954133a5b ] For the follow scenario: 1. jbd start commit transaction n 2. task A get new handle for transaction n+1 3. task A do some ineligible actions and mark FC_INELIGIBLE 4. jbd complete transaction n and clean FC_INELIGIBLE 5. task A call fsync In this case fast commit will not fallback to full commit and transaction n+1 also not handled by jbd. Make ext4_fc_mark_ineligible() also record transaction tid for latest ineligible case, when call ext4_fc_cleanup() check current transaction tid, if small than latest ineligible tid do not clear the EXT4_MF_FC_INELIGIBLE. Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Ritesh Harjani Suggested-by: Harshad Shirwadkar Signed-off-by: Xin Yin Link: https://lore.kernel.org/r/20220117093655.35160-2-yinxin.x@bytedance.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Sasha Levin --- fs/ext4/ext4.h | 3 ++- fs/ext4/extents.c | 4 ++-- fs/ext4/fast_commit.c | 33 +++++++++++++++++++++++++-------- fs/ext4/inode.c | 4 ++-- fs/ext4/ioctl.c | 4 ++-- fs/ext4/namei.c | 4 ++-- fs/ext4/super.c | 1 + fs/ext4/xattr.c | 6 +++--- fs/jbd2/commit.c | 2 +- fs/jbd2/journal.c | 2 +- include/linux/jbd2.h | 2 +- 11 files changed, 42 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7448e24cd545..2b67c7d1677b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1746,6 +1746,7 @@ struct ext4_sb_info { spinlock_t s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; + tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif @@ -2923,7 +2924,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); -void ext4_fc_mark_ineligible(struct super_block *sb, int reason); +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_start_update(struct inode *inode); void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c19813402ce9..b81c008e6675 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5340,7 +5340,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); @@ -5480,7 +5480,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) ret = PTR_ERR(handle); goto out_mmap; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f8915dff82ee..96fac6ac18f4 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -302,18 +302,32 @@ restart: } /* - * Mark file system as fast commit ineligible. This means that next commit - * operation would result in a full jbd2 commit. + * Mark file system as fast commit ineligible, and record latest + * ineligible transaction tid. This means until the recorded + * transaction, commit operation would result in a full jbd2 commit. */ -void ext4_fc_mark_ineligible(struct super_block *sb, int reason) +void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); + tid_t tid; if (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (handle && !IS_ERR(handle)) + tid = handle->h_transaction->t_tid; + else { + read_lock(&sbi->s_journal->j_state_lock); + tid = sbi->s_journal->j_running_transaction ? + sbi->s_journal->j_running_transaction->t_tid : 0; + read_unlock(&sbi->s_journal->j_state_lock); + } + spin_lock(&sbi->s_fc_lock); + if (sbi->s_fc_ineligible_tid < tid) + sbi->s_fc_ineligible_tid = tid; + spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -389,7 +403,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) mutex_unlock(&ei->i_fc_lock); node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -402,7 +416,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_NOMEM); + EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -504,7 +518,7 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_INODE_JOURNAL_DATA); + EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } @@ -1180,7 +1194,7 @@ fallback: * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ -static void ext4_fc_cleanup(journal_t *journal, int full) +static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1228,7 +1242,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full) &sbi->s_fc_q[FC_Q_MAIN]); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); - ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + if (tid >= sbi->s_fc_ineligible_tid) { + sbi->s_fc_ineligible_tid = 0; + ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); + } if (full) sbi->s_fc_bytes = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b6746cc86cee..22a5140546fb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -337,7 +337,7 @@ stop_handle: return; no_delete: if (!list_empty(&EXT4_I(inode)->i_fc_list)) - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } @@ -5969,7 +5969,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, - EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); + EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index fd70bebb1437..f61b59045c6d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -169,7 +169,7 @@ static long swap_inode_boot_loader(struct super_block *sb, err = -EINVAL; goto err_out; } - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); @@ -1075,7 +1075,7 @@ mext_out: err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { - ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index da7698341d7d..192ea90e757c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3889,7 +3889,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * dirents in directories. */ ext4_fc_mark_ineligible(old.inode->i_sb, - EXT4_FC_REASON_RENAME_DIR); + EXT4_FC_REASON_RENAME_DIR, handle); } else { if (new.inode) ext4_fc_track_unlink(handle, new.dentry); @@ -4049,7 +4049,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(retval)) goto end_rename; ext4_fc_mark_ineligible(new.inode->i_sb, - EXT4_FC_REASON_CROSS_RENAME); + EXT4_FC_REASON_CROSS_RENAME, handle); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); if (retval) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 538a33c3b117..eb5d55cf277c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4620,6 +4620,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); + sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1e0fc1ed845b..042325349098 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2408,7 +2408,7 @@ retry_inode: if (IS_SYNC(inode)) ext4_handle_sync(handle); } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); @@ -2486,7 +2486,7 @@ retry: if (error == 0) error = error2; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL); return error; } @@ -2920,7 +2920,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, error); goto cleanup; } - ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR); + ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3cc4ab2ba7f4..d188fa913a07 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1170,7 +1170,7 @@ restart_loop: if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 1); + journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index bd9ac9891604..1f8493ef181d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -769,7 +769,7 @@ EXPORT_SYMBOL(jbd2_fc_begin_commit); static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { if (journal->j_fc_cleanup_callback) - journal->j_fc_cleanup_callback(journal, 0); + journal->j_fc_cleanup_callback(journal, 0, tid); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fd933c45281a..d63b8106796e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1295,7 +1295,7 @@ struct journal_s * Clean-up after fast commit or full commit. JBD2 calls this function * after every commit operation. */ - void (*j_fc_cleanup_callback)(struct journal_s *journal, int); + void (*j_fc_cleanup_callback)(struct journal_s *journal, int full, tid_t tid); /** * @j_fc_replay_callback: -- cgit v1.2.3 From 771aca9bc70709771f66c3e7c00ce87339aa1790 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Mar 2022 09:37:28 -0800 Subject: ipv6: fix skb drops in igmp6_event_query() and igmp6_event_report() [ Upstream commit 2d3916f3189172d5c69d33065c3c21119fe539fc ] While investigating on why a synchronize_net() has been added recently in ipv6_mc_down(), I found that igmp6_event_query() and igmp6_event_report() might drop skbs in some cases. Discussion about removing synchronize_net() from ipv6_mc_down() will happen in a different thread. Fixes: f185de28d9ae ("mld: add new workqueues for process mld events") Signed-off-by: Eric Dumazet Cc: Taehee Yoo Cc: Cong Wang Cc: David Ahern Link: https://lore.kernel.org/r/20220303173728.937869-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/net/ndisc.h | 4 ++-- net/ipv6/mcast.c | 32 ++++++++++++-------------------- 2 files changed, 14 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 38e4094960ce..e97ef508664f 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -487,9 +487,9 @@ int igmp6_late_init(void); void igmp6_cleanup(void); void igmp6_late_cleanup(void); -int igmp6_event_query(struct sk_buff *skb); +void igmp6_event_query(struct sk_buff *skb); -int igmp6_event_report(struct sk_buff *skb); +void igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index a8861db52c18..909f937befd7 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1371,27 +1371,23 @@ static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, } /* called with rcu_read_lock() */ -int igmp6_event_query(struct sk_buff *skb) +void igmp6_event_query(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_query_lock); if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_query_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_query_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_query_work(struct sk_buff *skb) @@ -1542,27 +1538,23 @@ static void mld_query_work(struct work_struct *work) } /* called with rcu_read_lock() */ -int igmp6_event_report(struct sk_buff *skb) +void igmp6_event_report(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); - if (!idev) - return -EINVAL; - - if (idev->dead) { - kfree_skb(skb); - return -ENODEV; - } + if (!idev || idev->dead) + goto out; spin_lock_bh(&idev->mc_report_lock); if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) { __skb_queue_tail(&idev->mc_report_queue, skb); if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0)) in6_dev_hold(idev); + skb = NULL; } spin_unlock_bh(&idev->mc_report_lock); - - return 0; +out: + kfree_skb(skb); } static void __mld_report_work(struct sk_buff *skb) -- cgit v1.2.3 From 2de88544b3dbe9f4be3220750e34c97dd86386c1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 30 Sep 2021 17:06:21 -0400 Subject: NFSD: Have legacy NFSD WRITE decoders use xdr_stream_subsegment() [ Upstream commit dae9a6cab8009e526570e7477ce858dcdfeb256e ] Refactor. Now that the NFSv2 and NFSv3 XDR decoders have been converted to use xdr_streams, the WRITE decoder functions can use xdr_stream_subsegment() to extract the WRITE payload into its own xdr_buf, just as the NFSv4 WRITE XDR decoder currently does. That makes it possible to pass the first kvec, pages array + length, page_base, and total payload length via a single function parameter. The payload's page_base is not yet assigned or used, but will be in subsequent patches. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Sasha Levin --- fs/nfsd/nfs3proc.c | 3 +-- fs/nfsd/nfs3xdr.c | 12 ++---------- fs/nfsd/nfs4proc.c | 3 +-- fs/nfsd/nfsproc.c | 3 +-- fs/nfsd/nfsxdr.c | 9 +-------- fs/nfsd/xdr.h | 2 +- fs/nfsd/xdr3.h | 2 +- include/linux/sunrpc/svc.h | 3 +-- net/sunrpc/svc.c | 11 ++++++----- 9 files changed, 15 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9918d6ad23ec..354bbfcd96aa 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -210,8 +210,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, - &argp->first, cnt); + nvecs = svc_fill_write_vector(rqstp, &argp->payload); if (!nvecs) { resp->status = nfserr_io; goto out; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 7a900131d20c..0ee156b9c9d7 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -621,9 +621,6 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd3_writeargs *args = rqstp->rq_argp; u32 max_blocksize = svc_max_payload(rqstp); - struct kvec *head = rqstp->rq_arg.head; - struct kvec *tail = rqstp->rq_arg.tail; - size_t remaining; if (!svcxdr_decode_nfs_fh3(xdr, &args->fh)) return 0; @@ -641,17 +638,12 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) /* request sanity */ if (args->count != args->len) return 0; - remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; - remaining -= xdr_stream_pos(xdr); - if (remaining < xdr_align_size(args->len)) - return 0; if (args->count > max_blocksize) { args->count = max_blocksize; args->len = max_blocksize; } - - args->first.iov_base = xdr->p; - args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); + if (!xdr_stream_subsegment(xdr, &args->payload, args->count)) + return 0; return 1; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4b9a3b90a41f..65200910107f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1038,8 +1038,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, write->wr_how_written = write->wr_stable_how; - nvecs = svc_fill_write_vector(rqstp, write->wr_payload.pages, - write->wr_payload.head, write->wr_buflen); + nvecs = svc_fill_write_vector(rqstp, &write->wr_payload); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 19c568b8a527..de282f3273c5 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -234,8 +234,7 @@ nfsd_proc_write(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages, - &argp->first, cnt); + nvecs = svc_fill_write_vector(rqstp, &argp->payload); if (!nvecs) { resp->status = nfserr_io; goto out; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index a06c05fe3b42..26a42f87c240 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -325,10 +325,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) { struct xdr_stream *xdr = &rqstp->rq_arg_stream; struct nfsd_writeargs *args = rqstp->rq_argp; - struct kvec *head = rqstp->rq_arg.head; - struct kvec *tail = rqstp->rq_arg.tail; u32 beginoffset, totalcount; - size_t remaining; if (!svcxdr_decode_fhandle(xdr, &args->fh)) return 0; @@ -346,12 +343,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p) return 0; if (args->len > NFSSVC_MAXBLKSIZE_V2) return 0; - remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len; - remaining -= xdr_stream_pos(xdr); - if (remaining < xdr_align_size(args->len)) + if (!xdr_stream_subsegment(xdr, &args->payload, args->len)) return 0; - args->first.iov_base = xdr->p; - args->first.iov_len = head->iov_len - xdr_stream_pos(xdr); return 1; } diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h index f45b4bc93f52..80fd6d7f3404 100644 --- a/fs/nfsd/xdr.h +++ b/fs/nfsd/xdr.h @@ -33,7 +33,7 @@ struct nfsd_writeargs { svc_fh fh; __u32 offset; int len; - struct kvec first; + struct xdr_buf payload; }; struct nfsd_createargs { diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 933008382bbe..712c117300cb 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -40,7 +40,7 @@ struct nfsd3_writeargs { __u32 count; int stable; __u32 len; - struct kvec first; + struct xdr_buf payload; }; struct nfsd3_createargs { diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 064c96157d1f..6263410c948a 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -532,8 +532,7 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - struct page **pages, - struct kvec *first, size_t total); + struct xdr_buf *payload); char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a3bbe5ce4570..08ca797bb8a4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1676,16 +1676,17 @@ EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** * svc_fill_write_vector - Construct data argument for VFS write call * @rqstp: svc_rqst to operate on - * @pages: list of pages containing data payload - * @first: buffer containing first section of write payload - * @total: total number of bytes of write payload + * @payload: xdr_buf containing only the write data payload * * Fills in rqstp::rq_vec, and returns the number of elements. */ -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages, - struct kvec *first, size_t total) +unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, + struct xdr_buf *payload) { + struct page **pages = payload->pages; + struct kvec *first = payload->head; struct kvec *vec = rqstp->rq_vec; + size_t total = payload->len; unsigned int i; /* Some types of transport can present the write payload -- cgit v1.2.3 From bdaa8c7b718998f43c3473181a25a7243770a745 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 9 Jan 2022 13:26:51 -0500 Subject: SUNRPC: Fix sockaddr handling in the svc_xprt_create_error trace point [ Upstream commit dc6c6fb3d639756a532bcc47d4a9bf9f3965881b ] While testing, I got an unexpected KASAN splat: Jan 08 13:50:27 oracle-102.nfsv4.dev kernel: BUG: KASAN: stack-out-of-bounds in trace_event_raw_event_svc_xprt_create_err+0x190/0x210 [sunrpc] Jan 08 13:50:27 oracle-102.nfsv4.dev kernel: Read of size 28 at addr ffffc9000008f728 by task mount.nfs/4628 The memcpy() in the TP_fast_assign section of this trace point copies the size of the destination buffer in order that the buffer won't be overrun. In other similar trace points, the source buffer for this memcpy is a "struct sockaddr_storage" so the actual length of the source buffer is always long enough to prevent the memcpy from reading uninitialized or unallocated memory. However, for this trace point, the source buffer can be as small as a "struct sockaddr_in". For AF_INET sockaddrs, the memcpy() reads memory that follows the source buffer, which is not always valid memory. To avoid copying past the end of the passed-in sockaddr, make the source address's length available to the memcpy(). It would be a little nicer if the tracing infrastructure was more friendly about storing socket addresses that are not AF_INET, but I could not find a way to make printk("%pIS") work with a dynamic array. Reported-by: KASAN Fixes: 4b8f380e46e4 ("SUNRPC: Tracepoint to record errors in svc_xpo_create()") Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- include/trace/events/sunrpc.h | 5 +++-- net/sunrpc/svc_xprt.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index daaf407e9e49..1213c078dcca 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1727,10 +1727,11 @@ TRACE_EVENT(svc_xprt_create_err, const char *program, const char *protocol, struct sockaddr *sap, + size_t salen, const struct svc_xprt *xprt ), - TP_ARGS(program, protocol, sap, xprt), + TP_ARGS(program, protocol, sap, salen, xprt), TP_STRUCT__entry( __field(long, error) @@ -1743,7 +1744,7 @@ TRACE_EVENT(svc_xprt_create_err, __entry->error = PTR_ERR(xprt); __assign_str(program, program); __assign_str(protocol, protocol); - memcpy(__entry->addr, sap, sizeof(__entry->addr)); + memcpy(__entry->addr, sap, min(salen, sizeof(__entry->addr))); ), TP_printk("addr=%pISpc program=%s protocol=%s error=%ld", diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 6316bd2b8f37..d4b663401be1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -243,7 +243,7 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); if (IS_ERR(xprt)) trace_svc_xprt_create_err(serv->sv_program->pg_name, - xcl->xcl_name, sap, xprt); + xcl->xcl_name, sap, len, xprt); return xprt; } -- cgit v1.2.3 From e209742c13d2c72abaea0ce49d88947209937709 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 8 Jan 2022 16:59:54 -0500 Subject: SUNRPC: Fix sockaddr handling in svcsock_accept_class trace points [ Upstream commit 16720861675393a35974532b3c837d9fd7bfe08c ] Avoid potentially hazardous memory copying and the needless use of "%pIS" -- in the kernel, an RPC service listener is always bound to ANYADDR. Having the network namespace is helpful when recording errors, though. Fixes: a0469f46faab ("SUNRPC: Replace dprintk call sites in TCP state change callouts") Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- include/trace/events/sunrpc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 1213c078dcca..7c48613c1830 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -2112,17 +2112,17 @@ DECLARE_EVENT_CLASS(svcsock_accept_class, TP_STRUCT__entry( __field(long, status) __string(service, service) - __array(unsigned char, addr, sizeof(struct sockaddr_in6)) + __field(unsigned int, netns_ino) ), TP_fast_assign( __entry->status = status; __assign_str(service, service); - memcpy(__entry->addr, &xprt->xpt_local, sizeof(__entry->addr)); + __entry->netns_ino = xprt->xpt_net->ns.inum; ), - TP_printk("listener=%pISpc service=%s status=%ld", - __entry->addr, __get_str(service), __entry->status + TP_printk("addr=listener service=%s status=%ld", + __get_str(service), __entry->status ) ); -- cgit v1.2.3 From f1675103e0f361f4f636438c1c911dbe83f5f334 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 14 Jan 2022 14:04:11 -0800 Subject: mm: defer kmemleak object creation of module_alloc() [ Upstream commit 60115fa54ad7b913b7cb5844e6b7ffeb842d55f2 ] Yongqiang reports a kmemleak panic when module insmod/rmmod with KASAN enabled(without KASAN_VMALLOC) on x86[1]. When the module area allocates memory, it's kmemleak_object is created successfully, but the KASAN shadow memory of module allocation is not ready, so when kmemleak scan the module's pointer, it will panic due to no shadow memory with KASAN check. module_alloc __vmalloc_node_range kmemleak_vmalloc kmemleak_scan update_checksum kasan_module_alloc kmemleak_ignore Note, there is no problem if KASAN_VMALLOC enabled, the modules area entire shadow memory is preallocated. Thus, the bug only exits on ARCH which supports dynamic allocation of module area per module load, for now, only x86/arm64/s390 are involved. Add a VM_DEFER_KMEMLEAK flags, defer vmalloc'ed object register of kmemleak in module_alloc() to fix this issue. [1] https://lore.kernel.org/all/6d41e2b9-4692-5ec4-b1cd-cbe29ae89739@huawei.com/ [wangkefeng.wang@huawei.com: fix build] Link: https://lkml.kernel.org/r/20211125080307.27225-1-wangkefeng.wang@huawei.com [akpm@linux-foundation.org: simplify ifdefs, per Andrey] Link: https://lkml.kernel.org/r/CA+fCnZcnwJHUQq34VuRxpdoY6_XbJCDJ-jopksS5Eia4PijPzw@mail.gmail.com Link: https://lkml.kernel.org/r/20211124142034.192078-1-wangkefeng.wang@huawei.com Fixes: 793213a82de4 ("s390/kasan: dynamic shadow mem allocation for modules") Fixes: 39d114ddc682 ("arm64: add KASAN support") Fixes: bebf56a1b176 ("kasan: enable instrumentation of global variables") Signed-off-by: Kefeng Wang Reported-by: Yongqiang Liu Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Will Deacon Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Alexander Gordeev Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin --- arch/arm64/kernel/module.c | 4 ++-- arch/s390/kernel/module.c | 5 +++-- arch/x86/kernel/module.c | 7 ++++--- include/linux/kasan.h | 4 ++-- include/linux/vmalloc.h | 7 +++++++ mm/kasan/shadow.c | 9 +++++++-- mm/vmalloc.c | 3 ++- 7 files changed, 27 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index b5ec010c481f..309a27553c87 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -36,7 +36,7 @@ void *module_alloc(unsigned long size) module_alloc_end = MODULES_END; p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_end, gfp_mask, PAGE_KERNEL, 0, + module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && @@ -58,7 +58,7 @@ void *module_alloc(unsigned long size) PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index a805ea5cb92d..b032e556eeb7 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -37,14 +37,15 @@ void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 5e9a34b5bd74..867a341a0c7e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) @@ -74,10 +75,10 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, + MODULES_END, gfp_mask, + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index dd874a1ee862..f407e937241a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -461,12 +461,12 @@ static inline void kasan_release_vmalloc(unsigned long start, * allocations with real shadow memory. With KASAN vmalloc, the special * case is unnecessary, as the work is handled in the generic case. */ -int kasan_module_alloc(void *addr, size_t size); +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 671d402c3778..4fe9e885bbfa 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -28,6 +28,13 @@ struct notifier_block; /* in notifier.h */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) +#define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ +#else +#define VM_DEFER_KMEMLEAK 0 +#endif + /* * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. * diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 8d95ee52d019..dd79840e6096 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -493,7 +493,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size) +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -515,9 +515,14 @@ int kasan_module_alloc(void *addr, size_t size) __builtin_return_address(0)); if (ret) { + struct vm_struct *vm = find_vm_area(addr); __memset(ret, KASAN_SHADOW_INIT, shadow_size); - find_vm_area(addr)->flags |= VM_KASAN; + vm->flags |= VM_KASAN; kmemleak_ignore(ret); + + if (vm->flags & VM_DEFER_KMEMLEAK) + kmemleak_vmalloc(vm, size, gfp_mask); + return 0; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e8a807c78110..8375eecc55de 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3032,7 +3032,8 @@ again: clear_vm_uninitialized_flag(area); size = PAGE_ALIGN(size); - kmemleak_vmalloc(area, size, gfp_mask); + if (!(vm_flags & VM_DEFER_KMEMLEAK)) + kmemleak_vmalloc(area, size, gfp_mask); return addr; -- cgit v1.2.3 From 462c5e6cb2414fcde7d0a76f5e0661744caaa0aa Mon Sep 17 00:00:00 2001 From: Moshe Tal Date: Thu, 20 Jan 2022 11:55:50 +0200 Subject: ethtool: Fix link extended state for big endian [ Upstream commit e2f08207c558bc0bc8abaa557cdb29bad776ac7b ] The link extended sub-states are assigned as enum that is an integer size but read from a union as u8, this is working for small values on little endian systems but for big endian this always give 0. Fix the variable in the union to match the enum size. Fixes: ecc31c60240b ("ethtool: Add link extended state") Signed-off-by: Moshe Tal Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Reviewed-by: Gal Pressman Reviewed-by: Amit Cohen Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/ethtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 849524b55d89..3fad741df53e 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -94,7 +94,7 @@ struct ethtool_link_ext_state_info { enum ethtool_link_ext_substate_link_logical_mismatch link_logical_mismatch; enum ethtool_link_ext_substate_bad_signal_integrity bad_signal_integrity; enum ethtool_link_ext_substate_cable_issue cable_issue; - u8 __link_ext_substate; + u32 __link_ext_substate; }; }; -- cgit v1.2.3 From aa5040691cb761a04c206a893e5771a904d22ab7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2021 14:41:33 -0700 Subject: bpf: Use u64_stats_t in struct bpf_prog_stats [ Upstream commit 61a0abaee2092eee69e44fe60336aa2f5b578938 ] Commit 316580b69d0a ("u64_stats: provide u64_stats_t type") fixed possible load/store tearing on 64bit arches. For instance the following C code stats->nsecs += sched_clock() - start; Could be rightfully implemented like this by a compiler, confusing concurrent readers a lot: stats->nsecs += sched_clock(); // arbitrary delay stats->nsecs -= start; Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211026214133.3114279-4-eric.dumazet@gmail.com Signed-off-by: Sasha Levin --- include/linux/filter.h | 10 +++++----- kernel/bpf/syscall.c | 18 ++++++++++++------ kernel/bpf/trampoline.c | 6 +++--- 3 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1611dc9d4420..a9956b681f09 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -554,9 +554,9 @@ struct bpf_binary_header { }; struct bpf_prog_stats { - u64 cnt; - u64 nsecs; - u64 misses; + u64_stats_t cnt; + u64_stats_t nsecs; + u64_stats_t misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); @@ -618,8 +618,8 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, ret = dfunc(ctx, prog->insnsi, prog->bpf_func); stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); - stats->cnt++; - stats->nsecs += sched_clock() - start; + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, sched_clock() - start); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 53384622e8da..42490c39dfbf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1824,8 +1824,14 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +struct bpf_prog_kstats { + u64 nsecs; + u64 cnt; + u64 misses; +}; + static void bpf_prog_get_stats(const struct bpf_prog *prog, - struct bpf_prog_stats *stats) + struct bpf_prog_kstats *stats) { u64 nsecs = 0, cnt = 0, misses = 0; int cpu; @@ -1838,9 +1844,9 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); - tnsecs = st->nsecs; - tcnt = st->cnt; - tmisses = st->misses; + tnsecs = u64_stats_read(&st->nsecs); + tcnt = u64_stats_read(&st->cnt); + tmisses = u64_stats_read(&st->misses); } while (u64_stats_fetch_retry_irq(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; @@ -1856,7 +1862,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - struct bpf_prog_stats stats; + struct bpf_prog_kstats stats; bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); @@ -3595,7 +3601,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info; u32 info_len = attr->info.info_len; - struct bpf_prog_stats stats; + struct bpf_prog_kstats stats; char __user *uinsns; u32 ulen; int err; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d3a307a8c42b..6933a9bfee63 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -544,7 +544,7 @@ static void notrace inc_misses_counter(struct bpf_prog *prog) stats = this_cpu_ptr(prog->stats); u64_stats_update_begin(&stats->syncp); - stats->misses++; + u64_stats_inc(&stats->misses); u64_stats_update_end(&stats->syncp); } @@ -589,8 +589,8 @@ static void notrace update_prog_stats(struct bpf_prog *prog, stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); - stats->cnt++; - stats->nsecs += sched_clock() - start; + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, sched_clock() - start); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } -- cgit v1.2.3 From 3a3aa0881aebf1b81d0a37bb07d2b1aa307e4895 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 6 Oct 2021 18:06:54 -0700 Subject: of: net: move of_net under net/ [ Upstream commit e330fb14590c5c80f7195c3d8c9b4bcf79e1a5cd ] Rob suggests to move of_net.c from under drivers/of/ somewhere to the networking code. Suggested-by: Rob Herring Signed-off-by: Jakub Kicinski Reviewed-by: Rob Herring Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/amd/Kconfig | 2 +- drivers/net/ethernet/arc/Kconfig | 4 +- drivers/net/ethernet/ezchip/Kconfig | 2 +- drivers/net/ethernet/litex/Kconfig | 2 +- drivers/net/ethernet/mscc/Kconfig | 2 +- drivers/of/Kconfig | 4 - drivers/of/Makefile | 1 - drivers/of/of_net.c | 145 ------------------------------------ include/linux/of_net.h | 2 +- net/core/Makefile | 1 + net/core/net-sysfs.c | 2 +- net/core/of_net.c | 145 ++++++++++++++++++++++++++++++++++++ 12 files changed, 154 insertions(+), 158 deletions(-) delete mode 100644 drivers/of/of_net.c create mode 100644 net/core/of_net.c (limited to 'include') diff --git a/drivers/net/ethernet/amd/Kconfig b/drivers/net/ethernet/amd/Kconfig index 4786f0504691..899c8a2a34b6 100644 --- a/drivers/net/ethernet/amd/Kconfig +++ b/drivers/net/ethernet/amd/Kconfig @@ -168,7 +168,7 @@ config SUNLANCE config AMD_XGBE tristate "AMD 10GbE Ethernet driver" - depends on ((OF_NET && OF_ADDRESS) || ACPI || PCI) && HAS_IOMEM + depends on (OF_ADDRESS || ACPI || PCI) && HAS_IOMEM depends on X86 || ARM64 || COMPILE_TEST depends on PTP_1588_CLOCK_OPTIONAL select BITREVERSE diff --git a/drivers/net/ethernet/arc/Kconfig b/drivers/net/ethernet/arc/Kconfig index 92a79c4ffa2c..0a67612af228 100644 --- a/drivers/net/ethernet/arc/Kconfig +++ b/drivers/net/ethernet/arc/Kconfig @@ -26,7 +26,7 @@ config ARC_EMAC_CORE config ARC_EMAC tristate "ARC EMAC support" select ARC_EMAC_CORE - depends on OF_IRQ && OF_NET + depends on OF_IRQ depends on ARC || COMPILE_TEST help On some legacy ARC (Synopsys) FPGA boards such as ARCAngel4/ML50x @@ -36,7 +36,7 @@ config ARC_EMAC config EMAC_ROCKCHIP tristate "Rockchip EMAC support" select ARC_EMAC_CORE - depends on OF_IRQ && OF_NET && REGULATOR + depends on OF_IRQ && REGULATOR depends on ARCH_ROCKCHIP || COMPILE_TEST help Support for Rockchip RK3036/RK3066/RK3188 EMAC ethernet controllers. diff --git a/drivers/net/ethernet/ezchip/Kconfig b/drivers/net/ethernet/ezchip/Kconfig index 38aa824efb25..9241b9b1c7a3 100644 --- a/drivers/net/ethernet/ezchip/Kconfig +++ b/drivers/net/ethernet/ezchip/Kconfig @@ -18,7 +18,7 @@ if NET_VENDOR_EZCHIP config EZCHIP_NPS_MANAGEMENT_ENET tristate "EZchip NPS management enet support" - depends on OF_IRQ && OF_NET + depends on OF_IRQ depends on HAS_IOMEM help Simple LAN device for debug or management purposes. diff --git a/drivers/net/ethernet/litex/Kconfig b/drivers/net/ethernet/litex/Kconfig index 63bf01d28f0c..f99adbf26ab4 100644 --- a/drivers/net/ethernet/litex/Kconfig +++ b/drivers/net/ethernet/litex/Kconfig @@ -17,7 +17,7 @@ if NET_VENDOR_LITEX config LITEX_LITEETH tristate "LiteX Ethernet support" - depends on OF_NET + depends on OF help If you wish to compile a kernel for hardware with a LiteX LiteEth device then you should answer Y to this. diff --git a/drivers/net/ethernet/mscc/Kconfig b/drivers/net/ethernet/mscc/Kconfig index b6a73d151dec..8dd8c7f425d2 100644 --- a/drivers/net/ethernet/mscc/Kconfig +++ b/drivers/net/ethernet/mscc/Kconfig @@ -28,7 +28,7 @@ config MSCC_OCELOT_SWITCH depends on BRIDGE || BRIDGE=n depends on NET_SWITCHDEV depends on HAS_IOMEM - depends on OF_NET + depends on OF select MSCC_OCELOT_SWITCH_LIB select GENERIC_PHY help diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig index 3dfeae8912df..80b5fd44ab1c 100644 --- a/drivers/of/Kconfig +++ b/drivers/of/Kconfig @@ -70,10 +70,6 @@ config OF_IRQ def_bool y depends on !SPARC && IRQ_DOMAIN -config OF_NET - depends on NETDEVICES - def_bool y - config OF_RESERVED_MEM def_bool OF_EARLY_FLATTREE diff --git a/drivers/of/Makefile b/drivers/of/Makefile index c13b982084a3..e0360a44306e 100644 --- a/drivers/of/Makefile +++ b/drivers/of/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_OF_EARLY_FLATTREE) += fdt_address.o obj-$(CONFIG_OF_PROMTREE) += pdt.o obj-$(CONFIG_OF_ADDRESS) += address.o obj-$(CONFIG_OF_IRQ) += irq.o -obj-$(CONFIG_OF_NET) += of_net.o obj-$(CONFIG_OF_UNITTEST) += unittest.o obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o obj-$(CONFIG_OF_RESOLVE) += resolver.o diff --git a/drivers/of/of_net.c b/drivers/of/of_net.c deleted file mode 100644 index dbac3a172a11..000000000000 --- a/drivers/of/of_net.c +++ /dev/null @@ -1,145 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * OF helpers for network devices. - * - * Initially copied out of arch/powerpc/kernel/prom_parse.c - */ -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * of_get_phy_mode - Get phy mode for given device_node - * @np: Pointer to the given device_node - * @interface: Pointer to the result - * - * The function gets phy interface string from property 'phy-mode' or - * 'phy-connection-type'. The index in phy_modes table is set in - * interface and 0 returned. In case of error interface is set to - * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV. - */ -int of_get_phy_mode(struct device_node *np, phy_interface_t *interface) -{ - const char *pm; - int err, i; - - *interface = PHY_INTERFACE_MODE_NA; - - err = of_property_read_string(np, "phy-mode", &pm); - if (err < 0) - err = of_property_read_string(np, "phy-connection-type", &pm); - if (err < 0) - return err; - - for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) - if (!strcasecmp(pm, phy_modes(i))) { - *interface = i; - return 0; - } - - return -ENODEV; -} -EXPORT_SYMBOL_GPL(of_get_phy_mode); - -static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr) -{ - struct property *pp = of_find_property(np, name, NULL); - - if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) { - memcpy(addr, pp->value, ETH_ALEN); - return 0; - } - return -ENODEV; -} - -static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) -{ - struct platform_device *pdev = of_find_device_by_node(np); - struct nvmem_cell *cell; - const void *mac; - size_t len; - int ret; - - /* Try lookup by device first, there might be a nvmem_cell_lookup - * associated with a given device. - */ - if (pdev) { - ret = nvmem_get_mac_address(&pdev->dev, addr); - put_device(&pdev->dev); - return ret; - } - - cell = of_nvmem_cell_get(np, "mac-address"); - if (IS_ERR(cell)) - return PTR_ERR(cell); - - mac = nvmem_cell_read(cell, &len); - nvmem_cell_put(cell); - - if (IS_ERR(mac)) - return PTR_ERR(mac); - - if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { - kfree(mac); - return -EINVAL; - } - - memcpy(addr, mac, ETH_ALEN); - kfree(mac); - - return 0; -} - -/** - * of_get_mac_address() - * @np: Caller's Device Node - * @addr: Pointer to a six-byte array for the result - * - * Search the device tree for the best MAC address to use. 'mac-address' is - * checked first, because that is supposed to contain to "most recent" MAC - * address. If that isn't set, then 'local-mac-address' is checked next, - * because that is the default address. If that isn't set, then the obsolete - * 'address' is checked, just in case we're using an old device tree. If any - * of the above isn't set, then try to get MAC address from nvmem cell named - * 'mac-address'. - * - * Note that the 'address' property is supposed to contain a virtual address of - * the register set, but some DTS files have redefined that property to be the - * MAC address. - * - * All-zero MAC addresses are rejected, because those could be properties that - * exist in the device tree, but were not set by U-Boot. For example, the - * DTS could define 'mac-address' and 'local-mac-address', with zero MAC - * addresses. Some older U-Boots only initialized 'local-mac-address'. In - * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists - * but is all zeros. - * - * Return: 0 on success and errno in case of error. -*/ -int of_get_mac_address(struct device_node *np, u8 *addr) -{ - int ret; - - if (!np) - return -ENODEV; - - ret = of_get_mac_addr(np, "mac-address", addr); - if (!ret) - return 0; - - ret = of_get_mac_addr(np, "local-mac-address", addr); - if (!ret) - return 0; - - ret = of_get_mac_addr(np, "address", addr); - if (!ret) - return 0; - - return of_get_mac_addr_nvmem(np, addr); -} -EXPORT_SYMBOL(of_get_mac_address); diff --git a/include/linux/of_net.h b/include/linux/of_net.h index daef3b0d9270..cf31188329b5 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -8,7 +8,7 @@ #include -#ifdef CONFIG_OF_NET +#ifdef CONFIG_OF #include struct net_device; diff --git a/net/core/Makefile b/net/core/Makefile index 35ced6201814..4268846f2f47 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -36,3 +36,4 @@ obj-$(CONFIG_FAILOVER) += failover.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o +obj-$(CONFIG_OF) += of_net.o diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a4ae65263384..d7f9ee830d34 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1927,7 +1927,7 @@ static struct class net_class __ro_after_init = { .get_ownership = net_get_ownership, }; -#ifdef CONFIG_OF_NET +#ifdef CONFIG_OF static int of_dev_node_match(struct device *dev, const void *data) { for (; dev; dev = dev->parent) { diff --git a/net/core/of_net.c b/net/core/of_net.c new file mode 100644 index 000000000000..dbac3a172a11 --- /dev/null +++ b/net/core/of_net.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * OF helpers for network devices. + * + * Initially copied out of arch/powerpc/kernel/prom_parse.c + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * of_get_phy_mode - Get phy mode for given device_node + * @np: Pointer to the given device_node + * @interface: Pointer to the result + * + * The function gets phy interface string from property 'phy-mode' or + * 'phy-connection-type'. The index in phy_modes table is set in + * interface and 0 returned. In case of error interface is set to + * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV. + */ +int of_get_phy_mode(struct device_node *np, phy_interface_t *interface) +{ + const char *pm; + int err, i; + + *interface = PHY_INTERFACE_MODE_NA; + + err = of_property_read_string(np, "phy-mode", &pm); + if (err < 0) + err = of_property_read_string(np, "phy-connection-type", &pm); + if (err < 0) + return err; + + for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) + if (!strcasecmp(pm, phy_modes(i))) { + *interface = i; + return 0; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(of_get_phy_mode); + +static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr) +{ + struct property *pp = of_find_property(np, name, NULL); + + if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) { + memcpy(addr, pp->value, ETH_ALEN); + return 0; + } + return -ENODEV; +} + +static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) +{ + struct platform_device *pdev = of_find_device_by_node(np); + struct nvmem_cell *cell; + const void *mac; + size_t len; + int ret; + + /* Try lookup by device first, there might be a nvmem_cell_lookup + * associated with a given device. + */ + if (pdev) { + ret = nvmem_get_mac_address(&pdev->dev, addr); + put_device(&pdev->dev); + return ret; + } + + cell = of_nvmem_cell_get(np, "mac-address"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + mac = nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { + kfree(mac); + return -EINVAL; + } + + memcpy(addr, mac, ETH_ALEN); + kfree(mac); + + return 0; +} + +/** + * of_get_mac_address() + * @np: Caller's Device Node + * @addr: Pointer to a six-byte array for the result + * + * Search the device tree for the best MAC address to use. 'mac-address' is + * checked first, because that is supposed to contain to "most recent" MAC + * address. If that isn't set, then 'local-mac-address' is checked next, + * because that is the default address. If that isn't set, then the obsolete + * 'address' is checked, just in case we're using an old device tree. If any + * of the above isn't set, then try to get MAC address from nvmem cell named + * 'mac-address'. + * + * Note that the 'address' property is supposed to contain a virtual address of + * the register set, but some DTS files have redefined that property to be the + * MAC address. + * + * All-zero MAC addresses are rejected, because those could be properties that + * exist in the device tree, but were not set by U-Boot. For example, the + * DTS could define 'mac-address' and 'local-mac-address', with zero MAC + * addresses. Some older U-Boots only initialized 'local-mac-address'. In + * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists + * but is all zeros. + * + * Return: 0 on success and errno in case of error. +*/ +int of_get_mac_address(struct device_node *np, u8 *addr) +{ + int ret; + + if (!np) + return -ENODEV; + + ret = of_get_mac_addr(np, "mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "local-mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "address", addr); + if (!ret) + return 0; + + return of_get_mac_addr_nvmem(np, addr); +} +EXPORT_SYMBOL(of_get_mac_address); -- cgit v1.2.3 From 9285523b41746110814058f99a940f3edf2b5582 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Oct 2021 11:00:37 +0200 Subject: net: of: fix stub of_net helpers for CONFIG_NET=n [ Upstream commit 8b017fbe0bbb98dd71fb4850f6b9cc0e136a26b8 ] Moving the of_net code from drivers/of/ to net/core means we no longer stub out the helpers when networking is disabled, which leads to a randconfig build failure with at least one ARM platform that calls this from non-networking code: arm-linux-gnueabi-ld: arch/arm/mach-mvebu/kirkwood.o: in function `kirkwood_dt_eth_fixup': kirkwood.c:(.init.text+0x54): undefined reference to `of_get_mac_address' Restore the way this worked before by changing that #ifdef check back to testing for both CONFIG_OF and CONFIG_NET. Fixes: e330fb14590c ("of: net: move of_net under net/") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20211014090055.2058949-1-arnd@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/linux/of_net.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index cf31188329b5..55460ecfa50a 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -8,7 +8,7 @@ #include -#ifdef CONFIG_OF +#if defined(CONFIG_OF) && defined(CONFIG_NET) #include struct net_device; -- cgit v1.2.3 From 4020d2e14f13155fe1f2a7adeb19825210a5749e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Feb 2022 16:14:32 +0200 Subject: xfrm: enforce validity of offload input flags commit 7c76ecd9c99b6e9a771d813ab1aa7fa428b3ade1 upstream. struct xfrm_user_offload has flags variable that received user input, but kernel didn't check if valid bits were provided. It caused a situation where not sanitized input was forwarded directly to the drivers. For example, XFRM_OFFLOAD_IPV6 define that was exposed, was used by strongswan, but not implemented in the kernel at all. As a solution, check and sanitize input flags to forward XFRM_OFFLOAD_INBOUND to the drivers. Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/xfrm.h | 6 ++++++ net/xfrm/xfrm_device.c | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4e29d7851890..65e13a099b1a 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -511,6 +511,12 @@ struct xfrm_user_offload { int ifindex; __u8 flags; }; +/* This flag was exposed without any kernel code that supporting it. + * Unfortunately, strongswan has the code that uses sets this flag, + * which makes impossible to reuse this bit. + * + * So leave it here to make sure that it won't be reused by mistake. + */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index e843b0d9e2a6..c255aac6b816 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -223,6 +223,9 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (x->encap || x->tfcpad) return -EINVAL; + if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) + return -EINVAL; + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -261,7 +264,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->dev = dev; xso->real_dev = dev; xso->num_exthdrs = 1; - xso->flags = xuo->flags; + /* Don't forward bit that is not implemented */ + xso->flags = xuo->flags & ~XFRM_OFFLOAD_IPV6; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { -- cgit v1.2.3 From dd648bd1b33a828f62befa696b206c688da0ec43 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 28 Feb 2022 06:22:22 +0100 Subject: netfilter: nf_queue: fix possible use-after-free commit c3873070247d9e3c7a6b0cf9bf9b45e8018427b1 upstream. Eric Dumazet says: The sock_hold() side seems suspect, because there is no guarantee that sk_refcnt is not already 0. On failure, we cannot queue the packet and need to indicate an error. The packet will be dropped by the caller. v2: split skb prefetch hunk into separate change Fixes: 271b72c7fa82c ("udp: RCU handling for Unicast packets.") Reported-by: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Greg Kroah-Hartman --- include/net/netfilter/nf_queue.h | 2 +- net/netfilter/nf_queue.c | 13 +++++++++---- net/netfilter/nfnetlink_queue.c | 12 +++++++++--- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 9eed51e920e8..980daa6e1e3a 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -37,7 +37,7 @@ void nf_register_queue_handler(const struct nf_queue_handler *qh); void nf_unregister_queue_handler(void); void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict); -void nf_queue_entry_get_refs(struct nf_queue_entry *entry); +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry); void nf_queue_entry_free(struct nf_queue_entry *entry); static inline void init_hashrandom(u32 *jhash_initval) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 5ab0680db445..e39549c55945 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -96,19 +96,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) } /* Bump dev refs so they don't vanish while packet is out */ -void nf_queue_entry_get_refs(struct nf_queue_entry *entry) +bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; + if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) + return false; + dev_hold(state->in); dev_hold(state->out); - if (state->sk) - sock_hold(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_hold(entry->physin); dev_hold(entry->physout); #endif + return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -196,7 +198,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, __nf_queue_entry_init_physdevs(entry); - nf_queue_entry_get_refs(entry); + if (!nf_queue_entry_get_refs(entry)) { + kfree(entry); + return -ENOTCONN; + } switch (entry->state.pf) { case AF_INET: diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 959527708e38..8787d0613ad8 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -710,9 +710,15 @@ static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); - if (entry) - nf_queue_entry_get_refs(entry); - return entry; + + if (!entry) + return NULL; + + if (nf_queue_entry_get_refs(entry)) + return entry; + + kfree(entry); + return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -- cgit v1.2.3 From 3411613611a5cddf7e80908010dc87cb527dd13b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Feb 2022 10:16:57 +0100 Subject: sched: Fix yet more sched_fork() races commit b1e8206582f9d680cff7d04828708c8b6ab32957 upstream. Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash. Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is trying to fix a single instance of this, instead fix the whole class of issues, effectively reverting this commit. Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tadeusz Struk Tested-by: Zhang Qiao Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sched/task.h | 4 ++-- kernel/fork.c | 13 ++++++++++++- kernel/sched/core.c | 34 +++++++++++++++++++++------------- 3 files changed, 35 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 058d7f371e25..8db25fcc1eba 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p, - struct kernel_clone_args *kargs); +extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index 28aee1a8875b..89475c994ca9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2296,6 +2296,17 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_put_pidfd; + /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do @@ -2405,7 +2416,7 @@ static __latent_entropy struct task_struct *copy_process( fd_install(pidfd, pidfile); proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 57e5c7914296..a0747eaa2dba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1199,9 +1199,8 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { - bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -4359,7 +4358,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; - set_load_weight(p); + set_load_weight(p, false); /* * We don't need the reset flag anymore after the fork. It has @@ -4377,6 +4376,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4392,18 +4392,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4414,7 +4419,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); } @@ -6903,7 +6911,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); @@ -7194,7 +7202,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); } /* @@ -9432,7 +9440,7 @@ void __init sched_init(void) #endif } - set_load_weight(&init_task); + set_load_weight(&init_task, false); /* * The boot idle thread does lazy MMU switching as well: -- cgit v1.2.3 From b355d6a14ba709da67dbcceb5e9d8a73dfd3a99a Mon Sep 17 00:00:00 2001 From: William Mahon Date: Thu, 3 Mar 2022 18:23:42 -0800 Subject: HID: add mapping for KEY_DICTATE commit bfa26ba343c727e055223be04e08f2ebdd43c293 upstream. Numerous keyboards are adding dictate keys which allows for text messages to be dictated by a microphone. This patch adds a new key definition KEY_DICTATE and maps 0x0c/0x0d8 usage code to this new keycode. Additionally hid-debug is adjusted to recognize this new usage code as well. Signed-off-by: William Mahon Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220303021501.1.I5dbf50eb1a7a6734ee727bda4a8573358c6d3ec0@changeid Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-debug.c | 1 + drivers/hid/hid-input.c | 1 + include/uapi/linux/input-event-codes.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include') diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index fa57d05badf7..4d41ec9d8b7d 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -934,6 +934,7 @@ static const char *keys[KEY_MAX + 1] = { [KEY_ASSISTANT] = "Assistant", [KEY_KBD_LAYOUT_NEXT] = "KbdLayoutNext", [KEY_EMOJI_PICKER] = "EmojiPicker", + [KEY_DICTATE] = "Dictate", [KEY_BRIGHTNESS_MIN] = "BrightnessMin", [KEY_BRIGHTNESS_MAX] = "BrightnessMax", [KEY_BRIGHTNESS_AUTO] = "BrightnessAuto", diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 3d33c0c06cbb..86bb6a88364a 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -991,6 +991,7 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x0cd: map_key_clear(KEY_PLAYPAUSE); break; case 0x0cf: map_key_clear(KEY_VOICECOMMAND); break; + case 0x0d8: map_key_clear(KEY_DICTATE); break; case 0x0d9: map_key_clear(KEY_EMOJI_PICKER); break; case 0x0e0: map_abs_clear(ABS_VOLUME); break; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 225ec87d4f22..4db5d41848e4 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -612,6 +612,7 @@ #define KEY_ASSISTANT 0x247 /* AL Context-aware desktop assistant */ #define KEY_KBD_LAYOUT_NEXT 0x248 /* AC Next Keyboard Layout Select */ #define KEY_EMOJI_PICKER 0x249 /* Show/hide emoji picker (HUTRR101) */ +#define KEY_DICTATE 0x24a /* Start or Stop Voice Dictation Session (HUTRR99) */ #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ -- cgit v1.2.3 From aa6d3eef28f348f1fed15e9ebac9752f93b8a9b9 Mon Sep 17 00:00:00 2001 From: William Mahon Date: Thu, 3 Mar 2022 18:26:22 -0800 Subject: HID: add mapping for KEY_ALL_APPLICATIONS commit 327b89f0acc4c20a06ed59e4d9af7f6d804dc2e2 upstream. This patch adds a new key definition for KEY_ALL_APPLICATIONS and aliases KEY_DASHBOARD to it. It also maps the 0x0c/0x2a2 usage code to KEY_ALL_APPLICATIONS. Signed-off-by: William Mahon Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220303035618.1.I3a7746ad05d270161a18334ae06e3b6db1a1d339@changeid Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-debug.c | 4 +++- drivers/hid/hid-input.c | 2 ++ include/uapi/linux/input-event-codes.h | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/hid/hid-debug.c b/drivers/hid/hid-debug.c index 4d41ec9d8b7d..f48d3534e020 100644 --- a/drivers/hid/hid-debug.c +++ b/drivers/hid/hid-debug.c @@ -825,7 +825,9 @@ static const char *keys[KEY_MAX + 1] = { [KEY_F22] = "F22", [KEY_F23] = "F23", [KEY_F24] = "F24", [KEY_PLAYCD] = "PlayCD", [KEY_PAUSECD] = "PauseCD", [KEY_PROG3] = "Prog3", - [KEY_PROG4] = "Prog4", [KEY_SUSPEND] = "Suspend", + [KEY_PROG4] = "Prog4", + [KEY_ALL_APPLICATIONS] = "AllApplications", + [KEY_SUSPEND] = "Suspend", [KEY_CLOSE] = "Close", [KEY_PLAY] = "Play", [KEY_FASTFORWARD] = "FastForward", [KEY_BASSBOOST] = "BassBoost", [KEY_PRINT] = "Print", [KEY_HP] = "HP", diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 86bb6a88364a..3172987a9ebd 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1083,6 +1083,8 @@ static void hidinput_configure_usage(struct hid_input *hidinput, struct hid_fiel case 0x29d: map_key_clear(KEY_KBD_LAYOUT_NEXT); break; + case 0x2a2: map_key_clear(KEY_ALL_APPLICATIONS); break; + case 0x2c7: map_key_clear(KEY_KBDINPUTASSIST_PREV); break; case 0x2c8: map_key_clear(KEY_KBDINPUTASSIST_NEXT); break; case 0x2c9: map_key_clear(KEY_KBDINPUTASSIST_PREVGROUP); break; diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 4db5d41848e4..7989d9483ea7 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -278,7 +278,8 @@ #define KEY_PAUSECD 201 #define KEY_PROG3 202 #define KEY_PROG4 203 -#define KEY_DASHBOARD 204 /* AL Dashboard */ +#define KEY_ALL_APPLICATIONS 204 /* AC Desktop Show All Applications */ +#define KEY_DASHBOARD KEY_ALL_APPLICATIONS #define KEY_SUSPEND 205 #define KEY_CLOSE 206 /* AC Close */ #define KEY_PLAY 207 -- cgit v1.2.3 From 8b893496892eb0b51e4d6d523697eb265f6d5d1b Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Wed, 26 Jan 2022 16:00:18 +0100 Subject: Revert "xfrm: xfrm_state_mtu should return at least 1280 for ipv6" commit a6d95c5a628a09be129f25d5663a7e9db8261f51 upstream. This reverts commit b515d2637276a3810d6595e10ab02c13bfd0b63a. Commit b515d2637276a3810d6595e10ab02c13bfd0b63a ("xfrm: xfrm_state_mtu should return at least 1280 for ipv6") in v5.14 breaks the TCP MSS calculation in ipsec transport mode, resulting complete stalls of TCP connections. This happens when the (P)MTU is 1280 or slighly larger. The desired formula for the MSS is: MSS = (MTU - ESP_overhead) - IP header - TCP header However, the above commit clamps the (MTU - ESP_overhead) to a minimum of 1280, turning the formula into MSS = max(MTU - ESP overhead, 1280) - IP header - TCP header With the (P)MTU near 1280, the calculated MSS is too large and the resulting TCP packets never make it to the destination because they are over the actual PMTU. The above commit also causes suboptimal double fragmentation in xfrm tunnel mode, as described in https://lore.kernel.org/netdev/20210429202529.codhwpc7w6kbudug@dwarf.suse.cz/ The original problem the above commit was trying to fix is now fixed by commit 6596a0229541270fb8d38d989f91b78838e5e9da ("xfrm: fix MTU regression"). Signed-off-by: Jiri Bohac Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman --- include/net/xfrm.h | 1 - net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- net/xfrm/xfrm_state.c | 14 ++------------ 4 files changed, 4 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2b1ce8534993..301a164f17e9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1567,7 +1567,6 @@ void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 851f542928a3..e1b1d080e908 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -671,7 +671,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index f0bac6f7ab6b..883b53fd7846 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -708,7 +708,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); u32 padto; - padto = min(x->tfcpad, __xfrm_state_mtu(x, dst->child_mtu_cached)); + padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached)); if (skb->len < padto) esp.tfclen = padto - skb->len; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 78d51399a0f4..100b4b3723e7 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2571,7 +2571,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_delete_tunnel); -u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) +u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; @@ -2602,17 +2602,7 @@ u32 __xfrm_state_mtu(struct xfrm_state *x, int mtu) return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } -EXPORT_SYMBOL_GPL(__xfrm_state_mtu); - -u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) -{ - mtu = __xfrm_state_mtu(x, mtu); - - if (x->props.family == AF_INET6 && mtu < IPV6_MIN_MTU) - return IPV6_MIN_MTU; - - return mtu; -} +EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) { -- cgit v1.2.3 From 316e4a16524a2d2ce321f57c1abe4df9ef90f950 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 18 Feb 2022 11:49:08 -0800 Subject: x86/speculation: Include unprivileged eBPF status in Spectre v2 mitigation reporting commit 44a3918c8245ab10c6c9719dd12e7a8d291980d8 upstream. With unprivileged eBPF enabled, eIBRS (without retpoline) is vulnerable to Spectre v2 BHB-based attacks. When both are enabled, print a warning message and report it in the 'spectre_v2' sysfs vulnerabilities file. Signed-off-by: Josh Poimboeuf Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner [fllinden@amazon.com: backported to 5.15] Signed-off-by: Frank van der Linden Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 35 +++++++++++++++++++++++++++++------ include/linux/bpf.h | 12 ++++++++++++ kernel/sysctl.c | 7 +++++++ 3 files changed, 48 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 693f92b3af1b..1de7f1fe8939 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -650,6 +651,16 @@ static inline const char *spectre_v2_module_string(void) static inline const char *spectre_v2_module_string(void) { return ""; } #endif +#define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" + +#ifdef CONFIG_BPF_SYSCALL +void unpriv_ebpf_notify(int new_state) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && !new_state) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); +} +#endif + static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); @@ -994,6 +1005,9 @@ static void __init spectre_v2_select_mitigation(void) break; } + if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); + if (spectre_v2_in_eibrs_mode(mode)) { /* Force it so VMEXIT will restore correctly */ x86_spec_ctrl_base |= SPEC_CTRL_IBRS; @@ -1780,6 +1794,20 @@ static char *ibpb_state(void) return ""; } +static ssize_t spectre_v2_show_state(char *buf) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n"); + + return sprintf(buf, "%s%s%s%s%s%s\n", + spectre_v2_strings[spectre_v2_enabled], + ibpb_state(), + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + stibp_state(), + boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + spectre_v2_module_string()); +} + static ssize_t srbds_show_state(char *buf) { return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); @@ -1805,12 +1833,7 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return sprintf(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: - return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - ibpb_state(), - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - stibp_state(), - boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", - spectre_v2_module_string()); + return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d9049f2a78ca..15b690a0cecb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1666,6 +1666,12 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); + +static inline bool unprivileged_ebpf_enabled(void) +{ + return !sysctl_unprivileged_bpf_disabled; +} + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1884,6 +1890,12 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, { return NULL; } + +static inline bool unprivileged_ebpf_enabled(void) +{ + return false; +} + #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 083be6af29d7..0586047f7323 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -228,6 +228,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write, return ret; } +void __weak unpriv_ebpf_notify(int new_state) +{ +} + static int bpf_unpriv_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -245,6 +249,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, return -EPERM; *(int *)table->data = unpriv_enable; } + + unpriv_ebpf_notify(unpriv_enable); + return ret; } #endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ -- cgit v1.2.3 From 50e700a117669e072fb9e47ff3ea49e4a8cacf04 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 18 Nov 2021 13:59:46 +0000 Subject: arm64: entry: Add vectors that have the bhb mitigation sequences commit ba2689234be92024e5635d30fe744f4853ad97db upstream. Some CPUs affected by Spectre-BHB need a sequence of branches, or a firmware call to be run before any indirect branch. This needs to go in the vectors. No CPU needs both. While this can be patched in, it would run on all CPUs as there is a single set of vectors. If only one part of a big/little combination is affected, the unaffected CPUs have to run the mitigation too. Create extra vectors that include the sequence. Subsequent patches will allow affected CPUs to select this set of vectors. Later patches will modify the loop count to match what the CPU requires. Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/assembler.h | 24 +++++++++++++++++ arch/arm64/include/asm/vectors.h | 34 ++++++++++++++++++++++++ arch/arm64/kernel/entry.S | 53 +++++++++++++++++++++++++++++++------- arch/arm64/kernel/proton-pack.c | 16 ++++++++++++ include/linux/arm-smccc.h | 5 ++++ 5 files changed, 123 insertions(+), 9 deletions(-) create mode 100644 arch/arm64/include/asm/vectors.h (limited to 'include') diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index bfa58409a4d4..0f0a9be8cfcb 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -830,4 +830,28 @@ alternative_endif #endif /* GNU_PROPERTY_AARCH64_FEATURE_1_DEFAULT */ + .macro __mitigate_spectre_bhb_loop tmp +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + mov \tmp, #32 +.Lspectre_bhb_loop\@: + b . + 4 + subs \tmp, \tmp, #1 + b.ne .Lspectre_bhb_loop\@ + sb +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm + + /* Save/restores x0-x3 to the stack */ + .macro __mitigate_spectre_bhb_fw +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + stp x0, x1, [sp, #-16]! + stp x2, x3, [sp, #-16]! + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_3 +alternative_cb smccc_patch_fw_mitigation_conduit + nop // Patched to SMC/HVC #0 +alternative_cb_end + ldp x2, x3, [sp], #16 + ldp x0, x1, [sp], #16 +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + .endm #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/vectors.h b/arch/arm64/include/asm/vectors.h new file mode 100644 index 000000000000..16ca74260375 --- /dev/null +++ b/arch/arm64/include/asm/vectors.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 ARM Ltd. + */ +#ifndef __ASM_VECTORS_H +#define __ASM_VECTORS_H + +/* + * Note: the order of this enum corresponds to two arrays in entry.S: + * tramp_vecs and __bp_harden_el1_vectors. By default the canonical + * 'full fat' vectors are used directly. + */ +enum arm64_bp_harden_el1_vectors { +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + /* + * Perform the BHB loop mitigation, before branching to the canonical + * vectors. + */ + EL1_VECTOR_BHB_LOOP, + + /* + * Make the SMC call for firmware mitigation, before branching to the + * canonical vectors. + */ + EL1_VECTOR_BHB_FW, +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + + /* + * Remap the kernel before branching to the canonical vectors. + */ + EL1_VECTOR_KPTI, +}; + +#endif /* __ASM_VECTORS_H */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2c5dbeb5ceb9..ddd4665d861e 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -646,13 +646,26 @@ alternative_else_nop_endif sub \dst, \dst, PAGE_SIZE .endm - .macro tramp_ventry, vector_start, regsize, kpti + +#define BHB_MITIGATION_NONE 0 +#define BHB_MITIGATION_LOOP 1 +#define BHB_MITIGATION_FW 2 + + .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 1: .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif + .if \bhb == BHB_MITIGATION_LOOP + /* + * This sequence must appear before the first indirect branch. i.e. the + * ret out of tramp_ventry. It appears here because x30 is free. + */ + __mitigate_spectre_bhb_loop x30 + .endif // \bhb == BHB_MITIGATION_LOOP + .if \kpti == 1 /* * Defend against branch aliasing attacks by pushing a dummy @@ -680,6 +693,15 @@ alternative_else_nop_endif ldr x30, =vectors .endif // \kpti == 1 + .if \bhb == BHB_MITIGATION_FW + /* + * The firmware sequence must appear before the first indirect branch. + * i.e. the ret out of tramp_ventry. But it also needs the stack to be + * mapped to save/restore the registers the SMC clobbers. + */ + __mitigate_spectre_bhb_fw + .endif // \bhb == BHB_MITIGATION_FW + add x30, x30, #(1b - \vector_start + 4) ret .org 1b + 128 // Did we overflow the ventry slot? @@ -687,6 +709,9 @@ alternative_else_nop_endif .macro tramp_exit, regsize = 64 adr x30, tramp_vectors +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + add x30, x30, SZ_4K +#endif msr vbar_el1, x30 ldr lr, [sp, #S_LR] tramp_unmap_kernel x29 @@ -698,26 +723,32 @@ alternative_else_nop_endif sb .endm - .macro generate_tramp_vector, kpti + .macro generate_tramp_vector, kpti, bhb .Lvector_start\@: .space 0x400 .rept 4 - tramp_ventry .Lvector_start\@, 64, \kpti + tramp_ventry .Lvector_start\@, 64, \kpti, \bhb .endr .rept 4 - tramp_ventry .Lvector_start\@, 32, \kpti + tramp_ventry .Lvector_start\@, 32, \kpti, \bhb .endr .endm #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 /* * Exception vectors trampoline. + * The order must match __bp_harden_el1_vectors and the + * arm64_bp_harden_el1_vectors enum. */ .pushsection ".entry.tramp.text", "ax" .align 11 SYM_CODE_START_NOALIGN(tramp_vectors) - generate_tramp_vector kpti=1 +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_LOOP + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_FW +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ + generate_tramp_vector kpti=1, bhb=BHB_MITIGATION_NONE SYM_CODE_END(tramp_vectors) SYM_CODE_START(tramp_exit_native) @@ -744,7 +775,7 @@ SYM_DATA_END(__entry_tramp_data_start) * Exception vectors for spectre mitigations on entry from EL1 when * kpti is not in use. */ - .macro generate_el1_vector + .macro generate_el1_vector, bhb .Lvector_start\@: kernel_ventry 1, t, 64, sync // Synchronous EL1t kernel_ventry 1, t, 64, irq // IRQ EL1t @@ -757,17 +788,21 @@ SYM_DATA_END(__entry_tramp_data_start) kernel_ventry 1, h, 64, error // Error EL1h .rept 4 - tramp_ventry .Lvector_start\@, 64, kpti=0 + tramp_ventry .Lvector_start\@, 64, 0, \bhb .endr .rept 4 - tramp_ventry .Lvector_start\@, 32, kpti=0 + tramp_ventry .Lvector_start\@, 32, 0, \bhb .endr .endm +/* The order must match tramp_vecs and the arm64_bp_harden_el1_vectors enum. */ .pushsection ".entry.text", "ax" .align 11 SYM_CODE_START(__bp_harden_el1_vectors) - generate_el1_vector +#ifdef CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY + generate_el1_vector bhb=BHB_MITIGATION_LOOP + generate_el1_vector bhb=BHB_MITIGATION_FW +#endif /* CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY */ SYM_CODE_END(__bp_harden_el1_vectors) .popsection diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 9394f21d7566..6a5eeb8beea3 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -770,3 +770,19 @@ int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) return -ENODEV; } } + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); +} + +/* Patched to NOP when enabled */ +void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt, + __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + BUG_ON(nr_inst != 1); +} diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 63ccb5252190..220c8c60e021 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -92,6 +92,11 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +#define ARM_SMCCC_ARCH_WORKAROUND_3 \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_32, \ + 0, 0x3fff) + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ ARM_SMCCC_SMC_32, \ -- cgit v1.2.3 From 1dd5b4b230f6d1345708c6204ccacdf83d53feaf Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 25 Feb 2022 16:05:41 +0100 Subject: xen/grant-table: add gnttab_try_end_foreign_access() Commit 6b1775f26a2da2b05a6dc8ec2b5d14e9a4701a1a upstream. Add a new grant table function gnttab_try_end_foreign_access(), which will remove and free a grant if it is not in use. Its main use case is to either free a grant if it is no longer in use, or to take some other action if it is still in use. This other action can be an error exit, or (e.g. in the case of blkfront persistent grant feature) some special handling. This is CVE-2022-23036, CVE-2022-23038 / part of XSA-396. Reported-by: Demi Marie Obenour Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman --- drivers/xen/grant-table.c | 14 ++++++++++++-- include/xen/grant_table.h | 12 ++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3729bea0c989..1b82e7a3722a 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -435,11 +435,21 @@ static void gnttab_add_deferred(grant_ref_t ref, bool readonly, what, ref, page ? page_to_pfn(page) : -1); } +int gnttab_try_end_foreign_access(grant_ref_t ref) +{ + int ret = _gnttab_end_foreign_access_ref(ref, 0); + + if (ret) + put_free_entry(ref); + + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_try_end_foreign_access); + void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page) { - if (gnttab_end_foreign_access_ref(ref, readonly)) { - put_free_entry(ref); + if (gnttab_try_end_foreign_access(ref)) { if (page != 0) put_page(virt_to_page(page)); } else diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index cb854df031ce..358d2817741b 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -104,10 +104,22 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * access has been ended, free the given page too. Access will be ended * immediately iff the grant entry is not in use, otherwise it will happen * some time later. page may be 0, in which case no freeing will occur. + * Note that the granted page might still be accessed (read or write) by the + * other side after gnttab_end_foreign_access() returns, so even if page was + * specified as 0 it is not allowed to just reuse the page for other + * purposes immediately. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); +/* + * End access through the given grant reference, iff the grant entry is + * no longer in use. In case of success ending foreign access, the + * grant reference is deallocated. + * Return 1 if the grant entry was freed, 0 if it is still in use. + */ +int gnttab_try_end_foreign_access(grant_ref_t ref); + int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); -- cgit v1.2.3 From 90c5f198b923b996f55d582d4e3cc8acd7aa4e57 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 25 Feb 2022 16:05:42 +0100 Subject: xen: remove gnttab_query_foreign_access() Commit 1dbd11ca75fe664d3e54607547771d021f531f59 upstream. Remove gnttab_query_foreign_access(), as it is unused and unsafe to use. All previous use cases assumed a grant would not be in use after gnttab_query_foreign_access() returned 0. This information is useless in best case, as it only refers to a situation in the past, which could have changed already. Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman --- drivers/xen/grant-table.c | 25 ------------------------- include/xen/grant_table.h | 2 -- 2 files changed, 27 deletions(-) (limited to 'include') diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 1b82e7a3722a..e6548910e79f 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -133,13 +133,6 @@ struct gnttab_ops { * return the frame. */ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); - /* - * Query the status of a grant entry. Ref parameter is reference of - * queried grant entry, return value is the status of queried entry. - * Detailed status(writing/reading) can be gotten from the return value - * by bit operations. - */ - int (*query_foreign_access)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -284,22 +277,6 @@ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, } EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); -static int gnttab_query_foreign_access_v1(grant_ref_t ref) -{ - return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); -} - -static int gnttab_query_foreign_access_v2(grant_ref_t ref) -{ - return grstatus[ref] & (GTF_reading|GTF_writing); -} - -int gnttab_query_foreign_access(grant_ref_t ref) -{ - return gnttab_interface->query_foreign_access(ref); -} -EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); - static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) { u16 flags, nflags; @@ -1427,7 +1404,6 @@ static const struct gnttab_ops gnttab_v1_ops = { .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, - .query_foreign_access = gnttab_query_foreign_access_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1439,7 +1415,6 @@ static const struct gnttab_ops gnttab_v2_ops = { .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, - .query_foreign_access = gnttab_query_foreign_access_v2, }; static bool gnttab_need_v2(void) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 358d2817741b..ab9e692a0ef4 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -125,8 +125,6 @@ int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); -int gnttab_query_foreign_access(grant_ref_t ref); - /* * operations on reserved batches of grant references */ -- cgit v1.2.3 From 27dc69aa55687ea288d0be6ce8f036330d1dddb5 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 25 Feb 2022 16:05:43 +0100 Subject: xen/gnttab: fix gnttab_end_foreign_access() without page specified Commit 42baefac638f06314298087394b982ead9ec444b upstream. gnttab_end_foreign_access() is used to free a grant reference and optionally to free the associated page. In case the grant is still in use by the other side processing is being deferred. This leads to a problem in case no page to be freed is specified by the caller: the caller doesn't know that the page is still mapped by the other side and thus should not be used for other purposes. The correct way to handle this situation is to take an additional reference to the granted page in case handling is being deferred and to drop that reference when the grant reference could be freed finally. This requires that there are no users of gnttab_end_foreign_access() left directly repurposing the granted page after the call, as this might result in clobbered data or information leaks via the not yet freed grant reference. This is part of CVE-2022-23041 / XSA-396. Reported-by: Simon Gaiser Signed-off-by: Juergen Gross Reviewed-by: Jan Beulich Signed-off-by: Greg Kroah-Hartman --- drivers/xen/grant-table.c | 36 +++++++++++++++++++++++++++++------- include/xen/grant_table.h | 7 ++++++- 2 files changed, 35 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index e6548910e79f..5c83d41766c8 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -133,6 +133,10 @@ struct gnttab_ops { * return the frame. */ unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); + /* + * Read the frame number related to a given grant reference. + */ + unsigned long (*read_frame)(grant_ref_t ref); }; struct unmap_refs_callback_data { @@ -330,6 +334,16 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) } EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); +static unsigned long gnttab_read_frame_v1(grant_ref_t ref) +{ + return gnttab_shared.v1[ref].frame; +} + +static unsigned long gnttab_read_frame_v2(grant_ref_t ref) +{ + return gnttab_shared.v2[ref].full_page.frame; +} + struct deferred_entry { struct list_head list; grant_ref_t ref; @@ -359,12 +373,9 @@ static void gnttab_handle_deferred(struct timer_list *unused) spin_unlock_irqrestore(&gnttab_list_lock, flags); if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { put_free_entry(entry->ref); - if (entry->page) { - pr_debug("freeing g.e. %#x (pfn %#lx)\n", - entry->ref, page_to_pfn(entry->page)); - put_page(entry->page); - } else - pr_info("freeing g.e. %#x\n", entry->ref); + pr_debug("freeing g.e. %#x (pfn %#lx)\n", + entry->ref, page_to_pfn(entry->page)); + put_page(entry->page); kfree(entry); entry = NULL; } else { @@ -389,9 +400,18 @@ static void gnttab_handle_deferred(struct timer_list *unused) static void gnttab_add_deferred(grant_ref_t ref, bool readonly, struct page *page) { - struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + struct deferred_entry *entry; + gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; const char *what = KERN_WARNING "leaking"; + entry = kmalloc(sizeof(*entry), gfp); + if (!page) { + unsigned long gfn = gnttab_interface->read_frame(ref); + + page = pfn_to_page(gfn_to_pfn(gfn)); + get_page(page); + } + if (entry) { unsigned long flags; @@ -1404,6 +1424,7 @@ static const struct gnttab_ops gnttab_v1_ops = { .update_entry = gnttab_update_entry_v1, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, + .read_frame = gnttab_read_frame_v1, }; static const struct gnttab_ops gnttab_v2_ops = { @@ -1415,6 +1436,7 @@ static const struct gnttab_ops gnttab_v2_ops = { .update_entry = gnttab_update_entry_v2, .end_foreign_access_ref = gnttab_end_foreign_access_ref_v2, .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v2, + .read_frame = gnttab_read_frame_v2, }; static bool gnttab_need_v2(void) diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index ab9e692a0ef4..c9fea9389ebe 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -107,7 +107,12 @@ int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); * Note that the granted page might still be accessed (read or write) by the * other side after gnttab_end_foreign_access() returns, so even if page was * specified as 0 it is not allowed to just reuse the page for other - * purposes immediately. + * purposes immediately. gnttab_end_foreign_access() will take an additional + * reference to the granted page in this case, which is dropped only after + * the grant is no longer in use. + * This requires that multi page allocations for areas subject to + * gnttab_end_foreign_access() are done via alloc_pages_exact() (and freeing + * via free_pages_exact()) in order to avoid high order pages. */ void gnttab_end_foreign_access(grant_ref_t ref, int readonly, unsigned long page); -- cgit v1.2.3 From 4aaabbffc3b0658ce80eebdde9bafa20a3f932e0 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 7 Mar 2022 13:11:39 +0100 Subject: esp: Fix possible buffer overflow in ESP transformation [ Upstream commit ebe48d368e97d007bfeb76fcb065d6cfc4c96645 ] The maximum message size that can be send is bigger than the maximum site that skb_page_frag_refill can allocate. So it is possible to write beyond the allocated buffer. Fix this by doing a fallback to COW in that case. v2: Avoid get get_order() costs as suggested by Linus Torvalds. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Reported-by: valis Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- include/net/esp.h | 2 ++ net/ipv4/esp4.c | 5 +++++ net/ipv6/esp6.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/esp.h b/include/net/esp.h index 9c5637d41d95..90cd02ff77ef 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index e1b1d080e908..70e6c87fbe3d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -446,6 +446,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -455,6 +456,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 883b53fd7846..b7b573085bd5 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -483,6 +483,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -491,6 +492,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } + allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); + if (allocsz > ESP_SKB_FRAG_MAXSIZE) + goto cow; + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; -- cgit v1.2.3 From b3d4a7dcb9cae0b74ac9909a5421f9c20d60d380 Mon Sep 17 00:00:00 2001 From: Mohammad Kabat Date: Thu, 25 Mar 2021 14:38:55 +0200 Subject: net/mlx5: Fix size field in bufferx_reg struct [ Upstream commit ac77998b7ac3044f0509b097da9637184598980d ] According to HW spec the field "size" should be 16 bits in bufferx register. Fixes: e281682bf294 ("net/mlx5_core: HW data structs/types definitions cleanup") Signed-off-by: Mohammad Kabat Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Sasha Levin --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 25d775764a5a..fdf4589ab4d4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9508,8 +9508,8 @@ struct mlx5_ifc_bufferx_reg_bits { u8 reserved_at_0[0x6]; u8 lossy[0x1]; u8 epsb[0x1]; - u8 reserved_at_8[0xc]; - u8 size[0xc]; + u8 reserved_at_8[0x8]; + u8 size[0x10]; u8 xoff_threshold[0x10]; u8 xon_threshold[0x10]; -- cgit v1.2.3 From 7403f4118ab94be837ab9d770507537a8057bc63 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Fri, 11 Feb 2022 02:12:52 +0100 Subject: swiotlb: fix info leak with DMA_FROM_DEVICE [ Upstream commit ddbd89deb7d32b1fbb879f48d68fda1a8ac58e8e ] The problem I'm addressing was discovered by the LTP test covering cve-2018-1000204. A short description of what happens follows: 1) The test case issues a command code 00 (TEST UNIT READY) via the SG_IO interface with: dxfer_len == 524288, dxdfer_dir == SG_DXFER_FROM_DEV and a corresponding dxferp. The peculiar thing about this is that TUR is not reading from the device. 2) In sg_start_req() the invocation of blk_rq_map_user() effectively bounces the user-space buffer. As if the device was to transfer into it. Since commit a45b599ad808 ("scsi: sg: allocate with __GFP_ZERO in sg_build_indirect()") we make sure this first bounce buffer is allocated with GFP_ZERO. 3) For the rest of the story we keep ignoring that we have a TUR, so the device won't touch the buffer we prepare as if the we had a DMA_FROM_DEVICE type of situation. My setup uses a virtio-scsi device and the buffer allocated by SG is mapped by the function virtqueue_add_split() which uses DMA_FROM_DEVICE for the "in" sgs (here scatter-gather and not scsi generics). This mapping involves bouncing via the swiotlb (we need swiotlb to do virtio in protected guest like s390 Secure Execution, or AMD SEV). 4) When the SCSI TUR is done, we first copy back the content of the second (that is swiotlb) bounce buffer (which most likely contains some previous IO data), to the first bounce buffer, which contains all zeros. Then we copy back the content of the first bounce buffer to the user-space buffer. 5) The test case detects that the buffer, which it zero-initialized, ain't all zeros and fails. One can argue that this is an swiotlb problem, because without swiotlb we leak all zeros, and the swiotlb should be transparent in a sense that it does not affect the outcome (if all other participants are well behaved). Copying the content of the original buffer into the swiotlb buffer is the only way I can think of to make swiotlb transparent in such scenarios. So let's do just that if in doubt, but allow the driver to tell us that the whole mapped buffer is going to be overwritten, in which case we can preserve the old behavior and avoid the performance impact of the extra bounce. Signed-off-by: Halil Pasic Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- Documentation/core-api/dma-attributes.rst | 8 ++++++++ include/linux/dma-mapping.h | 8 ++++++++ kernel/dma/swiotlb.c | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..17706dc91ec9 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,11 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_OVERWRITE +------------------ + +This is a hint to the DMA-mapping subsystem that the device is expected to +overwrite the entire mapped size, thus the caller does not require any of the +previous buffer contents to be preserved. This allows bounce-buffering +implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index dca2b1355bb1..6150d11a607e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,6 +61,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 87c40517e822..aca0690550e2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -579,7 +579,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -- cgit v1.2.3 From 2c1f97af38be151527380796d31d3c9adb054bf9 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Sat, 5 Mar 2022 18:07:14 +0100 Subject: swiotlb: rework "fix info leak with DMA_FROM_DEVICE" commit aa6f8dcbab473f3a3c7454b74caa46d36cdc5d13 upstream. Unfortunately, we ended up merging an old version of the patch "fix info leak with DMA_FROM_DEVICE" instead of merging the latest one. Christoph (the swiotlb maintainer), he asked me to create an incremental fix (after I have pointed this out the mix up, and asked him for guidance). So here we go. The main differences between what we got and what was agreed are: * swiotlb_sync_single_for_device is also required to do an extra bounce * We decided not to introduce DMA_ATTR_OVERWRITE until we have exploiters * The implantation of DMA_ATTR_OVERWRITE is flawed: DMA_ATTR_OVERWRITE must take precedence over DMA_ATTR_SKIP_CPU_SYNC Thus this patch removes DMA_ATTR_OVERWRITE, and makes swiotlb_sync_single_for_device() bounce unconditionally (that is, also when dir == DMA_TO_DEVICE) in order do avoid synchronising back stale data from the swiotlb buffer. Let me note, that if the size used with dma_sync_* API is less than the size used with dma_[un]map_*, under certain circumstances we may still end up with swiotlb not being transparent. In that sense, this is no perfect fix either. To get this bullet proof, we would have to bounce the entire mapping/bounce buffer. For that we would have to figure out the starting address, and the size of the mapping in swiotlb_sync_single_for_device(). While this does seem possible, there seems to be no firm consensus on how things are supposed to work. Signed-off-by: Halil Pasic Fixes: ddbd89deb7d3 ("swiotlb: fix info leak with DMA_FROM_DEVICE") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- Documentation/core-api/dma-attributes.rst | 8 -------- include/linux/dma-mapping.h | 8 -------- kernel/dma/swiotlb.c | 23 +++++++++++++++-------- 3 files changed, 15 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 17706dc91ec9..1887d92e8e92 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,11 +130,3 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). - -DMA_ATTR_OVERWRITE ------------------- - -This is a hint to the DMA-mapping subsystem that the device is expected to -overwrite the entire mapped size, thus the caller does not require any of the -previous buffer contents to be preserved. This allows bounce-buffering -implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 6150d11a607e..dca2b1355bb1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,14 +61,6 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) -/* - * This is a hint to the DMA-mapping subsystem that the device is expected - * to overwrite the entire mapped size, thus the caller does not require any - * of the previous buffer contents to be preserved. This allows - * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. - */ -#define DMA_ATTR_OVERWRITE (1UL << 10) - /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index aca0690550e2..e58dce93c661 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -578,10 +578,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -648,10 +652,13 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) - swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); + /* + * Unconditional bounce is necessary to avoid corruption on + * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite + * the whole lengt of the bounce buffer. + */ + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + BUG_ON(!valid_dma_direction(dir)); } void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, -- cgit v1.2.3 From 22823b1a0dc95810a430b831ef00cf9ac98aedf0 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 14:56:15 -0500 Subject: virtio: unexport virtio_finalize_features commit 838d6d3461db0fdbf33fc5f8a69c27b50b4a46da upstream. virtio_finalize_features is only used internally within virtio. No reason to export it. Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck Acked-by: Jason Wang Signed-off-by: Greg Kroah-Hartman --- drivers/virtio/virtio.c | 3 +-- include/linux/virtio.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 236081afe9a2..ed3646054346 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -166,7 +166,7 @@ void virtio_add_status(struct virtio_device *dev, unsigned int status) } EXPORT_SYMBOL_GPL(virtio_add_status); -int virtio_finalize_features(struct virtio_device *dev) +static int virtio_finalize_features(struct virtio_device *dev) { int ret = dev->config->finalize_features(dev); unsigned status; @@ -202,7 +202,6 @@ int virtio_finalize_features(struct virtio_device *dev) } return 0; } -EXPORT_SYMBOL_GPL(virtio_finalize_features); static int virtio_dev_probe(struct device *_d) { diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 41edbc01ffa4..1af8d65d4c8f 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -133,7 +133,6 @@ bool is_virtio_device(struct device *dev); void virtio_break_device(struct virtio_device *dev); void virtio_config_changed(struct virtio_device *dev); -int virtio_finalize_features(struct virtio_device *dev); #ifdef CONFIG_PM_SLEEP int virtio_device_freeze(struct virtio_device *dev); int virtio_device_restore(struct virtio_device *dev); -- cgit v1.2.3 From cbb726e6c652149e3bef4e4bc976a3864ec23c2b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 14 Jan 2022 14:58:41 -0500 Subject: virtio: acknowledge all features before access commit 4fa59ede95195f267101a1b8916992cf3f245cdb upstream. The feature negotiation was designed in a way that makes it possible for devices to know which config fields will be accessed by drivers. This is broken since commit 404123c2db79 ("virtio: allow drivers to validate features") with fallout in at least block and net. We have a partial work-around in commit 2f9a174f918e ("virtio: write back F_VERSION_1 before validate") which at least lets devices find out which format should config space have, but this is a partial fix: guests should not access config space without acknowledging features since otherwise we'll never be able to change the config space format. To fix, split finalize_features from virtio_finalize_features and call finalize_features with all feature bits before validation, and then - if validation changed any bits - once again after. Since virtio_finalize_features no longer writes out features rename it to virtio_features_ok - since that is what it does: checks that features are ok with the device. As a side effect, this also reduces the amount of hypervisor accesses - we now only acknowledge features once unless we are clearing any features when validating (which is uncommon). IRC I think that this was more or less always the intent in the spec but unfortunately the way the spec is worded does not say this explicitly, I plan to address this at the spec level, too. Acked-by: Jason Wang Cc: stable@vger.kernel.org Fixes: 404123c2db79 ("virtio: allow drivers to validate features") Fixes: 2f9a174f918e ("virtio: write back F_VERSION_1 before validate") Cc: "Halil Pasic" Signed-off-by: Michael S. Tsirkin Signed-off-by: Greg Kroah-Hartman --- drivers/virtio/virtio.c | 39 ++++++++++++++++++++++----------------- include/linux/virtio_config.h | 3 ++- 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index ed3646054346..c2b733ef95b0 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -166,14 +166,13 @@ void virtio_add_status(struct virtio_device *dev, unsigned int status) } EXPORT_SYMBOL_GPL(virtio_add_status); -static int virtio_finalize_features(struct virtio_device *dev) +/* Do some validation, then set FEATURES_OK */ +static int virtio_features_ok(struct virtio_device *dev) { - int ret = dev->config->finalize_features(dev); unsigned status; + int ret; might_sleep(); - if (ret) - return ret; ret = arch_has_restricted_virtio_memory_access(); if (ret) { @@ -238,17 +237,6 @@ static int virtio_dev_probe(struct device *_d) driver_features_legacy = driver_features; } - /* - * Some devices detect legacy solely via F_VERSION_1. Write - * F_VERSION_1 to force LE config space accesses before FEATURES_OK for - * these when needed. - */ - if (drv->validate && !virtio_legacy_is_little_endian() - && device_features & BIT_ULL(VIRTIO_F_VERSION_1)) { - dev->features = BIT_ULL(VIRTIO_F_VERSION_1); - dev->config->finalize_features(dev); - } - if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else @@ -259,13 +247,26 @@ static int virtio_dev_probe(struct device *_d) if (device_features & (1ULL << i)) __virtio_set_bit(dev, i); + err = dev->config->finalize_features(dev); + if (err) + goto err; + if (drv->validate) { + u64 features = dev->features; + err = drv->validate(dev); if (err) goto err; + + /* Did validation change any features? Then write them again. */ + if (features != dev->features) { + err = dev->config->finalize_features(dev); + if (err) + goto err; + } } - err = virtio_finalize_features(dev); + err = virtio_features_ok(dev); if (err) goto err; @@ -489,7 +490,11 @@ int virtio_device_restore(struct virtio_device *dev) /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); - ret = virtio_finalize_features(dev); + ret = dev->config->finalize_features(dev); + if (ret) + goto err; + + ret = virtio_features_ok(dev); if (ret) goto err; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8519b3ae5d52..b341dd62aa4d 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -62,8 +62,9 @@ struct virtio_shm_region { * Returns the first 64 feature bits (all we currently need). * @finalize_features: confirm what device features we'll be using. * vdev: the virtio_device - * This gives the final feature bits for the device: it can change + * This sends the driver feature bits to the device: it can change * the dev->feature bits if it wants. + * Note: despite the name this can be called any number of times. * Returns 0 on success or error status * @bus_name: return the bus name associated with the device (optional) * vdev: the virtio_device -- cgit v1.2.3 From 1b09f28f70a5046acd64138075ae3f095238b045 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Mar 2022 13:23:31 +0000 Subject: watch_queue: Fix filter limit check commit c993ee0f9f81caf5767a50d1faeba39a0dc82af2 upstream. In watch_queue_set_filter(), there are a couple of places where we check that the filter type value does not exceed what the type_filter bitmap can hold. One place calculates the number of bits by: if (tf[i].type >= sizeof(wfilter->type_filter) * 8) which is fine, but the second does: if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) which is not. This can lead to a couple of out-of-bounds writes due to a too-large type: (1) __set_bit() on wfilter->type_filter (2) Writing more elements in wfilter->filters[] than we allocated. Fix this by just using the proper WATCH_TYPE__NR instead, which is the number of types we actually know about. The bug may cause an oops looking something like: BUG: KASAN: slab-out-of-bounds in watch_queue_set_filter+0x659/0x740 Write of size 4 at addr ffff88800d2c66bc by task watch_queue_oob/611 ... Call Trace: dump_stack_lvl+0x45/0x59 print_address_description.constprop.0+0x1f/0x150 ... kasan_report.cold+0x7f/0x11b ... watch_queue_set_filter+0x659/0x740 ... __x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Allocated by task 611: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 watch_queue_set_filter+0x23a/0x740 __x64_sys_ioctl+0x127/0x190 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88800d2c66a0 which belongs to the cache kmalloc-32 of size 32 The buggy address is located 28 bytes inside of 32-byte region [ffff88800d2c66a0, ffff88800d2c66c0) Fixes: c73be61cede5 ("pipe: Add general notification queue support") Reported-by: Jann Horn Signed-off-by: David Howells Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/watch_queue.h | 3 ++- kernel/watch_queue.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index c994d1b2cdba..3b9a40ae8bdb 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h @@ -28,7 +28,8 @@ struct watch_type_filter { struct watch_filter { union { struct rcu_head rcu; - unsigned long type_filter[2]; /* Bitmask of accepted types */ + /* Bitmask of accepted types */ + DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[]; diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 9c9eb20dd2c5..427b0318e303 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -320,7 +320,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ - if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } @@ -336,7 +336,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { - if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; -- cgit v1.2.3 From 69b80587f650875a473c66df2a1120c0e656362b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Sep 2021 14:33:25 +0200 Subject: block: drop unused includes in commit b81e0c2372e65e5627864ba034433b64b2fc73f5 upstream. Drop various include not actually used in genhd.h itself, and move the remaning includes closer together. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210920123328.1399408-15-hch@lst.de Signed-off-by: Jens Axboe Reported-by: Sudip Mukherjee a Reported-by: "H. Nikolaus Schaller" Reported-by: Guenter Roeck Cc: "Maciej W. Rozycki" [ resolves MIPS build failure by luck, root cause needs to be fixed in Linus's tree properly, but this is needed for now to fix the build - gregkh ] Signed-off-by: Greg Kroah-Hartman --- arch/um/drivers/ubd_kern.c | 1 + block/genhd.c | 1 + block/holder.c | 1 + block/partitions/core.c | 1 + drivers/block/amiflop.c | 1 + drivers/block/ataflop.c | 1 + drivers/block/floppy.c | 1 + drivers/block/swim.c | 1 + drivers/block/xen-blkfront.c | 1 + drivers/md/md.c | 1 + drivers/s390/block/dasd_genhd.c | 1 + drivers/scsi/sd.c | 1 + drivers/scsi/sg.c | 1 + drivers/scsi/sr.c | 1 + drivers/scsi/st.c | 1 + include/linux/genhd.h | 14 ++------------ include/linux/part_stat.h | 1 + 17 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index cd9dc0556e91..fefd343412c7 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/block/genhd.c b/block/genhd.c index 2dcedbe4ef04..0276a2846adf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/block/holder.c b/block/holder.c index 9dc084182337..27cddce1b446 100644 --- a/block/holder.c +++ b/block/holder.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include struct bd_holder_disk { struct list_head list; diff --git a/block/partitions/core.c b/block/partitions/core.c index 7bea19dd9458..b9e9af84f518 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -5,6 +5,7 @@ * Copyright (C) 2020 Christoph Hellwig */ #include +#include #include #include #include diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 8b1714021498..1ed557cb5ed2 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index aab48b292a3b..82faaa458157 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 4a6a74177b3c..0f58594c5a4d 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -184,6 +184,7 @@ static int print_unex = 1; #include #include #include +#include #include #include #include diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 7ccc8d2a41bc..3911d0833e1b 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 1becbbb3be13..390817cf1221 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/md.c b/drivers/md/md.c index 2d31a079be33..5ce2648cbe5b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index fa966e0db6ca..3a6f3af240fa 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -14,6 +14,7 @@ #define KMSG_COMPONENT "dasd" #include +#include #include #include diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 78ead3369779..564a21b5da9d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 8f05248920e8..3c98f08dc25d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -31,6 +31,7 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include #include #include +#include #include #include #include diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 1203374828b9..973d6e079b02 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index ae8636d3780b..9933722acfd9 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -32,6 +32,7 @@ static const char *verstr = "20160209"; #include #include #include +#include #include #include #include diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 0f5315c2b5a3..0b48a0cf4262 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -12,12 +12,10 @@ #include #include -#include -#include -#include #include #include -#include +#include +#include extern const struct device_type disk_type; extern struct device_type part_type; @@ -26,14 +24,6 @@ extern struct class block_class; #define DISK_MAX_PARTS 256 #define DISK_NAME_LEN 32 -#include -#include -#include -#include -#include -#include -#include - #define PARTITION_META_INFO_VOLNAMELTH 64 /* * Enough for the string representation of any kind of UUID plus NULL. diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index d2558121d48c..6f7949b2fd8d 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -3,6 +3,7 @@ #define _LINUX_PART_STAT_H #include +#include struct disk_stats { u64 nsecs[NR_STAT_GROUPS]; -- cgit v1.2.3 From caf18e4da9bf28c79a22685dbd3a2c5f32df7c6b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 14 Mar 2022 01:24:59 +0200 Subject: Revert "net: dsa: mv88e6xxx: flush switchdev FDB workqueue before removing VLAN" This reverts commit 2566a89b9e163b2fcd104d6005e0149f197b8a48 which is commit a2614140dc0f467a83aa3bb4b6ee2d6480a76202 upstream. The above change depends on upstream commit 0faf890fc519 ("net: dsa: drop rtnl_lock from dsa_slave_switchdev_event_work"), which is not present in linux-5.15.y. Without that change, waiting for the switchdev workqueue causes deadlocks on the rtnl_mutex. Backporting the dependency commit isn't trivial/desirable, since it requires that the following dependencies of the dependency are also backported: df405910ab9f net: dsa: sja1105: wait for dynamic config command completion on writes too eb016afd83a9 net: dsa: sja1105: serialize access to the dynamic config interface 2468346c5677 net: mscc: ocelot: serialize access to the MAC table f7eb4a1c0864 net: dsa: b53: serialize access to the ARL table cf231b436f7c net: dsa: lantiq_gswip: serialize access to the PCE registers 338a3a4745aa net: dsa: introduce locking for the address lists on CPU and DSA ports and then this bugfix on top: 8940e6b669ca ("net: dsa: avoid call to __dev_set_promiscuity() while rtnl_mutex isn't held") Reported-by: Daniel Suchy Signed-off-by: Vladimir Oltean Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/mv88e6xxx/chip.c | 7 ------- include/net/dsa.h | 1 - net/dsa/dsa.c | 1 - net/dsa/dsa_priv.h | 1 + 4 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 263da7e2d6be..056e3b65cd27 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2291,13 +2291,6 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, if (!mv88e6xxx_max_vid(chip)) return -EOPNOTSUPP; - /* The ATU removal procedure needs the FID to be mapped in the VTU, - * but FDB deletion runs concurrently with VLAN deletion. Flush the DSA - * switchdev workqueue to ensure that all FDB entries are deleted - * before we remove the VLAN. - */ - dsa_flush_workqueue(); - mv88e6xxx_reg_lock(chip); err = mv88e6xxx_port_get_pvid(chip, port, &pvid); diff --git a/include/net/dsa.h b/include/net/dsa.h index 49e5ece9361c..d784e76113b8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1056,7 +1056,6 @@ void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); -void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 4ff03fb262e0..41f36ad8b0ec 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -349,7 +349,6 @@ void dsa_flush_workqueue(void) { flush_workqueue(dsa_owq); } -EXPORT_SYMBOL_GPL(dsa_flush_workqueue); int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 33ab7d7af9eb..a5c9bc7b66c6 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -170,6 +170,7 @@ void dsa_tag_driver_put(const struct dsa_device_ops *ops); const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); bool dsa_schedule_work(struct work_struct *work); +void dsa_flush_workqueue(void); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) -- cgit v1.2.3 From 8184e926165fc9bd6e6d222e2d4fe0871c2ed19a Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 29 Sep 2021 15:26:07 +0800 Subject: mctp: locking, lifetime and validity changes for sk_keys We will want to invalidate sk_keys in a future change, which will require a boolean flag to mark invalidated items in the socket & net namespace lists. We'll also need to take a reference to keys, held over non-atomic contexts, so we need a refcount on keys also. This change adds a validity flag (currently always true) and refcount to struct mctp_sk_key. With a refcount on the keys, using RCU no longer makes much sense; we have exact indications on the lifetime of keys. So, we also change the RCU list traversal to a locked implementation. OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit 73c618456dc5cf2acb597256d633060cf75de8d6) Signed-off-by: Joel Stanley --- include/net/mctp.h | 46 ++++++++++++++------- net/mctp/af_mctp.c | 14 +++---- net/mctp/route.c | 118 +++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 125 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/net/mctp.h b/include/net/mctp.h index ffd2c23bd76d..ac59537bdf7c 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -67,30 +67,36 @@ struct mctp_sock { /* Key for matching incoming packets to sockets or reassembly contexts. * Packets are matched on (src,dest,tag). * - * Lifetime requirements: + * Lifetime / locking requirements: * - * - keys are free()ed via RCU + * - individual key data (ie, the struct itself) is protected by key->lock; + * changes must be made with that lock held. + * + * - the lookup fields: peer_addr, local_addr and tag are set before the + * key is added to lookup lists, and never updated. + * + * - A ref to the key must be held (throuh key->refs) if a pointer to the + * key is to be accessed after key->lock is released. * * - a mctp_sk_key contains a reference to a struct sock; this is valid * for the life of the key. On sock destruction (through unhash), the key is - * removed from lists (see below), and will not be observable after a RCU - * grace period. - * - * any RX occurring within that grace period may still queue to the socket, - * but will hit the SOCK_DEAD case before the socket is freed. + * removed from lists (see below), and marked invalid. * * - these mctp_sk_keys appear on two lists: * 1) the struct mctp_sock->keys list * 2) the struct netns_mctp->keys list * - * updates to either list are performed under the netns_mctp->keys - * lock. + * presences on these lists requires a (single) refcount to be held; both + * lists are updated as a single operation. + * + * Updates and lookups in either list are performed under the + * netns_mctp->keys lock. Lookup functions will need to lock the key and + * take a reference before unlocking the keys_lock. Consequently, the list's + * keys_lock *cannot* be acquired with the individual key->lock held. * * - a key may have a sk_buff attached as part of an in-progress message - * reassembly (->reasm_head). The reassembly context is protected by - * reasm_lock, which may be acquired with the keys lock (above) held, if - * necessary. Consequently, keys lock *cannot* be acquired with the - * reasm_lock held. + * reassembly (->reasm_head). The reasm data is protected by the individual + * key->lock. * * - there are two destruction paths for a mctp_sk_key: * @@ -116,14 +122,22 @@ struct mctp_sk_key { /* per-socket list */ struct hlist_node sklist; + /* lock protects against concurrent updates to the reassembly and + * expiry data below. + */ + spinlock_t lock; + + /* Keys are referenced during the output path, which may sleep */ + refcount_t refs; + /* incoming fragment reassembly context */ - spinlock_t reasm_lock; struct sk_buff *reasm_head; struct sk_buff **reasm_tailp; bool reasm_dead; u8 last_seq; - struct rcu_head rcu; + /* key validity */ + bool valid; }; struct mctp_skb_cb { @@ -191,6 +205,8 @@ int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb); int mctp_local_output(struct sock *sk, struct mctp_route *rt, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); +void mctp_key_unref(struct mctp_sk_key *key); + /* routing <--> device interface */ unsigned int mctp_default_net(struct net *net); int mctp_default_net_set(struct net *net, unsigned int index); diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 85cc1a28cbe9..92dadb2b09bb 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -276,21 +276,21 @@ static void mctp_sk_unhash(struct sock *sk) /* remove tag allocations */ spin_lock_irqsave(&net->mctp.keys_lock, flags); hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { - hlist_del_rcu(&key->sklist); - hlist_del_rcu(&key->hlist); + hlist_del(&key->sklist); + hlist_del(&key->hlist); - spin_lock(&key->reasm_lock); + spin_lock(&key->lock); if (key->reasm_head) kfree_skb(key->reasm_head); key->reasm_head = NULL; key->reasm_dead = true; - spin_unlock(&key->reasm_lock); + key->valid = false; + spin_unlock(&key->lock); - kfree_rcu(key, rcu); + /* key is no longer on the lookup lists, unref */ + mctp_key_unref(key); } spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - - synchronize_rcu(); } static struct proto mctp_proto = { diff --git a/net/mctp/route.c b/net/mctp/route.c index 65943188991d..96a0377b4329 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -83,25 +83,43 @@ static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local, return true; } +/* returns a key (with key->lock held, and refcounted), or NULL if no such + * key exists. + */ static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb, - mctp_eid_t peer) + mctp_eid_t peer, + unsigned long *irqflags) + __acquires(&key->lock) { struct mctp_sk_key *key, *ret; + unsigned long flags; struct mctp_hdr *mh; u8 tag; - WARN_ON(!rcu_read_lock_held()); - mh = mctp_hdr(skb); tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); ret = NULL; + spin_lock_irqsave(&net->mctp.keys_lock, flags); - hlist_for_each_entry_rcu(key, &net->mctp.keys, hlist) { - if (mctp_key_match(key, mh->dest, peer, tag)) { + hlist_for_each_entry(key, &net->mctp.keys, hlist) { + if (!mctp_key_match(key, mh->dest, peer, tag)) + continue; + + spin_lock(&key->lock); + if (key->valid) { + refcount_inc(&key->refs); ret = key; break; } + spin_unlock(&key->lock); + } + + if (ret) { + spin_unlock(&net->mctp.keys_lock); + *irqflags = flags; + } else { + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); } return ret; @@ -121,11 +139,19 @@ static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, key->local_addr = local; key->tag = tag; key->sk = &msk->sk; - spin_lock_init(&key->reasm_lock); + key->valid = true; + spin_lock_init(&key->lock); + refcount_set(&key->refs, 1); return key; } +void mctp_key_unref(struct mctp_sk_key *key) +{ + if (refcount_dec_and_test(&key->refs)) + kfree(key); +} + static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) { struct net *net = sock_net(&msk->sk); @@ -138,12 +164,17 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { if (mctp_key_match(tmp, key->local_addr, key->peer_addr, key->tag)) { - rc = -EEXIST; - break; + spin_lock(&tmp->lock); + if (tmp->valid) + rc = -EEXIST; + spin_unlock(&tmp->lock); + if (rc) + break; } } if (!rc) { + refcount_inc(&key->refs); hlist_add_head(&key->hlist, &net->mctp.keys); hlist_add_head(&key->sklist, &msk->keys); } @@ -153,28 +184,35 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) return rc; } -/* Must be called with key->reasm_lock, which it will release. Will schedule - * the key for an RCU free. +/* We're done with the key; unset valid and remove from lists. There may still + * be outstanding refs on the key though... */ static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, unsigned long flags) - __releases(&key->reasm_lock) + __releases(&key->lock) { struct sk_buff *skb; skb = key->reasm_head; key->reasm_head = NULL; key->reasm_dead = true; - spin_unlock_irqrestore(&key->reasm_lock, flags); + key->valid = false; + spin_unlock_irqrestore(&key->lock, flags); spin_lock_irqsave(&net->mctp.keys_lock, flags); - hlist_del_rcu(&key->hlist); - hlist_del_rcu(&key->sklist); + hlist_del(&key->hlist); + hlist_del(&key->sklist); spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - kfree_rcu(key, rcu); + + /* one unref for the lists */ + mctp_key_unref(key); + + /* and one for the local reference */ + mctp_key_unref(key); if (skb) kfree_skb(skb); + } static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) @@ -248,8 +286,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) rcu_read_lock(); - /* lookup socket / reasm context, exactly matching (src,dest,tag) */ - key = mctp_lookup_key(net, skb, mh->src); + /* lookup socket / reasm context, exactly matching (src,dest,tag). + * we hold a ref on the key, and key->lock held. + */ + key = mctp_lookup_key(net, skb, mh->src, &f); if (flags & MCTP_HDR_FLAG_SOM) { if (key) { @@ -260,10 +300,12 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * key for reassembly - we'll create a more specific * one for future packets if required (ie, !EOM). */ - key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY); + key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f); if (key) { msk = container_of(key->sk, struct mctp_sock, sk); + spin_unlock_irqrestore(&key->lock, f); + mctp_key_unref(key); key = NULL; } } @@ -282,11 +324,11 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (flags & MCTP_HDR_FLAG_EOM) { sock_queue_rcv_skb(&msk->sk, skb); if (key) { - spin_lock_irqsave(&key->reasm_lock, f); /* we've hit a pending reassembly; not much we * can do but drop it */ __mctp_key_unlock_drop(key, net, f); + key = NULL; } rc = 0; goto out_unlock; @@ -303,7 +345,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) goto out_unlock; } - /* we can queue without the reasm lock here, as the + /* we can queue without the key lock here, as the * key isn't observable yet */ mctp_frag_queue(key, skb); @@ -318,17 +360,17 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (rc) kfree(key); - } else { - /* existing key: start reassembly */ - spin_lock_irqsave(&key->reasm_lock, f); + /* we don't need to release key->lock on exit */ + key = NULL; + } else { if (key->reasm_head || key->reasm_dead) { /* duplicate start? drop everything */ __mctp_key_unlock_drop(key, net, f); rc = -EEXIST; + key = NULL; } else { rc = mctp_frag_queue(key, skb); - spin_unlock_irqrestore(&key->reasm_lock, f); } } @@ -337,8 +379,6 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * using the message-specific key */ - spin_lock_irqsave(&key->reasm_lock, f); - /* we need to be continuing an existing reassembly... */ if (!key->reasm_head) rc = -EINVAL; @@ -352,8 +392,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) sock_queue_rcv_skb(key->sk, key->reasm_head); key->reasm_head = NULL; __mctp_key_unlock_drop(key, net, f); - } else { - spin_unlock_irqrestore(&key->reasm_lock, f); + key = NULL; } } else { @@ -363,6 +402,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) out_unlock: rcu_read_unlock(); + if (key) { + spin_unlock_irqrestore(&key->lock, f); + mctp_key_unref(key); + } out: if (rc) kfree_skb(skb); @@ -459,6 +502,7 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, */ hlist_add_head_rcu(&key->hlist, &mns->keys); hlist_add_head_rcu(&key->sklist, &msk->keys); + refcount_inc(&key->refs); } /* Allocate a locally-owned tag value for (saddr, daddr), and reserve @@ -492,14 +536,26 @@ static int mctp_alloc_local_tag(struct mctp_sock *msk, * tags. If we find a conflict, clear that bit from tagbits */ hlist_for_each_entry(tmp, &mns->keys, hlist) { + /* We can check the lookup fields (*_addr, tag) without the + * lock held, they don't change over the lifetime of the key. + */ + /* if we don't own the tag, it can't conflict */ if (tmp->tag & MCTP_HDR_FLAG_TO) continue; - if ((tmp->peer_addr == daddr || - tmp->peer_addr == MCTP_ADDR_ANY) && - tmp->local_addr == saddr) + if (!((tmp->peer_addr == daddr || + tmp->peer_addr == MCTP_ADDR_ANY) && + tmp->local_addr == saddr)) + continue; + + spin_lock(&tmp->lock); + /* key must still be valid. If we find a match, clear the + * potential tag value + */ + if (tmp->valid) tagbits &= ~(1 << tmp->tag); + spin_unlock(&tmp->lock); if (!tagbits) break; -- cgit v1.2.3 From 686359701767e3f9e5de8386ccc8e64b8a91520d Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 29 Sep 2021 15:26:08 +0800 Subject: mctp: Add refcounts to mctp_dev Currently, we tie the struct mctp_dev lifetime to the underlying struct net_device, and hold/put that device as a proxy for a separate mctp_dev refcount. This works because we're not holding any references to the mctp_dev that are different from the netdev lifetime. In a future change we'll break that assumption though, as we'll need to hold mctp_dev references in a workqueue, which might live past the netdev unregister notification. In order to support that, this change introduces a refcount on the mctp_dev, currently taken by the net_device->mctp_ptr reference, and released on netdev unregister events. We can then use this for future references that might outlast the net device. OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit 43f55f23f70881e9c397557f15c8090b368d0af2) Signed-off-by: Joel Stanley --- include/net/mctpdevice.h | 5 +++++ net/mctp/device.c | 25 ++++++++++++++++--------- net/mctp/neigh.c | 4 ++-- net/mctp/route.c | 4 ++-- 4 files changed, 25 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h index 71a11012fac7..3a439463f055 100644 --- a/include/net/mctpdevice.h +++ b/include/net/mctpdevice.h @@ -17,6 +17,8 @@ struct mctp_dev { struct net_device *dev; + refcount_t refs; + unsigned int net; /* Only modified under RTNL. Reads have addrs_lock held */ @@ -32,4 +34,7 @@ struct mctp_dev { struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev); struct mctp_dev *__mctp_dev_get(const struct net_device *dev); +void mctp_dev_hold(struct mctp_dev *mdev); +void mctp_dev_put(struct mctp_dev *mdev); + #endif /* __NET_MCTPDEVICE_H */ diff --git a/net/mctp/device.c b/net/mctp/device.c index c34963974cc1..f556c6d01abc 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -35,14 +35,6 @@ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev) return rtnl_dereference(dev->mctp_ptr); } -static void mctp_dev_destroy(struct mctp_dev *mdev) -{ - struct net_device *dev = mdev->dev; - - dev_put(dev); - kfree_rcu(mdev, rcu); -} - static int mctp_fill_addrinfo(struct sk_buff *skb, struct netlink_callback *cb, struct mctp_dev *mdev, mctp_eid_t eid) { @@ -255,6 +247,19 @@ static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +void mctp_dev_hold(struct mctp_dev *mdev) +{ + refcount_inc(&mdev->refs); +} + +void mctp_dev_put(struct mctp_dev *mdev) +{ + if (refcount_dec_and_test(&mdev->refs)) { + dev_put(mdev->dev); + kfree_rcu(mdev, rcu); + } +} + static struct mctp_dev *mctp_add_dev(struct net_device *dev) { struct mctp_dev *mdev; @@ -270,7 +275,9 @@ static struct mctp_dev *mctp_add_dev(struct net_device *dev) mdev->net = mctp_default_net(dev_net(dev)); /* associate to net_device */ + refcount_set(&mdev->refs, 1); rcu_assign_pointer(dev->mctp_ptr, mdev); + dev_hold(dev); mdev->dev = dev; @@ -345,7 +352,7 @@ static void mctp_unregister(struct net_device *dev) mctp_neigh_remove_dev(mdev); kfree(mdev->addrs); - mctp_dev_destroy(mdev); + mctp_dev_put(mdev); } static int mctp_register(struct net_device *dev) diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c index 90ed2f02d1fb..5cc042121493 100644 --- a/net/mctp/neigh.c +++ b/net/mctp/neigh.c @@ -47,7 +47,7 @@ static int mctp_neigh_add(struct mctp_dev *mdev, mctp_eid_t eid, } INIT_LIST_HEAD(&neigh->list); neigh->dev = mdev; - dev_hold(neigh->dev->dev); + mctp_dev_hold(neigh->dev); neigh->eid = eid; neigh->source = source; memcpy(neigh->ha, lladdr, lladdr_len); @@ -63,7 +63,7 @@ static void __mctp_neigh_free(struct rcu_head *rcu) { struct mctp_neigh *neigh = container_of(rcu, struct mctp_neigh, rcu); - dev_put(neigh->dev->dev); + mctp_dev_put(neigh->dev); kfree(neigh); } diff --git a/net/mctp/route.c b/net/mctp/route.c index 96a0377b4329..eb89c622a4bf 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -455,7 +455,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) static void mctp_route_release(struct mctp_route *rt) { if (refcount_dec_and_test(&rt->refs)) { - dev_put(rt->dev->dev); + mctp_dev_put(rt->dev); kfree_rcu(rt, rcu); } } @@ -815,7 +815,7 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, rt->max = daddr_start + daddr_extent; rt->mtu = mtu; rt->dev = mdev; - dev_hold(rt->dev->dev); + mctp_dev_hold(rt->dev); rt->type = type; rt->output = rtfn; -- cgit v1.2.3 From 628aab648216b5f7a33747b571b42f5494b0918e Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 29 Sep 2021 15:26:09 +0800 Subject: mctp: Implement a timeout for tags Currently, a MCTP (local-eid,remote-eid,tag) tuple is allocated to a socket on send, and only expires when the socket is closed. This change introduces a tag timeout, freeing the tuple after a fixed expiry - currently six seconds. This is greater than (but close to) the max response timeout in upper-layer bindings. OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit 7b14e15ae6f4850392800482efb54d5cf4ae300c) Signed-off-by: Joel Stanley --- include/net/mctp.h | 10 ++++++++++ net/mctp/af_mctp.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ net/mctp/route.c | 8 ++++++++ 3 files changed, 62 insertions(+) (limited to 'include') diff --git a/include/net/mctp.h b/include/net/mctp.h index ac59537bdf7c..2a83443bdfac 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -62,6 +62,11 @@ struct mctp_sock { * by sk->net->keys_lock */ struct hlist_head keys; + + /* mechanism for expiring allocated keys; will release an allocated + * tag, and any netdev state for a request/response pairing + */ + struct timer_list key_expiry; }; /* Key for matching incoming packets to sockets or reassembly contexts. @@ -107,6 +112,8 @@ struct mctp_sock { * the (complete) reply, or during reassembly errors. Here, we clean up * the reassembly context (marking reasm_dead, to prevent another from * starting), and remove the socket from the netns & socket lists. + * + * - through an expiry timeout, on a per-socket timer */ struct mctp_sk_key { mctp_eid_t peer_addr; @@ -138,6 +145,9 @@ struct mctp_sk_key { /* key validity */ bool valid; + + /* expiry timeout; valid (above) cleared on expiry */ + unsigned long expiry; }; struct mctp_skb_cb { diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 92dadb2b09bb..fd0b2dae2cbb 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -236,16 +236,60 @@ static const struct proto_ops mctp_dgram_ops = { .sendpage = sock_no_sendpage, }; +static void mctp_sk_expire_keys(struct timer_list *timer) +{ + struct mctp_sock *msk = container_of(timer, struct mctp_sock, + key_expiry); + struct net *net = sock_net(&msk->sk); + unsigned long next_expiry, flags; + struct mctp_sk_key *key; + struct hlist_node *tmp; + bool next_expiry_valid = false; + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + + hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { + spin_lock(&key->lock); + + if (!time_after_eq(key->expiry, jiffies)) { + key->valid = false; + hlist_del_rcu(&key->hlist); + hlist_del_rcu(&key->sklist); + spin_unlock(&key->lock); + mctp_key_unref(key); + continue; + } + + if (next_expiry_valid) { + if (time_before(key->expiry, next_expiry)) + next_expiry = key->expiry; + } else { + next_expiry = key->expiry; + next_expiry_valid = true; + } + spin_unlock(&key->lock); + } + + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + if (next_expiry_valid) + mod_timer(timer, next_expiry); +} + static int mctp_sk_init(struct sock *sk) { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); INIT_HLIST_HEAD(&msk->keys); + timer_setup(&msk->key_expiry, mctp_sk_expire_keys, 0); return 0; } static void mctp_sk_close(struct sock *sk, long timeout) { + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + + del_timer_sync(&msk->key_expiry); sk_common_release(sk); } diff --git a/net/mctp/route.c b/net/mctp/route.c index eb89c622a4bf..11ba7995cb2d 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -24,6 +24,8 @@ #include static const unsigned int mctp_message_maxlen = 64 * 1024; +static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ; + /* route output callbacks */ static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) @@ -175,6 +177,9 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) if (!rc) { refcount_inc(&key->refs); + key->expiry = jiffies + mctp_key_lifetime; + timer_reduce(&msk->key_expiry, key->expiry); + hlist_add_head(&key->hlist, &net->mctp.keys); hlist_add_head(&key->sklist, &msk->keys); } @@ -497,6 +502,9 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, lockdep_assert_held(&mns->keys_lock); + key->expiry = jiffies + mctp_key_lifetime; + timer_reduce(&msk->key_expiry, key->expiry); + /* we hold the net->key_lock here, allowing updates to both * then net and sk */ -- cgit v1.2.3 From 7af59d80e3cf48559dba2e587444d94dae7ef2c9 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 29 Sep 2021 15:26:10 +0800 Subject: mctp: Add tracepoints for tag/key handling The tag allocation, release and bind events are somewhat opaque outside the kernel; this change adds a few tracepoints to assist in instrumentation and debugging. OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit 4f9e1ba6de45aa8797a83f1fe5b82ec4bac16899) Signed-off-by: Joel Stanley --- include/trace/events/mctp.h | 75 +++++++++++++++++++++++++++++++++++++++++++++ net/mctp/af_mctp.c | 6 ++++ net/mctp/route.c | 12 +++++++- 3 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/mctp.h (limited to 'include') diff --git a/include/trace/events/mctp.h b/include/trace/events/mctp.h new file mode 100644 index 000000000000..175b057c507f --- /dev/null +++ b/include/trace/events/mctp.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mctp + +#if !defined(_TRACE_MCTP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MCTP_H + +#include + +#ifndef __TRACE_MCTP_ENUMS +#define __TRACE_MCTP_ENUMS +enum { + MCTP_TRACE_KEY_TIMEOUT, + MCTP_TRACE_KEY_REPLIED, + MCTP_TRACE_KEY_INVALIDATED, + MCTP_TRACE_KEY_CLOSED, +}; +#endif /* __TRACE_MCTP_ENUMS */ + +TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_TIMEOUT); +TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_REPLIED); +TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_INVALIDATED); +TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_CLOSED); + +TRACE_EVENT(mctp_key_acquire, + TP_PROTO(const struct mctp_sk_key *key), + TP_ARGS(key), + TP_STRUCT__entry( + __field(__u8, paddr) + __field(__u8, laddr) + __field(__u8, tag) + ), + TP_fast_assign( + __entry->paddr = key->peer_addr; + __entry->laddr = key->local_addr; + __entry->tag = key->tag; + ), + TP_printk("local %d, peer %d, tag %1x", + __entry->laddr, + __entry->paddr, + __entry->tag + ) +); + +TRACE_EVENT(mctp_key_release, + TP_PROTO(const struct mctp_sk_key *key, int reason), + TP_ARGS(key, reason), + TP_STRUCT__entry( + __field(__u8, paddr) + __field(__u8, laddr) + __field(__u8, tag) + __field(int, reason) + ), + TP_fast_assign( + __entry->paddr = key->peer_addr; + __entry->laddr = key->local_addr; + __entry->tag = key->tag; + __entry->reason = reason; + ), + TP_printk("local %d, peer %d, tag %1x %s", + __entry->laddr, + __entry->paddr, + __entry->tag, + __print_symbolic(__entry->reason, + { MCTP_TRACE_KEY_TIMEOUT, "timeout" }, + { MCTP_TRACE_KEY_REPLIED, "replied" }, + { MCTP_TRACE_KEY_INVALIDATED, "invalidated" }, + { MCTP_TRACE_KEY_CLOSED, "closed" }) + ) +); + +#endif + +#include diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index fd0b2dae2cbb..3e9bdfa10671 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -16,6 +16,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + /* socket implementation */ static int mctp_release(struct socket *sock) @@ -252,6 +255,7 @@ static void mctp_sk_expire_keys(struct timer_list *timer) spin_lock(&key->lock); if (!time_after_eq(key->expiry, jiffies)) { + trace_mctp_key_release(key, MCTP_TRACE_KEY_TIMEOUT); key->valid = false; hlist_del_rcu(&key->hlist); hlist_del_rcu(&key->sklist); @@ -323,6 +327,8 @@ static void mctp_sk_unhash(struct sock *sk) hlist_del(&key->sklist); hlist_del(&key->hlist); + trace_mctp_key_release(key, MCTP_TRACE_KEY_CLOSED); + spin_lock(&key->lock); if (key->reasm_head) kfree_skb(key->reasm_head); diff --git a/net/mctp/route.c b/net/mctp/route.c index 11ba7995cb2d..465bdc848f9b 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -23,10 +23,11 @@ #include #include +#include + static const unsigned int mctp_message_maxlen = 64 * 1024; static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ; - /* route output callbacks */ static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) { @@ -332,6 +333,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) /* we've hit a pending reassembly; not much we * can do but drop it */ + trace_mctp_key_release(key, + MCTP_TRACE_KEY_REPLIED); __mctp_key_unlock_drop(key, net, f); key = NULL; } @@ -365,12 +368,16 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (rc) kfree(key); + trace_mctp_key_acquire(key); + /* we don't need to release key->lock on exit */ key = NULL; } else { if (key->reasm_head || key->reasm_dead) { /* duplicate start? drop everything */ + trace_mctp_key_release(key, + MCTP_TRACE_KEY_INVALIDATED); __mctp_key_unlock_drop(key, net, f); rc = -EEXIST; key = NULL; @@ -396,6 +403,7 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) if (!rc && flags & MCTP_HDR_FLAG_EOM) { sock_queue_rcv_skb(key->sk, key->reasm_head); key->reasm_head = NULL; + trace_mctp_key_release(key, MCTP_TRACE_KEY_REPLIED); __mctp_key_unlock_drop(key, net, f); key = NULL; } @@ -572,6 +580,8 @@ static int mctp_alloc_local_tag(struct mctp_sock *msk, if (tagbits) { key->tag = __ffs(tagbits); mctp_reserve_tag(net, key, msk); + trace_mctp_key_acquire(key); + *tagp = key->tag; rc = 0; } -- cgit v1.2.3 From f00785cbf91a3ef807270f2297d6c691a2201a6c Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 26 Oct 2021 09:57:28 +0800 Subject: mctp: Implement extended addressing This change allows an extended address struct - struct sockaddr_mctp_ext - to be passed to sendmsg/recvmsg. This allows userspace to specify output ifindex and physical address information (for sendmsg) or receive the input ifindex/physaddr for incoming messages (for recvmsg). This is typically used by userspace for MCTP address discovery and assignment operations. The extended addressing facility is conditional on a new sockopt: MCTP_OPT_ADDR_EXT; userspace must explicitly enable addressing before the kernel will consume/populate the extended address data. Includes a fix for an uninitialised var: Reported-by: kernel test robot Backport: exclude SOL_MPTCP OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit 99ce45d5e7dbde399997a630f45ac9f654fa4bcc) Signed-off-by: Joel Stanley --- include/linux/socket.h | 2 + include/net/mctp.h | 13 +++++-- include/uapi/linux/mctp.h | 11 ++++++ net/mctp/af_mctp.c | 86 ++++++++++++++++++++++++++++++++++++----- net/mctp/route.c | 98 +++++++++++++++++++++++++++++++++++------------ 5 files changed, 171 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 041d6032a348..441516c5db3a 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -364,6 +364,8 @@ struct ucred { #define SOL_KCM 281 #define SOL_TLS 282 #define SOL_XDP 283 +/* #define SOL_MPTCP 284 - not yet included in 5.15 */ +#define SOL_MCTP 285 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/net/mctp.h b/include/net/mctp.h index 2a83443bdfac..23bec708f4c7 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -58,6 +59,9 @@ struct mctp_sock { mctp_eid_t bind_addr; __u8 bind_type; + /* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */ + bool addr_ext; + /* list of mctp_sk_key, for incoming tag lookup. updates protected * by sk->net->keys_lock */ @@ -153,7 +157,10 @@ struct mctp_sk_key { struct mctp_skb_cb { unsigned int magic; unsigned int net; + int ifindex; /* extended/direct addressing if set */ mctp_eid_t src; + unsigned char halen; + unsigned char haddr[MAX_ADDR_LEN]; }; /* skb control-block accessors with a little extra debugging for initial @@ -177,6 +184,7 @@ static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) { struct mctp_skb_cb *cb = (void *)skb->cb; + BUILD_BUG_ON(sizeof(struct mctp_skb_cb) > sizeof(skb->cb)); WARN_ON(cb->magic != 0x4d435450); return (void *)(skb->cb); } @@ -189,8 +197,7 @@ static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) * * Updates to the route table are performed under rtnl; all reads under RCU, * so routes cannot be referenced over a RCU grace period. Specifically: A - * caller cannot block between mctp_route_lookup and passing the route to - * mctp_do_route. + * caller cannot block between mctp_route_lookup and mctp_route_release() */ struct mctp_route { mctp_eid_t min, max; @@ -210,8 +217,6 @@ struct mctp_route { struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, mctp_eid_t daddr); -int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb); - int mctp_local_output(struct sock *sk, struct mctp_route *rt, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 6acd4ccafbf7..07b0318716fc 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -11,6 +11,7 @@ #include #include +#include typedef __u8 mctp_eid_t; @@ -28,6 +29,14 @@ struct sockaddr_mctp { __u8 __smctp_pad1; }; +struct sockaddr_mctp_ext { + struct sockaddr_mctp smctp_base; + int smctp_ifindex; + __u8 smctp_halen; + __u8 __smctp_pad0[3]; + __u8 smctp_haddr[MAX_ADDR_LEN]; +}; + #define MCTP_NET_ANY 0x0 #define MCTP_ADDR_NULL 0x00 @@ -36,4 +45,6 @@ struct sockaddr_mctp { #define MCTP_TAG_MASK 0x07 #define MCTP_TAG_OWNER 0x08 +#define MCTP_OPT_ADDR_EXT 1 + #endif /* __UAPI_MCTP_H */ diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index cd2837ec60d1..bc88159f8844 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -86,6 +86,7 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) const int hlen = MCTP_HEADER_MAXLEN + sizeof(struct mctp_hdr); int rc, addrlen = msg->msg_namelen; struct sock *sk = sock->sk; + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct mctp_skb_cb *cb; struct mctp_route *rt; struct sk_buff *skb; @@ -111,11 +112,6 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (addr->smctp_network == MCTP_NET_ANY) addr->smctp_network = mctp_default_net(sock_net(sk)); - rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, - addr->smctp_addr.s_addr); - if (!rt) - return -EHOSTUNREACH; - skb = sock_alloc_send_skb(sk, hlen + 1 + len, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) @@ -127,19 +123,45 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) *(u8 *)skb_put(skb, 1) = addr->smctp_type; rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len); - if (rc < 0) { - kfree_skb(skb); - return rc; - } + if (rc < 0) + goto err_free; /* set up cb */ cb = __mctp_cb(skb); cb->net = addr->smctp_network; + /* direct addressing */ + if (msk->addr_ext && addrlen >= sizeof(struct sockaddr_mctp_ext)) { + DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, + extaddr, msg->msg_name); + + if (extaddr->smctp_halen > sizeof(cb->haddr)) { + rc = -EINVAL; + goto err_free; + } + + cb->ifindex = extaddr->smctp_ifindex; + cb->halen = extaddr->smctp_halen; + memcpy(cb->haddr, extaddr->smctp_haddr, cb->halen); + + rt = NULL; + } else { + rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, + addr->smctp_addr.s_addr); + if (!rt) { + rc = -EHOSTUNREACH; + goto err_free; + } + } + rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr, addr->smctp_tag); return rc ? : len; + +err_free: + kfree_skb(skb); + return rc; } static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, @@ -147,6 +169,7 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, { DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); struct sock *sk = sock->sk; + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct sk_buff *skb; size_t msglen; u8 type; @@ -194,6 +217,16 @@ static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); addr->__smctp_pad1 = 0; msg->msg_namelen = sizeof(*addr); + + if (msk->addr_ext) { + DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, ae, + msg->msg_name); + msg->msg_namelen = sizeof(*ae); + ae->smctp_ifindex = cb->ifindex; + ae->smctp_halen = cb->halen; + memset(ae->smctp_haddr, 0x0, sizeof(ae->smctp_haddr)); + memcpy(ae->smctp_haddr, cb->haddr, cb->halen); + } } rc = len; @@ -209,12 +242,45 @@ out_free: static int mctp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { - return -EINVAL; + struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk); + int val; + + if (level != SOL_MCTP) + return -EINVAL; + + if (optname == MCTP_OPT_ADDR_EXT) { + if (optlen != sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + msk->addr_ext = val; + return 0; + } + + return -ENOPROTOOPT; } static int mctp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { + struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk); + int len, val; + + if (level != SOL_MCTP) + return -EINVAL; + + if (get_user(len, optlen)) + return -EFAULT; + + if (optname == MCTP_OPT_ADDR_EXT) { + if (len != sizeof(int)) + return -EINVAL; + val = !!msk->addr_ext; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; + } + return -EINVAL; } diff --git a/net/mctp/route.c b/net/mctp/route.c index 580a78b15626..929fbdb316e5 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -434,6 +434,7 @@ static unsigned int mctp_route_mtu(struct mctp_route *rt) static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) { + struct mctp_skb_cb *cb = mctp_cb(skb); struct mctp_hdr *hdr = mctp_hdr(skb); char daddr_buf[MAX_ADDR_LEN]; char *daddr = NULL; @@ -448,9 +449,14 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) return -EMSGSIZE; } - /* If lookup fails let the device handle daddr==NULL */ - if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0) - daddr = daddr_buf; + if (cb->ifindex) { + /* direct route; use the hwaddr we stashed in sendmsg */ + daddr = cb->haddr; + } else { + /* If lookup fails let the device handle daddr==NULL */ + if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0) + daddr = daddr_buf; + } rc = dev_hard_header(skb, skb->dev, ntohs(skb->protocol), daddr, skb->dev->dev_addr, skb->len); @@ -649,16 +655,6 @@ static struct mctp_route *mctp_route_lookup_null(struct net *net, return NULL; } -/* sends a skb to rt and releases the route. */ -int mctp_do_route(struct mctp_route *rt, struct sk_buff *skb) -{ - int rc; - - rc = rt->output(rt, skb); - mctp_route_release(rt); - return rc; -} - static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, unsigned int mtu, u8 tag) { @@ -725,7 +721,7 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, /* copy message payload */ skb_copy_bits(skb, pos, skb_transport_header(skb2), size); - /* do route, but don't drop the rt reference */ + /* do route */ rc = rt->output(rt, skb2); if (rc) break; @@ -734,7 +730,6 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, pos += size; } - mctp_route_release(rt); consume_skb(skb); return rc; } @@ -744,15 +739,51 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct mctp_skb_cb *cb = mctp_cb(skb); + struct mctp_route tmp_rt; + struct net_device *dev; struct mctp_hdr *hdr; unsigned long flags; unsigned int mtu; mctp_eid_t saddr; + bool ext_rt; int rc; u8 tag; - if (WARN_ON(!rt->dev)) + rc = -ENODEV; + + if (rt) { + ext_rt = false; + dev = NULL; + + if (WARN_ON(!rt->dev)) + goto out_release; + + } else if (cb->ifindex) { + ext_rt = true; + rt = &tmp_rt; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cb->ifindex); + if (!dev) { + rcu_read_unlock(); + return rc; + } + + rt->dev = __mctp_dev_get(dev); + rcu_read_unlock(); + + if (!rt->dev) + goto out_release; + + /* establish temporary route - we set up enough to keep + * mctp_route_output happy + */ + rt->output = mctp_route_output; + rt->mtu = 0; + + } else { return -EINVAL; + } spin_lock_irqsave(&rt->dev->addrs_lock, flags); if (rt->dev->num_addrs == 0) { @@ -765,18 +796,17 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, spin_unlock_irqrestore(&rt->dev->addrs_lock, flags); if (rc) - return rc; + goto out_release; if (req_tag & MCTP_HDR_FLAG_TO) { rc = mctp_alloc_local_tag(msk, saddr, daddr, &tag); if (rc) - return rc; + goto out_release; tag |= MCTP_HDR_FLAG_TO; } else { tag = req_tag; } - skb->protocol = htons(ETH_P_MCTP); skb->priority = 0; skb_reset_transport_header(skb); @@ -796,12 +826,22 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, mtu = mctp_route_mtu(rt); if (skb->len + sizeof(struct mctp_hdr) <= mtu) { - hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | - tag; - return mctp_do_route(rt, skb); + hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | + MCTP_HDR_FLAG_EOM | tag; + rc = rt->output(rt, skb); } else { - return mctp_do_fragment_route(rt, skb, mtu, tag); + rc = mctp_do_fragment_route(rt, skb, mtu, tag); } + +out_release: + if (!ext_rt) + mctp_route_release(rt); + + if (dev) + dev_put(dev); + + return rc; + } /* route management */ @@ -943,8 +983,15 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX) goto err_drop; - cb = __mctp_cb(skb); + /* MCTP drivers must populate halen/haddr */ + if (dev->type == ARPHRD_MCTP) { + cb = mctp_cb(skb); + } else { + cb = __mctp_cb(skb); + cb->halen = 0; + } cb->net = READ_ONCE(mdev->net); + cb->ifindex = dev->ifindex; rt = mctp_route_lookup(net, cb->net, mh->dest); @@ -955,7 +1002,8 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, if (!rt) goto err_drop; - mctp_do_route(rt, skb); + rt->output(rt, skb); + mctp_route_release(rt); return NET_RX_SUCCESS; -- cgit v1.2.3 From 2fd0af45b971a26594bbec28275478d3ce67a758 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 29 Oct 2021 11:01:44 +0800 Subject: mctp: Add flow extension to skb This change adds a new skb extension for MCTP, to represent a request/response flow. The intention is to use this in a later change to allow i2c controllers to correctly configure a multiplexer over a flow. Since we have a cleanup function in the core path (if an extension is present), we'll need to make CONFIG_MCTP a bool, rather than a tristate. Includes a fix for a build warning with clang: Reported-by: kernel test robot OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit 78476d315e190533757ab894255c4f2c2f254bce) Signed-off-by: Joel Stanley --- include/linux/skbuff.h | 3 +++ include/net/mctp.h | 7 +++++++ net/core/skbuff.c | 19 +++++++++++++++++++ net/mctp/Kconfig | 7 ++++++- 4 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 532f5d402f06..40afb1f8123d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4258,6 +4258,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MPTCP) SKB_EXT_MPTCP, +#endif +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + SKB_EXT_MCTP, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/net/mctp.h b/include/net/mctp.h index 23bec708f4c7..7a5ba801703c 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -189,6 +189,13 @@ static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) return (void *)(skb->cb); } +/* If CONFIG_MCTP_FLOWS, we may add one of these as a SKB extension, + * indicating the flow to the device driver. + */ +struct mctp_flow { + struct mctp_sk_key *key; +}; + /* Route definition. * * These are held in the pernet->mctp.routes list, with RCU protection for diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 449a96e358ad..520e28637750 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4429,6 +4430,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MPTCP) [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), #endif +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4445,6 +4449,9 @@ static __always_inline unsigned int skb_ext_total_length(void) #endif #if IS_ENABLED(CONFIG_MPTCP) skb_ext_type_len[SKB_EXT_MPTCP] + +#endif +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + skb_ext_type_len[SKB_EXT_MCTP] + #endif 0; } @@ -6518,6 +6525,14 @@ static void skb_ext_put_sp(struct sec_path *sp) } #endif +#ifdef CONFIG_MCTP_FLOWS +static void skb_ext_put_mctp(struct mctp_flow *flow) +{ + if (flow->key) + mctp_key_unref(flow->key); +} +#endif + void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *ext = skb->extensions; @@ -6553,6 +6568,10 @@ free_now: if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); #endif +#ifdef CONFIG_MCTP_FLOWS + if (__skb_ext_exist(ext, SKB_EXT_MCTP)) + skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); +#endif kmem_cache_free(skbuff_ext_cache, ext); } diff --git a/net/mctp/Kconfig b/net/mctp/Kconfig index 868c92272cbd..3a5c0e70da77 100644 --- a/net/mctp/Kconfig +++ b/net/mctp/Kconfig @@ -1,7 +1,7 @@ menuconfig MCTP depends on NET - tristate "MCTP core protocol support" + bool "MCTP core protocol support" help Management Component Transport Protocol (MCTP) is an in-system protocol for communicating between management controllers and @@ -16,3 +16,8 @@ config MCTP_TEST bool "MCTP core tests" if !KUNIT_ALL_TESTS depends on MCTP=y && KUNIT=y default KUNIT_ALL_TESTS + +config MCTP_FLOWS + bool + depends on MCTP + select SKB_EXTENSIONS -- cgit v1.2.3 From 8076763b378e560dc50eb357a6b6d6c6f12e5ac4 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 29 Oct 2021 11:01:45 +0800 Subject: mctp: Pass flow data & flow release events to drivers Now that we have an extension for MCTP data in skbs, populate the flow when a key has been created for the packet, and add a device driver operation to inform of flow destruction. Includes a fix for a warning with test builds: Reported-by: kernel test robot OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit 67737c457281dd199ceb9e31b6ba7efd3bfe566d) Signed-off-by: Joel Stanley --- include/net/mctp.h | 6 ++++++ include/net/mctpdevice.h | 16 ++++++++++++++ net/mctp/device.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ net/mctp/route.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 126 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mctp.h b/include/net/mctp.h index 7a5ba801703c..7e35ec79b909 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -152,6 +152,12 @@ struct mctp_sk_key { /* expiry timeout; valid (above) cleared on expiry */ unsigned long expiry; + + /* free to use for device flow state tracking. Initialised to + * zero on initial key creation + */ + unsigned long dev_flow_state; + struct mctp_dev *dev; }; struct mctp_skb_cb { diff --git a/include/net/mctpdevice.h b/include/net/mctpdevice.h index 3a439463f055..5c0d04b5c12c 100644 --- a/include/net/mctpdevice.h +++ b/include/net/mctpdevice.h @@ -14,6 +14,8 @@ #include #include +struct mctp_sk_key; + struct mctp_dev { struct net_device *dev; @@ -21,6 +23,8 @@ struct mctp_dev { unsigned int net; + const struct mctp_netdev_ops *ops; + /* Only modified under RTNL. Reads have addrs_lock held */ u8 *addrs; size_t num_addrs; @@ -29,12 +33,24 @@ struct mctp_dev { struct rcu_head rcu; }; +struct mctp_netdev_ops { + void (*release_flow)(struct mctp_dev *dev, + struct mctp_sk_key *key); +}; + #define MCTP_INITIAL_DEFAULT_NET 1 struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev); struct mctp_dev *__mctp_dev_get(const struct net_device *dev); +int mctp_register_netdev(struct net_device *dev, + const struct mctp_netdev_ops *ops); +void mctp_unregister_netdev(struct net_device *dev); + void mctp_dev_hold(struct mctp_dev *mdev); void mctp_dev_put(struct mctp_dev *mdev); +void mctp_dev_set_key(struct mctp_dev *dev, struct mctp_sk_key *key); +void mctp_dev_release_key(struct mctp_dev *dev, struct mctp_sk_key *key); + #endif /* __NET_MCTPDEVICE_H */ diff --git a/net/mctp/device.c b/net/mctp/device.c index 3827d62f52c9..8799ee77e7b7 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -260,6 +260,24 @@ void mctp_dev_put(struct mctp_dev *mdev) } } +void mctp_dev_release_key(struct mctp_dev *dev, struct mctp_sk_key *key) + __must_hold(&key->lock) +{ + if (!dev) + return; + if (dev->ops && dev->ops->release_flow) + dev->ops->release_flow(dev, key); + key->dev = NULL; + mctp_dev_put(dev); +} + +void mctp_dev_set_key(struct mctp_dev *dev, struct mctp_sk_key *key) + __must_hold(&key->lock) +{ + mctp_dev_hold(dev); + key->dev = dev; +} + static struct mctp_dev *mctp_add_dev(struct net_device *dev) { struct mctp_dev *mdev; @@ -414,6 +432,39 @@ static int mctp_dev_notify(struct notifier_block *this, unsigned long event, return NOTIFY_OK; } +static int mctp_register_netdevice(struct net_device *dev, + const struct mctp_netdev_ops *ops) +{ + struct mctp_dev *mdev; + + mdev = mctp_add_dev(dev); + if (IS_ERR(mdev)) + return PTR_ERR(mdev); + + mdev->ops = ops; + + return register_netdevice(dev); +} + +int mctp_register_netdev(struct net_device *dev, + const struct mctp_netdev_ops *ops) +{ + int rc; + + rtnl_lock(); + rc = mctp_register_netdevice(dev, ops); + rtnl_unlock(); + + return rc; +} +EXPORT_SYMBOL_GPL(mctp_register_netdev); + +void mctp_unregister_netdev(struct net_device *dev) +{ + unregister_netdev(dev); +} +EXPORT_SYMBOL_GPL(mctp_unregister_netdev); + static struct rtnl_af_ops mctp_af_ops = { .family = AF_MCTP, .fill_link_af = mctp_fill_link_af, diff --git a/net/mctp/route.c b/net/mctp/route.c index 546db9a89905..cdf09c2a7007 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -29,6 +29,8 @@ static const unsigned int mctp_message_maxlen = 64 * 1024; static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ; +static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev); + /* route output callbacks */ static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) { @@ -152,8 +154,19 @@ static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, void mctp_key_unref(struct mctp_sk_key *key) { - if (refcount_dec_and_test(&key->refs)) - kfree(key); + unsigned long flags; + + if (!refcount_dec_and_test(&key->refs)) + return; + + /* even though no refs exist here, the lock allows us to stay + * consistent with the locking requirement of mctp_dev_release_key + */ + spin_lock_irqsave(&key->lock, flags); + mctp_dev_release_key(key->dev, key); + spin_unlock_irqrestore(&key->lock, flags); + + kfree(key); } static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) @@ -204,6 +217,7 @@ static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, key->reasm_head = NULL; key->reasm_dead = true; key->valid = false; + mctp_dev_release_key(key->dev, key); spin_unlock_irqrestore(&key->lock, flags); spin_lock_irqsave(&net->mctp.keys_lock, flags); @@ -222,6 +236,40 @@ static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, } +#ifdef CONFIG_MCTP_FLOWS +static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) +{ + struct mctp_flow *flow; + + flow = skb_ext_add(skb, SKB_EXT_MCTP); + if (!flow) + return; + + refcount_inc(&key->refs); + flow->key = key; +} + +static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) +{ + struct mctp_sk_key *key; + struct mctp_flow *flow; + + flow = skb_ext_find(skb, SKB_EXT_MCTP); + if (!flow) + return; + + key = flow->key; + + if (WARN_ON(key->dev && key->dev != dev)) + return; + + mctp_dev_set_key(dev, key); +} +#else +static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) {} +static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) {} +#endif + static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) { struct mctp_hdr *hdr = mctp_hdr(skb); @@ -465,6 +513,8 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) return -EHOSTUNREACH; } + mctp_flow_prepare_output(skb, route->dev); + rc = dev_queue_xmit(skb); if (rc) rc = net_xmit_errno(rc); @@ -803,6 +853,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, rc = PTR_ERR(key); goto out_release; } + mctp_skb_set_flow(skb, key); /* done with the key in this scope */ mctp_key_unref(key); tag |= MCTP_HDR_FLAG_TO; -- cgit v1.2.3 From c3b90c09091acb9da82bfaf1914b0c9805ac9844 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 23 Nov 2021 15:50:46 +0800 Subject: mctp: Add MCTP-over-serial transport binding This change adds a MCTP Serial transport binding, as defined by DMTF specificiation DSP0253 - "MCTP Serial Transport Binding". This is implemented as a new serial line discipline, and can be attached to arbitrary tty devices. From the Kconfig description: This driver provides an MCTP-over-serial interface, through a serial line-discipline, as defined by DMTF specification "DSP0253 - MCTP Serial Transport Binding". By attaching the ldisc to a serial device, we get a new net device to transport MCTP packets. This allows communication with external MCTP endpoints which use serial as their transport. It can also be used as an easy way to provide MCTP connectivity between virtual machines, by forwarding data between simple virtual serial devices. Say y here if you need to connect to MCTP endpoints over serial. To compile as a module, use m; the module will be called mctp-serial. Once the N_MCTP line discipline is set [using ioctl(TCIOSETD)], we get a new netdev suitable for MCTP communication. The 'mctp' utility[1] provides a simple wrapper for this ioctl, using 'link serial ': # mctp link serial /dev/ttyS0 & # mctp link dev lo index 1 address 0x00:00:00:00:00:00 net 1 mtu 65536 up dev mctpserial0 index 5 address 0x(no-addr) net 1 mtu 68 down [1]: https://github.com/CodeConstruct/mctp OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit a0c2ccd9b5ad0a9e838158404e041b5a8ff762dd) Signed-off-by: Joel Stanley --- drivers/net/mctp/Kconfig | 18 ++ drivers/net/mctp/Makefile | 1 + drivers/net/mctp/mctp-serial.c | 510 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/tty.h | 1 + 4 files changed, 530 insertions(+) create mode 100644 drivers/net/mctp/mctp-serial.c (limited to 'include') diff --git a/drivers/net/mctp/Kconfig b/drivers/net/mctp/Kconfig index d8f966cedc89..2929471395ae 100644 --- a/drivers/net/mctp/Kconfig +++ b/drivers/net/mctp/Kconfig @@ -3,6 +3,24 @@ if MCTP menu "MCTP Device Drivers" +config MCTP_SERIAL + tristate "MCTP serial transport" + depends on TTY + select CRC_CCITT + help + This driver provides an MCTP-over-serial interface, through a + serial line-discipline, as defined by DMTF specification "DSP0253 - + MCTP Serial Transport Binding". By attaching the ldisc to a serial + device, we get a new net device to transport MCTP packets. + + This allows communication with external MCTP endpoints which use + serial as their transport. It can also be used as an easy way to + provide MCTP connectivity between virtual machines, by forwarding + data between simple virtual serial devices. + + Say y here if you need to connect to MCTP endpoints over serial. To + compile as a module, use m; the module will be called mctp-serial. + endmenu endif diff --git a/drivers/net/mctp/Makefile b/drivers/net/mctp/Makefile index e69de29bb2d1..d32622613ce4 100644 --- a/drivers/net/mctp/Makefile +++ b/drivers/net/mctp/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_MCTP_SERIAL) += mctp-serial.o diff --git a/drivers/net/mctp/mctp-serial.c b/drivers/net/mctp/mctp-serial.c new file mode 100644 index 000000000000..9ac0e187f36e --- /dev/null +++ b/drivers/net/mctp/mctp-serial.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - serial transport + * binding. This driver is an implementation of the DMTF specificiation + * "DSP0253 - Management Component Transport Protocol (MCTP) Serial Transport + * Binding", available at: + * + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0253_1.0.0.pdf + * + * This driver provides DSP0253-type MCTP-over-serial transport using a Linux + * tty device, by setting the N_MCTP line discipline on the tty. + * + * Copyright (c) 2021 Code Construct + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define MCTP_SERIAL_MTU 68 /* base mtu (64) + mctp header */ +#define MCTP_SERIAL_FRAME_MTU (MCTP_SERIAL_MTU + 6) /* + serial framing */ + +#define MCTP_SERIAL_VERSION 0x1 /* DSP0253 defines a single version: 1 */ + +#define BUFSIZE MCTP_SERIAL_FRAME_MTU + +#define BYTE_FRAME 0x7e +#define BYTE_ESC 0x7d + +static DEFINE_IDA(mctp_serial_ida); + +enum mctp_serial_state { + STATE_IDLE, + STATE_START, + STATE_HEADER, + STATE_DATA, + STATE_ESCAPE, + STATE_TRAILER, + STATE_DONE, + STATE_ERR, +}; + +struct mctp_serial { + struct net_device *netdev; + struct tty_struct *tty; + + int idx; + + /* protects our rx & tx state machines; held during both paths */ + spinlock_t lock; + + struct work_struct tx_work; + enum mctp_serial_state txstate, rxstate; + u16 txfcs, rxfcs, rxfcs_rcvd; + unsigned int txlen, rxlen; + unsigned int txpos, rxpos; + unsigned char txbuf[BUFSIZE], + rxbuf[BUFSIZE]; +}; + +static bool needs_escape(unsigned char c) +{ + return c == BYTE_ESC || c == BYTE_FRAME; +} + +static int next_chunk_len(struct mctp_serial *dev) +{ + int i; + + /* either we have no bytes to send ... */ + if (dev->txpos == dev->txlen) + return 0; + + /* ... or the next byte to send is an escaped byte; requiring a + * single-byte chunk... + */ + if (needs_escape(dev->txbuf[dev->txpos])) + return 1; + + /* ... or we have one or more bytes up to the next escape - this chunk + * will be those non-escaped bytes, and does not include the escaped + * byte. + */ + for (i = 1; i + dev->txpos + 1 < dev->txlen; i++) { + if (needs_escape(dev->txbuf[dev->txpos + i + 1])) + break; + } + + return i; +} + +static int write_chunk(struct mctp_serial *dev, unsigned char *buf, int len) +{ + return dev->tty->ops->write(dev->tty, buf, len); +} + +static void mctp_serial_tx_work(struct work_struct *work) +{ + struct mctp_serial *dev = container_of(work, struct mctp_serial, + tx_work); + unsigned char c, buf[3]; + unsigned long flags; + int len, txlen; + + spin_lock_irqsave(&dev->lock, flags); + + /* txstate represents the next thing to send */ + switch (dev->txstate) { + case STATE_START: + dev->txpos = 0; + fallthrough; + case STATE_HEADER: + buf[0] = BYTE_FRAME; + buf[1] = MCTP_SERIAL_VERSION; + buf[2] = dev->txlen; + + if (!dev->txpos) + dev->txfcs = crc_ccitt(0, buf + 1, 2); + + txlen = write_chunk(dev, buf + dev->txpos, 3 - dev->txpos); + if (txlen <= 0) { + dev->txstate = STATE_ERR; + } else { + dev->txpos += txlen; + if (dev->txpos == 3) { + dev->txstate = STATE_DATA; + dev->txpos = 0; + } + } + break; + + case STATE_ESCAPE: + buf[0] = dev->txbuf[dev->txpos] & ~0x20; + txlen = write_chunk(dev, buf, 1); + if (txlen <= 0) { + dev->txstate = STATE_ERR; + } else { + dev->txpos += txlen; + if (dev->txpos == dev->txlen) { + dev->txstate = STATE_TRAILER; + dev->txpos = 0; + } + } + + break; + + case STATE_DATA: + len = next_chunk_len(dev); + if (len) { + c = dev->txbuf[dev->txpos]; + if (len == 1 && needs_escape(c)) { + buf[0] = BYTE_ESC; + buf[1] = c & ~0x20; + dev->txfcs = crc_ccitt_byte(dev->txfcs, c); + txlen = write_chunk(dev, buf, 2); + if (txlen == 2) + dev->txpos++; + else if (txlen == 1) + dev->txstate = STATE_ESCAPE; + else + dev->txstate = STATE_ERR; + } else { + txlen = write_chunk(dev, + dev->txbuf + dev->txpos, + len); + if (txlen <= 0) { + dev->txstate = STATE_ERR; + } else { + dev->txfcs = crc_ccitt(dev->txfcs, + dev->txbuf + + dev->txpos, + txlen); + dev->txpos += txlen; + } + } + if (dev->txstate == STATE_DATA && + dev->txpos == dev->txlen) { + dev->txstate = STATE_TRAILER; + dev->txpos = 0; + } + break; + } + dev->txstate = STATE_TRAILER; + dev->txpos = 0; + fallthrough; + + case STATE_TRAILER: + buf[0] = dev->txfcs >> 8; + buf[1] = dev->txfcs & 0xff; + buf[2] = BYTE_FRAME; + txlen = write_chunk(dev, buf + dev->txpos, 3 - dev->txpos); + if (txlen <= 0) { + dev->txstate = STATE_ERR; + } else { + dev->txpos += txlen; + if (dev->txpos == 3) { + dev->txstate = STATE_DONE; + dev->txpos = 0; + } + } + break; + default: + netdev_err_once(dev->netdev, "invalid tx state %d\n", + dev->txstate); + } + + if (dev->txstate == STATE_DONE) { + dev->netdev->stats.tx_packets++; + dev->netdev->stats.tx_bytes += dev->txlen; + dev->txlen = 0; + dev->txpos = 0; + clear_bit(TTY_DO_WRITE_WAKEUP, &dev->tty->flags); + dev->txstate = STATE_IDLE; + spin_unlock_irqrestore(&dev->lock, flags); + + netif_wake_queue(dev->netdev); + } else { + spin_unlock_irqrestore(&dev->lock, flags); + } +} + +static netdev_tx_t mctp_serial_tx(struct sk_buff *skb, struct net_device *ndev) +{ + struct mctp_serial *dev = netdev_priv(ndev); + unsigned long flags; + + WARN_ON(dev->txstate != STATE_IDLE); + + if (skb->len > MCTP_SERIAL_MTU) { + dev->netdev->stats.tx_dropped++; + goto out; + } + + spin_lock_irqsave(&dev->lock, flags); + netif_stop_queue(dev->netdev); + skb_copy_bits(skb, 0, dev->txbuf, skb->len); + dev->txpos = 0; + dev->txlen = skb->len; + dev->txstate = STATE_START; + spin_unlock_irqrestore(&dev->lock, flags); + + set_bit(TTY_DO_WRITE_WAKEUP, &dev->tty->flags); + schedule_work(&dev->tx_work); + +out: + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void mctp_serial_tty_write_wakeup(struct tty_struct *tty) +{ + struct mctp_serial *dev = tty->disc_data; + + schedule_work(&dev->tx_work); +} + +static void mctp_serial_rx(struct mctp_serial *dev) +{ + struct mctp_skb_cb *cb; + struct sk_buff *skb; + + if (dev->rxfcs != dev->rxfcs_rcvd) { + dev->netdev->stats.rx_dropped++; + dev->netdev->stats.rx_crc_errors++; + return; + } + + skb = netdev_alloc_skb(dev->netdev, dev->rxlen); + if (!skb) { + dev->netdev->stats.rx_dropped++; + return; + } + + skb->protocol = htons(ETH_P_MCTP); + skb_put_data(skb, dev->rxbuf, dev->rxlen); + skb_reset_network_header(skb); + + cb = __mctp_cb(skb); + cb->halen = 0; + + netif_rx_ni(skb); + dev->netdev->stats.rx_packets++; + dev->netdev->stats.rx_bytes += dev->rxlen; +} + +static void mctp_serial_push_header(struct mctp_serial *dev, unsigned char c) +{ + switch (dev->rxpos) { + case 0: + if (c == BYTE_FRAME) + dev->rxpos++; + else + dev->rxstate = STATE_ERR; + break; + case 1: + if (c == MCTP_SERIAL_VERSION) { + dev->rxpos++; + dev->rxfcs = crc_ccitt_byte(0, c); + } else { + dev->rxstate = STATE_ERR; + } + break; + case 2: + if (c > MCTP_SERIAL_FRAME_MTU) { + dev->rxstate = STATE_ERR; + } else { + dev->rxlen = c; + dev->rxpos = 0; + dev->rxstate = STATE_DATA; + dev->rxfcs = crc_ccitt_byte(dev->rxfcs, c); + } + break; + } +} + +static void mctp_serial_push_trailer(struct mctp_serial *dev, unsigned char c) +{ + switch (dev->rxpos) { + case 0: + dev->rxfcs_rcvd = c << 8; + dev->rxpos++; + break; + case 1: + dev->rxfcs_rcvd |= c; + dev->rxpos++; + break; + case 2: + if (c != BYTE_FRAME) { + dev->rxstate = STATE_ERR; + } else { + mctp_serial_rx(dev); + dev->rxlen = 0; + dev->rxpos = 0; + dev->rxstate = STATE_IDLE; + } + break; + } +} + +static void mctp_serial_push(struct mctp_serial *dev, unsigned char c) +{ + switch (dev->rxstate) { + case STATE_IDLE: + dev->rxstate = STATE_HEADER; + fallthrough; + case STATE_HEADER: + mctp_serial_push_header(dev, c); + break; + + case STATE_ESCAPE: + c |= 0x20; + fallthrough; + case STATE_DATA: + if (dev->rxstate != STATE_ESCAPE && c == BYTE_ESC) { + dev->rxstate = STATE_ESCAPE; + } else { + dev->rxfcs = crc_ccitt_byte(dev->rxfcs, c); + dev->rxbuf[dev->rxpos] = c; + dev->rxpos++; + dev->rxstate = STATE_DATA; + if (dev->rxpos == dev->rxlen) { + dev->rxpos = 0; + dev->rxstate = STATE_TRAILER; + } + } + break; + + case STATE_TRAILER: + mctp_serial_push_trailer(dev, c); + break; + + case STATE_ERR: + if (c == BYTE_FRAME) + dev->rxstate = STATE_IDLE; + break; + + default: + netdev_err_once(dev->netdev, "invalid rx state %d\n", + dev->rxstate); + } +} + +static void mctp_serial_tty_receive_buf(struct tty_struct *tty, + const unsigned char *c, + const char *f, int len) +{ + struct mctp_serial *dev = tty->disc_data; + int i; + + if (!netif_running(dev->netdev)) + return; + + /* we don't (currently) use the flag bytes, just data. */ + for (i = 0; i < len; i++) + mctp_serial_push(dev, c[i]); +} + +static const struct net_device_ops mctp_serial_netdev_ops = { + .ndo_start_xmit = mctp_serial_tx, +}; + +static void mctp_serial_setup(struct net_device *ndev) +{ + ndev->type = ARPHRD_MCTP; + ndev->mtu = MCTP_SERIAL_MTU; + ndev->hard_header_len = 0; + ndev->addr_len = 0; + ndev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + ndev->flags = IFF_NOARP; + ndev->netdev_ops = &mctp_serial_netdev_ops; + ndev->needs_free_netdev = true; +} + +static int mctp_serial_open(struct tty_struct *tty) +{ + struct mctp_serial *dev; + struct net_device *ndev; + char name[32]; + int idx, rc; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!tty->ops->write) + return -EOPNOTSUPP; + + if (tty->disc_data) + return -EEXIST; + + idx = ida_alloc(&mctp_serial_ida, GFP_KERNEL); + if (idx < 0) + return idx; + + snprintf(name, sizeof(name), "mctpserial%d", idx); + ndev = alloc_netdev(sizeof(*dev), name, NET_NAME_ENUM, + mctp_serial_setup); + if (!ndev) { + rc = -ENOMEM; + goto free_ida; + } + + dev = netdev_priv(ndev); + dev->idx = idx; + dev->tty = tty; + dev->netdev = ndev; + dev->txstate = STATE_IDLE; + dev->rxstate = STATE_IDLE; + spin_lock_init(&dev->lock); + INIT_WORK(&dev->tx_work, mctp_serial_tx_work); + + rc = register_netdev(ndev); + if (rc) + goto free_netdev; + + tty->receive_room = 64 * 1024; + tty->disc_data = dev; + + return 0; + +free_netdev: + free_netdev(ndev); + +free_ida: + ida_free(&mctp_serial_ida, idx); + return rc; +} + +static void mctp_serial_close(struct tty_struct *tty) +{ + struct mctp_serial *dev = tty->disc_data; + int idx = dev->idx; + + unregister_netdev(dev->netdev); + ida_free(&mctp_serial_ida, idx); +} + +static struct tty_ldisc_ops mctp_ldisc = { + .owner = THIS_MODULE, + .num = N_MCTP, + .name = "mctp", + .open = mctp_serial_open, + .close = mctp_serial_close, + .receive_buf = mctp_serial_tty_receive_buf, + .write_wakeup = mctp_serial_tty_write_wakeup, +}; + +static int __init mctp_serial_init(void) +{ + return tty_register_ldisc(&mctp_ldisc); +} + +static void __exit mctp_serial_exit(void) +{ + tty_unregister_ldisc(&mctp_ldisc); +} + +module_init(mctp_serial_init); +module_exit(mctp_serial_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jeremy Kerr "); +MODULE_DESCRIPTION("MCTP Serial transport"); diff --git a/include/uapi/linux/tty.h b/include/uapi/linux/tty.h index 376cccf397be..a58deb3061eb 100644 --- a/include/uapi/linux/tty.h +++ b/include/uapi/linux/tty.h @@ -38,5 +38,6 @@ #define N_NCI 25 /* NFC NCI UART */ #define N_SPEAKUP 26 /* Speakup communication with synths */ #define N_NULL 27 /* Null ldisc used for error handling */ +#define N_MCTP 28 /* MCTP-over-serial */ #endif /* _UAPI_LINUX_TTY_H */ -- cgit v1.2.3 From 0c130167aa350f0eaacec1aa4ae820120313e608 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Mon, 20 Dec 2021 10:31:04 +0800 Subject: mctp: emit RTM_NEWADDR and RTM_DELADDR Userspace can receive notification of MCTP address changes via RTNLGRP_MCTP_IFADDR rtnetlink multicast group. OpenBMC-Staging-Count: 1 Signed-off-by: Matt Johnston Link: https://lore.kernel.org/r/20211220023104.1965509-1-matt@codeconstruct.com.au Signed-off-by: Jakub Kicinski (cherry picked from commit dbcefdeb2a58039f4c81d0361056fbdd9be906a1) Signed-off-by: Joel Stanley --- include/uapi/linux/rtnetlink.h | 2 ++ net/mctp/device.c | 53 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 50 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 5888492a5257..93d934cc4613 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -754,6 +754,8 @@ enum rtnetlink_groups { #define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP RTNLGRP_BRVLAN, #define RTNLGRP_BRVLAN RTNLGRP_BRVLAN + RTNLGRP_MCTP_IFADDR, +#define RTNLGRP_MCTP_IFADDR RTNLGRP_MCTP_IFADDR __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) diff --git a/net/mctp/device.c b/net/mctp/device.c index 8799ee77e7b7..ef2755f82f87 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -35,14 +35,24 @@ struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev) return rtnl_dereference(dev->mctp_ptr); } -static int mctp_fill_addrinfo(struct sk_buff *skb, struct netlink_callback *cb, - struct mctp_dev *mdev, mctp_eid_t eid) +static int mctp_addrinfo_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(1) // IFA_LOCAL + + nla_total_size(1) // IFA_ADDRESS + ; +} + +/* flag should be NLM_F_MULTI for dump calls */ +static int mctp_fill_addrinfo(struct sk_buff *skb, + struct mctp_dev *mdev, mctp_eid_t eid, + int msg_type, u32 portid, u32 seq, int flag) { struct ifaddrmsg *hdr; struct nlmsghdr *nlh; - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RTM_NEWADDR, sizeof(*hdr), NLM_F_MULTI); + nlh = nlmsg_put(skb, portid, seq, + msg_type, sizeof(*hdr), flag); if (!nlh) return -EMSGSIZE; @@ -72,10 +82,14 @@ static int mctp_dump_dev_addrinfo(struct mctp_dev *mdev, struct sk_buff *skb, struct netlink_callback *cb) { struct mctp_dump_cb *mcb = (void *)cb->ctx; + u32 portid, seq; int rc = 0; + portid = NETLINK_CB(cb->skb).portid; + seq = cb->nlh->nlmsg_seq; for (; mcb->a_idx < mdev->num_addrs; mcb->a_idx++) { - rc = mctp_fill_addrinfo(skb, cb, mdev, mdev->addrs[mcb->a_idx]); + rc = mctp_fill_addrinfo(skb, mdev, mdev->addrs[mcb->a_idx], + RTM_NEWADDR, portid, seq, NLM_F_MULTI); if (rc < 0) break; } @@ -127,6 +141,32 @@ out: return skb->len; } +static void mctp_addr_notify(struct mctp_dev *mdev, mctp_eid_t eid, int msg_type, + struct sk_buff *req_skb, struct nlmsghdr *req_nlh) +{ + u32 portid = NETLINK_CB(req_skb).portid; + struct net *net = dev_net(mdev->dev); + struct sk_buff *skb; + int rc = -ENOBUFS; + + skb = nlmsg_new(mctp_addrinfo_size(), GFP_KERNEL); + if (!skb) + goto out; + + rc = mctp_fill_addrinfo(skb, mdev, eid, msg_type, + portid, req_nlh->nlmsg_seq, 0); + if (rc < 0) { + WARN_ON_ONCE(rc == -EMSGSIZE); + goto out; + } + + rtnl_notify(skb, net, portid, RTNLGRP_MCTP_IFADDR, req_nlh, GFP_KERNEL); + return; +out: + kfree_skb(skb); + rtnl_set_sk_err(net, RTNLGRP_MCTP_IFADDR, rc); +} + static const struct nla_policy ifa_mctp_policy[IFA_MAX + 1] = { [IFA_ADDRESS] = { .type = NLA_U8 }, [IFA_LOCAL] = { .type = NLA_U8 }, @@ -189,6 +229,7 @@ static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, kfree(tmp_addrs); + mctp_addr_notify(mdev, addr->s_addr, RTM_NEWADDR, skb, nlh); mctp_route_add_local(mdev, addr->s_addr); return 0; @@ -244,6 +285,8 @@ static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, mdev->num_addrs--; spin_unlock_irqrestore(&mdev->addrs_lock, flags); + mctp_addr_notify(mdev, addr->s_addr, RTM_DELADDR, skb, nlh); + return 0; } -- cgit v1.2.3 From 09ee56d93db6eb9c8919891edc20a398cacdd27c Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 9 Feb 2022 12:05:55 +0800 Subject: mctp: Add helper for address match checking Currently, we have a couple of paths that check that an EID matches, or the match value is MCTP_ADDR_ANY. Rather than open coding this, add a little helper. OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit 8069b22d656f6e1922352bff90ab78e6fab73779) Signed-off-by: Joel Stanley --- include/net/mctp.h | 5 +++++ net/mctp/route.c | 8 +++----- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/mctp.h b/include/net/mctp.h index 7e35ec79b909..706d329dd8e8 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -45,6 +45,11 @@ static inline bool mctp_address_ok(mctp_eid_t eid) return eid >= 8 && eid < 255; } +static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid) +{ + return match == eid || match == MCTP_ADDR_ANY; +} + static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb) { return (struct mctp_hdr *)skb_network_header(skb); diff --git a/net/mctp/route.c b/net/mctp/route.c index 8d9f4ff3e285..654467a7aeae 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -64,8 +64,7 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) if (msk->bind_type != type) continue; - if (msk->bind_addr != MCTP_ADDR_ANY && - msk->bind_addr != mh->dest) + if (!mctp_address_matches(msk->bind_addr, mh->dest)) continue; return msk; @@ -616,9 +615,8 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, if (tmp->tag & MCTP_HDR_FLAG_TO) continue; - if (!((tmp->peer_addr == daddr || - tmp->peer_addr == MCTP_ADDR_ANY) && - tmp->local_addr == saddr)) + if (!(mctp_address_matches(tmp->peer_addr, daddr) && + tmp->local_addr == saddr)) continue; spin_lock(&tmp->lock); -- cgit v1.2.3 From 75ce2111151a757729dd33d498b1d1cf58af8636 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Wed, 9 Feb 2022 12:05:57 +0800 Subject: mctp: Add SIOCMCTP{ALLOC,DROP}TAG ioctls for tag control This change adds a couple of new ioctls for mctp sockets: SIOCMCTPALLOCTAG and SIOCMCTPDROPTAG. These ioctls provide facilities for explicit allocation / release of tags, overriding the automatic allocate-on-send/release-on-reply and timeout behaviours. This allows userspace more control over messages that may not fit a simple request/response model. In order to indicate a pre-allocated tag to the sendmsg() syscall, we introduce a new flag to the struct sockaddr_mctp.smctp_tag value: MCTP_TAG_PREALLOC. Additional changes from Jeremy Kerr . Contains a fix that was: Reported-by: kernel test robot OpenBMC-Staging-Count: 1 Signed-off-by: Matt Johnston Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller (cherry picked from commit 63ed1aab3d40aa61aaa66819bdce9377ac7f40fa) Signed-off-by: Joel Stanley --- Documentation/networking/mctp.rst | 48 ++++++++++ include/net/mctp.h | 11 ++- include/trace/events/mctp.h | 5 +- include/uapi/linux/mctp.h | 18 ++++ net/mctp/af_mctp.c | 189 ++++++++++++++++++++++++++++++++------ net/mctp/route.c | 114 +++++++++++++++++------ 6 files changed, 329 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/Documentation/networking/mctp.rst b/Documentation/networking/mctp.rst index 46f74bffce0f..c628cb5406d2 100644 --- a/Documentation/networking/mctp.rst +++ b/Documentation/networking/mctp.rst @@ -212,6 +212,54 @@ remote address is already known, or the message does not require a reply. Like the send calls, sockets will only receive responses to requests they have sent (TO=1) and may only respond (TO=0) to requests they have received. +``ioctl(SIOCMCTPALLOCTAG)`` and ``ioctl(SIOCMCTPDROPTAG)`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +These tags give applications more control over MCTP message tags, by allocating +(and dropping) tag values explicitly, rather than the kernel automatically +allocating a per-message tag at ``sendmsg()`` time. + +In general, you will only need to use these ioctls if your MCTP protocol does +not fit the usual request/response model. For example, if you need to persist +tags across multiple requests, or a request may generate more than one response. +In these cases, the ioctls allow you to decouple the tag allocation (and +release) from individual message send and receive operations. + +Both ioctls are passed a pointer to a ``struct mctp_ioc_tag_ctl``: + +.. code-block:: C + + struct mctp_ioc_tag_ctl { + mctp_eid_t peer_addr; + __u8 tag; + __u16 flags; + }; + +``SIOCMCTPALLOCTAG`` allocates a tag for a specific peer, which an application +can use in future ``sendmsg()`` calls. The application populates the +``peer_addr`` member with the remote EID. Other fields must be zero. + +On return, the ``tag`` member will be populated with the allocated tag value. +The allocated tag will have the following tag bits set: + + - ``MCTP_TAG_OWNER``: it only makes sense to allocate tags if you're the tag + owner + + - ``MCTP_TAG_PREALLOC``: to indicate to ``sendmsg()`` that this is a + preallocated tag. + + - ... and the actual tag value, within the least-significant three bits + (``MCTP_TAG_MASK``). Note that zero is a valid tag value. + +The tag value should be used as-is for the ``smctp_tag`` member of ``struct +sockaddr_mctp``. + +``SIOCMCTPDROPTAG`` releases a tag that has been previously allocated by a +``SIOCMCTPALLOCTAG`` ioctl. The ``peer_addr`` must be the same as used for the +allocation, and the ``tag`` value must match exactly the tag returned from the +allocation (including the ``MCTP_TAG_OWNER`` and ``MCTP_TAG_PREALLOC`` bits). +The ``flags`` field must be zero. + Kernel internals ================ diff --git a/include/net/mctp.h b/include/net/mctp.h index 706d329dd8e8..e80a4baf8379 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -126,7 +126,7 @@ struct mctp_sock { */ struct mctp_sk_key { mctp_eid_t peer_addr; - mctp_eid_t local_addr; + mctp_eid_t local_addr; /* MCTP_ADDR_ANY for local owned tags */ __u8 tag; /* incoming tag match; invert TO for local */ /* we hold a ref to sk when set */ @@ -163,6 +163,12 @@ struct mctp_sk_key { */ unsigned long dev_flow_state; struct mctp_dev *dev; + + /* a tag allocated with SIOCMCTPALLOCTAG ioctl will not expire + * automatically on timeout or response, instead SIOCMCTPDROPTAG + * is used. + */ + bool manual_alloc; }; struct mctp_skb_cb { @@ -239,6 +245,9 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); +struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t daddr, mctp_eid_t saddr, + bool manual, u8 *tagp); /* routing <--> device interface */ unsigned int mctp_default_net(struct net *net); diff --git a/include/trace/events/mctp.h b/include/trace/events/mctp.h index 175b057c507f..165cf25f77a7 100644 --- a/include/trace/events/mctp.h +++ b/include/trace/events/mctp.h @@ -15,6 +15,7 @@ enum { MCTP_TRACE_KEY_REPLIED, MCTP_TRACE_KEY_INVALIDATED, MCTP_TRACE_KEY_CLOSED, + MCTP_TRACE_KEY_DROPPED, }; #endif /* __TRACE_MCTP_ENUMS */ @@ -22,6 +23,7 @@ TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_TIMEOUT); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_REPLIED); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_INVALIDATED); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_CLOSED); +TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_DROPPED); TRACE_EVENT(mctp_key_acquire, TP_PROTO(const struct mctp_sk_key *key), @@ -66,7 +68,8 @@ TRACE_EVENT(mctp_key_release, { MCTP_TRACE_KEY_TIMEOUT, "timeout" }, { MCTP_TRACE_KEY_REPLIED, "replied" }, { MCTP_TRACE_KEY_INVALIDATED, "invalidated" }, - { MCTP_TRACE_KEY_CLOSED, "closed" }) + { MCTP_TRACE_KEY_CLOSED, "closed" }, + { MCTP_TRACE_KEY_DROPPED, "dropped" }) ) ); diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 07b0318716fc..154ab56651f1 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -44,7 +44,25 @@ struct sockaddr_mctp_ext { #define MCTP_TAG_MASK 0x07 #define MCTP_TAG_OWNER 0x08 +#define MCTP_TAG_PREALLOC 0x10 #define MCTP_OPT_ADDR_EXT 1 +#define SIOCMCTPALLOCTAG (SIOCPROTOPRIVATE + 0) +#define SIOCMCTPDROPTAG (SIOCPROTOPRIVATE + 1) + +struct mctp_ioc_tag_ctl { + mctp_eid_t peer_addr; + + /* For SIOCMCTPALLOCTAG: must be passed as zero, kernel will + * populate with the allocated tag value. Returned tag value will + * always have TO and PREALLOC set. + * + * For SIOCMCTPDROPTAG: userspace provides tag value to drop, from + * a prior SIOCMCTPALLOCTAG call (and so must have TO and PREALLOC set). + */ + __u8 tag; + __u16 flags; +}; + #endif /* __UAPI_MCTP_H */ diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index c921de63b494..f0702d920d8d 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -6,6 +6,7 @@ * Copyright (c) 2021 Google */ +#include #include #include #include @@ -21,6 +22,8 @@ /* socket implementation */ +static void mctp_sk_expire_keys(struct timer_list *timer); + static int mctp_release(struct socket *sock) { struct sock *sk = sock->sk; @@ -99,13 +102,20 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct sk_buff *skb; if (addr) { + const u8 tagbits = MCTP_TAG_MASK | MCTP_TAG_OWNER | + MCTP_TAG_PREALLOC; + if (addrlen < sizeof(struct sockaddr_mctp)) return -EINVAL; if (addr->smctp_family != AF_MCTP) return -EINVAL; if (!mctp_sockaddr_is_ok(addr)) return -EINVAL; - if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER)) + if (addr->smctp_tag & ~tagbits) + return -EINVAL; + /* can't preallocate a non-owned tag */ + if (addr->smctp_tag & MCTP_TAG_PREALLOC && + !(addr->smctp_tag & MCTP_TAG_OWNER)) return -EINVAL; } else { @@ -248,6 +258,32 @@ out_free: return rc; } +/* We're done with the key; invalidate, stop reassembly, and remove from lists. + */ +static void __mctp_key_remove(struct mctp_sk_key *key, struct net *net, + unsigned long flags, unsigned long reason) +__releases(&key->lock) +__must_hold(&net->mctp.keys_lock) +{ + struct sk_buff *skb; + + trace_mctp_key_release(key, reason); + skb = key->reasm_head; + key->reasm_head = NULL; + key->reasm_dead = true; + key->valid = false; + mctp_dev_release_key(key->dev, key); + spin_unlock_irqrestore(&key->lock, flags); + + hlist_del(&key->hlist); + hlist_del(&key->sklist); + + /* unref for the lists */ + mctp_key_unref(key); + + kfree_skb(skb); +} + static int mctp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -293,6 +329,115 @@ static int mctp_getsockopt(struct socket *sock, int level, int optname, return -EINVAL; } +static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg) +{ + struct net *net = sock_net(&msk->sk); + struct mctp_sk_key *key = NULL; + struct mctp_ioc_tag_ctl ctl; + unsigned long flags; + u8 tag; + + if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl))) + return -EFAULT; + + if (ctl.tag) + return -EINVAL; + + if (ctl.flags) + return -EINVAL; + + key = mctp_alloc_local_tag(msk, ctl.peer_addr, MCTP_ADDR_ANY, + true, &tag); + if (IS_ERR(key)) + return PTR_ERR(key); + + ctl.tag = tag | MCTP_TAG_OWNER | MCTP_TAG_PREALLOC; + if (copy_to_user((void __user *)arg, &ctl, sizeof(ctl))) { + spin_lock_irqsave(&key->lock, flags); + __mctp_key_remove(key, net, flags, MCTP_TRACE_KEY_DROPPED); + mctp_key_unref(key); + return -EFAULT; + } + + mctp_key_unref(key); + return 0; +} + +static int mctp_ioctl_droptag(struct mctp_sock *msk, unsigned long arg) +{ + struct net *net = sock_net(&msk->sk); + struct mctp_ioc_tag_ctl ctl; + unsigned long flags, fl2; + struct mctp_sk_key *key; + struct hlist_node *tmp; + int rc; + u8 tag; + + if (copy_from_user(&ctl, (void __user *)arg, sizeof(ctl))) + return -EFAULT; + + if (ctl.flags) + return -EINVAL; + + /* Must be a local tag, TO set, preallocated */ + if ((ctl.tag & ~MCTP_TAG_MASK) != (MCTP_TAG_OWNER | MCTP_TAG_PREALLOC)) + return -EINVAL; + + tag = ctl.tag & MCTP_TAG_MASK; + rc = -EINVAL; + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { + /* we do an irqsave here, even though we know the irq state, + * so we have the flags to pass to __mctp_key_remove + */ + spin_lock_irqsave(&key->lock, fl2); + if (key->manual_alloc && + ctl.peer_addr == key->peer_addr && + tag == key->tag) { + __mctp_key_remove(key, net, fl2, + MCTP_TRACE_KEY_DROPPED); + rc = 0; + } else { + spin_unlock_irqrestore(&key->lock, fl2); + } + } + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + return rc; +} + +static int mctp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk); + + switch (cmd) { + case SIOCMCTPALLOCTAG: + return mctp_ioctl_alloctag(msk, arg); + case SIOCMCTPDROPTAG: + return mctp_ioctl_droptag(msk, arg); + } + + return -EINVAL; +} + +#ifdef CONFIG_COMPAT +static int mctp_compat_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = compat_ptr(arg); + + switch (cmd) { + /* These have compatible ptr layouts */ + case SIOCMCTPALLOCTAG: + case SIOCMCTPDROPTAG: + return mctp_ioctl(sock, cmd, (unsigned long)argp); + } + + return -ENOIOCTLCMD; +} +#endif + static const struct proto_ops mctp_dgram_ops = { .family = PF_MCTP, .release = mctp_release, @@ -302,7 +447,7 @@ static const struct proto_ops mctp_dgram_ops = { .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, - .ioctl = sock_no_ioctl, + .ioctl = mctp_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, @@ -312,6 +457,9 @@ static const struct proto_ops mctp_dgram_ops = { .recvmsg = mctp_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, +#ifdef CONFIG_COMPAT + .compat_ioctl = mctp_compat_ioctl, +#endif }; static void mctp_sk_expire_keys(struct timer_list *timer) @@ -319,7 +467,7 @@ static void mctp_sk_expire_keys(struct timer_list *timer) struct mctp_sock *msk = container_of(timer, struct mctp_sock, key_expiry); struct net *net = sock_net(&msk->sk); - unsigned long next_expiry, flags; + unsigned long next_expiry, flags, fl2; struct mctp_sk_key *key; struct hlist_node *tmp; bool next_expiry_valid = false; @@ -327,15 +475,16 @@ static void mctp_sk_expire_keys(struct timer_list *timer) spin_lock_irqsave(&net->mctp.keys_lock, flags); hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { - spin_lock(&key->lock); + /* don't expire. manual_alloc is immutable, no locking + * required. + */ + if (key->manual_alloc) + continue; + spin_lock_irqsave(&key->lock, fl2); if (!time_after_eq(key->expiry, jiffies)) { - trace_mctp_key_release(key, MCTP_TRACE_KEY_TIMEOUT); - key->valid = false; - hlist_del_rcu(&key->hlist); - hlist_del_rcu(&key->sklist); - spin_unlock(&key->lock); - mctp_key_unref(key); + __mctp_key_remove(key, net, fl2, + MCTP_TRACE_KEY_TIMEOUT); continue; } @@ -346,7 +495,7 @@ static void mctp_sk_expire_keys(struct timer_list *timer) next_expiry = key->expiry; next_expiry_valid = true; } - spin_unlock(&key->lock); + spin_unlock_irqrestore(&key->lock, fl2); } spin_unlock_irqrestore(&net->mctp.keys_lock, flags); @@ -387,9 +536,9 @@ static void mctp_sk_unhash(struct sock *sk) { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct net *net = sock_net(sk); + unsigned long flags, fl2; struct mctp_sk_key *key; struct hlist_node *tmp; - unsigned long flags; /* remove from any type-based binds */ mutex_lock(&net->mctp.bind_lock); @@ -399,20 +548,8 @@ static void mctp_sk_unhash(struct sock *sk) /* remove tag allocations */ spin_lock_irqsave(&net->mctp.keys_lock, flags); hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { - hlist_del(&key->sklist); - hlist_del(&key->hlist); - - trace_mctp_key_release(key, MCTP_TRACE_KEY_CLOSED); - - spin_lock(&key->lock); - kfree_skb(key->reasm_head); - key->reasm_head = NULL; - key->reasm_dead = true; - key->valid = false; - spin_unlock(&key->lock); - - /* key is no longer on the lookup lists, unref */ - mctp_key_unref(key); + spin_lock_irqsave(&key->lock, fl2); + __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_CLOSED); } spin_unlock_irqrestore(&net->mctp.keys_lock, flags); } diff --git a/net/mctp/route.c b/net/mctp/route.c index 35f72e99e188..17e3482aa770 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -203,29 +203,38 @@ static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) return rc; } -/* We're done with the key; unset valid and remove from lists. There may still - * be outstanding refs on the key though... +/* Helper for mctp_route_input(). + * We're done with the key; unlock and unref the key. + * For the usual case of automatic expiry we remove the key from lists. + * In the case that manual allocation is set on a key we release the lock + * and local ref, reset reassembly, but don't remove from lists. */ -static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, - unsigned long flags) - __releases(&key->lock) +static void __mctp_key_done_in(struct mctp_sk_key *key, struct net *net, + unsigned long flags, unsigned long reason) +__releases(&key->lock) { struct sk_buff *skb; + trace_mctp_key_release(key, reason); skb = key->reasm_head; key->reasm_head = NULL; - key->reasm_dead = true; - key->valid = false; - mctp_dev_release_key(key->dev, key); + + if (!key->manual_alloc) { + key->reasm_dead = true; + key->valid = false; + mctp_dev_release_key(key->dev, key); + } spin_unlock_irqrestore(&key->lock, flags); - spin_lock_irqsave(&net->mctp.keys_lock, flags); - hlist_del(&key->hlist); - hlist_del(&key->sklist); - spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + if (!key->manual_alloc) { + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_del(&key->hlist); + hlist_del(&key->sklist); + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - /* one unref for the lists */ - mctp_key_unref(key); + /* unref for the lists */ + mctp_key_unref(key); + } /* and one for the local reference */ mctp_key_unref(key); @@ -379,9 +388,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) /* we've hit a pending reassembly; not much we * can do but drop it */ - trace_mctp_key_release(key, - MCTP_TRACE_KEY_REPLIED); - __mctp_key_unlock_drop(key, net, f); + __mctp_key_done_in(key, net, f, + MCTP_TRACE_KEY_REPLIED); key = NULL; } rc = 0; @@ -423,9 +431,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) } else { if (key->reasm_head || key->reasm_dead) { /* duplicate start? drop everything */ - trace_mctp_key_release(key, - MCTP_TRACE_KEY_INVALIDATED); - __mctp_key_unlock_drop(key, net, f); + __mctp_key_done_in(key, net, f, + MCTP_TRACE_KEY_INVALIDATED); rc = -EEXIST; key = NULL; } else { @@ -448,10 +455,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * the reassembly/response key */ if (!rc && flags & MCTP_HDR_FLAG_EOM) { + msk = container_of(key->sk, struct mctp_sock, sk); sock_queue_rcv_skb(key->sk, key->reasm_head); key->reasm_head = NULL; - trace_mctp_key_release(key, MCTP_TRACE_KEY_REPLIED); - __mctp_key_unlock_drop(key, net, f); + __mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED); key = NULL; } @@ -579,9 +586,9 @@ static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, /* Allocate a locally-owned tag value for (saddr, daddr), and reserve * it for the socket msk */ -static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, - mctp_eid_t saddr, - mctp_eid_t daddr, u8 *tagp) +struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t daddr, mctp_eid_t saddr, + bool manual, u8 *tagp) { struct net *net = sock_net(&msk->sk); struct netns_mctp *mns = &net->mctp; @@ -636,6 +643,7 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, mctp_reserve_tag(net, key, msk); trace_mctp_key_acquire(key); + key->manual_alloc = manual; *tagp = key->tag; } @@ -649,6 +657,50 @@ static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, return key; } +static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk, + mctp_eid_t daddr, + u8 req_tag, u8 *tagp) +{ + struct net *net = sock_net(&msk->sk); + struct netns_mctp *mns = &net->mctp; + struct mctp_sk_key *key, *tmp; + unsigned long flags; + + req_tag &= ~(MCTP_TAG_PREALLOC | MCTP_TAG_OWNER); + key = NULL; + + spin_lock_irqsave(&mns->keys_lock, flags); + + hlist_for_each_entry(tmp, &mns->keys, hlist) { + if (tmp->tag != req_tag) + continue; + + if (!mctp_address_matches(tmp->peer_addr, daddr)) + continue; + + if (!tmp->manual_alloc) + continue; + + spin_lock(&tmp->lock); + if (tmp->valid) { + key = tmp; + refcount_inc(&key->refs); + spin_unlock(&tmp->lock); + break; + } + spin_unlock(&tmp->lock); + } + spin_unlock_irqrestore(&mns->keys_lock, flags); + + if (!key) + return ERR_PTR(-ENOENT); + + if (tagp) + *tagp = key->tag; + + return key; +} + /* routing lookups */ static bool mctp_rt_match_eid(struct mctp_route *rt, unsigned int net, mctp_eid_t eid) @@ -843,8 +895,14 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, if (rc) goto out_release; - if (req_tag & MCTP_HDR_FLAG_TO) { - key = mctp_alloc_local_tag(msk, saddr, daddr, &tag); + if (req_tag & MCTP_TAG_OWNER) { + if (req_tag & MCTP_TAG_PREALLOC) + key = mctp_lookup_prealloc_tag(msk, daddr, + req_tag, &tag); + else + key = mctp_alloc_local_tag(msk, daddr, saddr, + false, &tag); + if (IS_ERR(key)) { rc = PTR_ERR(key); goto out_release; @@ -855,7 +913,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, tag |= MCTP_HDR_FLAG_TO; } else { key = NULL; - tag = req_tag; + tag = req_tag & MCTP_TAG_MASK; } skb->protocol = htons(ETH_P_MCTP); -- cgit v1.2.3 From 50126574fd6ab6dcaad87b2c0d0c32d74a4e3db2 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 18 Feb 2022 12:25:53 +0800 Subject: mctp: replace mctp_address_ok with more fine-grained helpers Currently, we have mctp_address_ok(), which checks if an EID is in the "valid" range of 8-254 inclusive. However, 0 and 255 may also be valid addresses, depending on context. 0 is the NULL EID, which may be set when physical addressing is used. 255 is valid as a destination address for broadcasts. This change renames mctp_address_ok to mctp_address_unicast, and adds similar helpers for broadcast and null EIDs, which will be used in an upcoming commit. OpenBMC-Staging-Count: 1 Signed-off-by: Jeremy Kerr Signed-off-by: Jakub Kicinski (cherry picked from commit cb196b725936f6b776ad1d073f66fbe92aa798fa) Signed-off-by: Joel Stanley --- include/net/mctp.h | 12 +++++++++++- net/mctp/device.c | 2 +- net/mctp/neigh.c | 2 +- net/mctp/route.c | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/mctp.h b/include/net/mctp.h index e80a4baf8379..d37268fe6825 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -40,11 +40,21 @@ struct mctp_hdr { #define MCTP_INITIAL_DEFAULT_NET 1 -static inline bool mctp_address_ok(mctp_eid_t eid) +static inline bool mctp_address_unicast(mctp_eid_t eid) { return eid >= 8 && eid < 255; } +static inline bool mctp_address_broadcast(mctp_eid_t eid) +{ + return eid == 255; +} + +static inline bool mctp_address_null(mctp_eid_t eid) +{ + return eid == 0; +} + static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid) { return match == eid || match == MCTP_ADDR_ANY; diff --git a/net/mctp/device.c b/net/mctp/device.c index ef2755f82f87..380ef4cf8948 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -208,7 +208,7 @@ static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!mdev) return -ENODEV; - if (!mctp_address_ok(addr->s_addr)) + if (!mctp_address_unicast(addr->s_addr)) return -EINVAL; /* Prevent duplicates. Under RTNL so don't need to lock for reading */ diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c index 6ad3e33bd4d4..ffa0f9e0983f 100644 --- a/net/mctp/neigh.c +++ b/net/mctp/neigh.c @@ -143,7 +143,7 @@ static int mctp_rtm_newneigh(struct sk_buff *skb, struct nlmsghdr *nlh, } eid = nla_get_u8(tb[NDA_DST]); - if (!mctp_address_ok(eid)) { + if (!mctp_address_unicast(eid)) { NL_SET_ERR_MSG(extack, "Invalid neighbour EID"); return -EINVAL; } diff --git a/net/mctp/route.c b/net/mctp/route.c index 0c4c56e1bd6e..6a11d78cfbab 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -962,7 +962,7 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, struct net *net = dev_net(mdev->dev); struct mctp_route *rt, *ert; - if (!mctp_address_ok(daddr_start)) + if (!mctp_address_unicast(daddr_start)) return -EINVAL; if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255) -- cgit v1.2.3 From e6d7e51e109248c9390c512a0e6ad13b3e122a35 Mon Sep 17 00:00:00 2001 From: Yan Yan Date: Tue, 18 Jan 2022 16:00:13 -0800 Subject: xfrm: Check if_id in xfrm_migrate [ Upstream commit c1aca3080e382886e2e58e809787441984a2f89b ] This patch enables distinguishing SAs and SPs based on if_id during the xfrm_migrate flow. This ensures support for xfrm interfaces throughout the SA/SP lifecycle. When there are multiple existing SPs with the same direction, the same xfrm_selector and different endpoint addresses, xfrm_migrate might fail with ENODATA. Specifically, the code path for performing xfrm_migrate is: Stage 1: find policy to migrate with xfrm_migrate_policy_find(sel, dir, type, net) Stage 2: find and update state(s) with xfrm_migrate_state_find(mp, net) Stage 3: update endpoint address(es) of template(s) with xfrm_policy_migrate(pol, m, num_migrate) Currently "Stage 1" always returns the first xfrm_policy that matches, and "Stage 3" looks for the xfrm_tmpl that matches the old endpoint address. Thus if there are multiple xfrm_policy with same selector, direction, type and net, "Stage 1" might rertun a wrong xfrm_policy and "Stage 3" will fail with ENODATA because it cannot find a xfrm_tmpl with the matching endpoint address. The fix is to allow userspace to pass an if_id and add if_id to the matching rule in Stage 1 and Stage 2 since if_id is a unique ID for xfrm_policy and xfrm_state. For compatibility, if_id will only be checked if the attribute is set. Tested with additions to Android's kernel unit test suite: https://android-review.googlesource.com/c/kernel/tests/+/1668886 Signed-off-by: Yan Yan Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- include/net/xfrm.h | 5 +++-- net/key/af_key.c | 2 +- net/xfrm/xfrm_policy.c | 14 ++++++++------ net/xfrm/xfrm_state.c | 7 ++++++- net/xfrm/xfrm_user.c | 6 +++++- 5 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 301a164f17e9..358dfe6fefef 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1679,14 +1679,15 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net); +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, u32 if_id); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); diff --git a/net/key/af_key.c b/net/key/af_key.c index de24a7d474df..9bf52a09b5ff 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2623,7 +2623,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL); + kma ? &k : NULL, net, NULL, 0); out: return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 37b149f63262..02099d113a0a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4259,7 +4259,7 @@ static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, } static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, - u8 dir, u8 type, struct net *net) + u8 dir, u8 type, struct net *net, u32 if_id) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; @@ -4268,7 +4268,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; priority = ret->priority; @@ -4280,7 +4281,8 @@ static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector * if ((pol->priority >= priority) && ret) break; - if (xfrm_migrate_selector_match(sel, &pol->selector) && + if ((if_id == 0 || pol->if_id == if_id) && + xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; break; @@ -4396,7 +4398,7 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap) + struct xfrm_encap_tmpl *encap, u32 if_id) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4415,14 +4417,14 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 1 - find policy */ - if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) { + if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) { err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { - if ((x = xfrm_migrate_state_find(mp, net))) { + if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 100b4b3723e7..291236d7676f 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1605,7 +1605,8 @@ out: return NULL; } -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net) +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id) { unsigned int h; struct xfrm_state *x = NULL; @@ -1621,6 +1622,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n continue; if (m->reqid && x->props.reqid != m->reqid) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, @@ -1636,6 +1639,8 @@ struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *n if (x->props.mode != m->mode || x->id.proto != m->proto) continue; + if (if_id != 0 && x->if_id != if_id) + continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e13198f7e4e9..2acba159327c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2592,6 +2592,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); struct xfrm_encap_tmpl *encap = NULL; + u32 if_id = 0; if (attrs[XFRMA_MIGRATE] == NULL) return -EINVAL; @@ -2616,7 +2617,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENOMEM; } - err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap); + if (attrs[XFRMA_IF_ID]) + if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + + err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, if_id); kfree(encap); -- cgit v1.2.3 From 093f11b496091fe80443e53d8a7895252c959529 Mon Sep 17 00:00:00 2001 From: Jiyong Park Date: Fri, 11 Mar 2022 11:00:16 +0900 Subject: vsock: each transport cycles only on its own sockets [ Upstream commit 8e6ed963763fe21429eabfc76c69ce2b0163a3dd ] When iterating over sockets using vsock_for_each_connected_socket, make sure that a transport filters out sockets that don't belong to the transport. There actually was an issue caused by this; in a nested VM configuration, destroying the nested VM (which often involves the closing of /dev/vhost-vsock if there was h2g connections to the nested VM) kills not only the h2g connections, but also all existing g2h connections to the (outmost) host which are totally unrelated. Tested: Executed the following steps on Cuttlefish (Android running on a VM) [1]: (1) Enter into an `adb shell` session - to have a g2h connection inside the VM, (2) open and then close /dev/vhost-vsock by `exec 3< /dev/vhost-vsock && exec 3<&-`, (3) observe that the adb session is not reset. [1] https://android.googlesource.com/device/google/cuttlefish/ Fixes: c0cfa2d8a788 ("vsock: add multi-transports support") Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Signed-off-by: Jiyong Park Link: https://lore.kernel.org/r/20220311020017.1509316-1-jiyong@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/vhost/vsock.c | 3 ++- include/net/af_vsock.h | 3 ++- net/vmw_vsock/af_vsock.c | 9 +++++++-- net/vmw_vsock/virtio_transport.c | 7 +++++-- net/vmw_vsock/vmci_transport.c | 5 ++++- 5 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index d07a20bbc07b..dcb1585819a1 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -757,7 +757,8 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file) /* Iterating over all connections for all CIDs to find orphans is * inefficient. Room for improvement here. */ - vsock_for_each_connected_socket(vhost_vsock_reset_orphans); + vsock_for_each_connected_socket(&vhost_transport.transport, + vhost_vsock_reset_orphans); /* Don't check the owner, because we are in the release path, so we * need to stop the vsock device in any case. diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index ab207677e0a8..f742e50207fb 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -205,7 +205,8 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); -void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); +void vsock_for_each_connected_socket(struct vsock_transport *transport, + void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 91a5c65707ba..5df530e89e5a 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -333,7 +333,8 @@ void vsock_remove_sock(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(vsock_remove_sock); -void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) +void vsock_for_each_connected_socket(struct vsock_transport *transport, + void (*fn)(struct sock *sk)) { int i; @@ -342,8 +343,12 @@ void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)) for (i = 0; i < ARRAY_SIZE(vsock_connected_table); i++) { struct vsock_sock *vsk; list_for_each_entry(vsk, &vsock_connected_table[i], - connected_table) + connected_table) { + if (vsk->transport != transport) + continue; + fn(sk_vsock(vsk)); + } } spin_unlock_bh(&vsock_table_lock); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 4f7c99dfd16c..dad9ca65f4f9 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -24,6 +24,7 @@ static struct workqueue_struct *virtio_vsock_workqueue; static struct virtio_vsock __rcu *the_virtio_vsock; static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */ +static struct virtio_transport virtio_transport; /* forward declaration */ struct virtio_vsock { struct virtio_device *vdev; @@ -384,7 +385,8 @@ static void virtio_vsock_event_handle(struct virtio_vsock *vsock, switch (le32_to_cpu(event->id)) { case VIRTIO_VSOCK_EVENT_TRANSPORT_RESET: virtio_vsock_update_guest_cid(vsock); - vsock_for_each_connected_socket(virtio_vsock_reset_sock); + vsock_for_each_connected_socket(&virtio_transport.transport, + virtio_vsock_reset_sock); break; } } @@ -662,7 +664,8 @@ static void virtio_vsock_remove(struct virtio_device *vdev) synchronize_rcu(); /* Reset all connected sockets when the device disappear */ - vsock_for_each_connected_socket(virtio_vsock_reset_sock); + vsock_for_each_connected_socket(&virtio_transport.transport, + virtio_vsock_reset_sock); /* Stop all work handlers to make sure no one is accessing the device, * so we can safely call vdev->config->reset(). diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 7aef34e32bdf..b17dc9745188 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -75,6 +75,8 @@ static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; static int PROTOCOL_OVERRIDE = -1; +static struct vsock_transport vmci_transport; /* forward declaration */ + /* Helper function to convert from a VMCI error code to a VSock error code. */ static s32 vmci_transport_error_to_vsock_error(s32 vmci_error) @@ -882,7 +884,8 @@ static void vmci_transport_qp_resumed_cb(u32 sub_id, const struct vmci_event_data *e_data, void *client_data) { - vsock_for_each_connected_socket(vmci_transport_handle_detach); + vsock_for_each_connected_socket(&vmci_transport, + vmci_transport_handle_detach); } static void vmci_transport_recv_pkt_work(struct work_struct *work) -- cgit v1.2.3 From a1adf00e17282a126d2002d1c5a8da14ac5fff1d Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 15 Mar 2022 10:20:08 +0100 Subject: net: handle ARPHRD_PIMREG in dev_is_mac_header_xmit() [ Upstream commit 4ee06de7729d795773145692e246a06448b1eb7a ] This kind of interface doesn't have a mac header. This patch fixes bpf_redirect() to a PIM interface. Fixes: 27b29f63058d ("bpf: add bpf_redirect() helper") Signed-off-by: Nicolas Dichtel Link: https://lore.kernel.org/r/20220315092008.31423-1-nicolas.dichtel@6wind.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/linux/if_arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index b712217f7030..1ed52441972f 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -52,6 +52,7 @@ static inline bool dev_is_mac_header_xmit(const struct net_device *dev) case ARPHRD_VOID: case ARPHRD_NONE: case ARPHRD_RAWIP: + case ARPHRD_PIMREG: return false; default: return true; -- cgit v1.2.3 From 33061d0fba51d2bf70a2ef9645f703c33fe8e438 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 22 Mar 2022 18:07:17 +0100 Subject: ALSA: pcm: Fix races among concurrent hw_params and hw_free calls commit 92ee3c60ec9fe64404dc035e7c41277d74aa26cb upstream. Currently we have neither proper check nor protection against the concurrent calls of PCM hw_params and hw_free ioctls, which may result in a UAF. Since the existing PCM stream lock can't be used for protecting the whole ioctl operations, we need a new mutex to protect those racy calls. This patch introduced a new mutex, runtime->buffer_mutex, and applies it to both hw_params and hw_free ioctl code paths. Along with it, the both functions are slightly modified (the mmap_count check is moved into the state-check block) for code simplicity. Reported-by: Hu Jiahui Cc: Reviewed-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20220322170720.3529-2-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/sound/pcm.h | 1 + sound/core/pcm.c | 2 ++ sound/core/pcm_native.c | 61 +++++++++++++++++++++++++++++++------------------ 3 files changed, 42 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 33451f8ff755..2018b7512d3d 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -398,6 +398,7 @@ struct snd_pcm_runtime { wait_queue_head_t tsleep; /* transfer sleep */ struct fasync_struct *fasync; bool stop_operating; /* sync_stop will be called */ + struct mutex buffer_mutex; /* protect for buffer changes */ /* -- private section -- */ void *private_data; diff --git a/sound/core/pcm.c b/sound/core/pcm.c index ba4a987ed1c6..edd9849210f2 100644 --- a/sound/core/pcm.c +++ b/sound/core/pcm.c @@ -969,6 +969,7 @@ int snd_pcm_attach_substream(struct snd_pcm *pcm, int stream, init_waitqueue_head(&runtime->tsleep); runtime->status->state = SNDRV_PCM_STATE_OPEN; + mutex_init(&runtime->buffer_mutex); substream->runtime = runtime; substream->private_data = pcm->private_data; @@ -1002,6 +1003,7 @@ void snd_pcm_detach_substream(struct snd_pcm_substream *substream) } else { substream->runtime = NULL; } + mutex_destroy(&runtime->buffer_mutex); kfree(runtime); put_pid(substream->pid); substream->pid = NULL; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index d233cb3b41d8..2bd4604978a6 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -672,33 +672,40 @@ static int snd_pcm_hw_params_choose(struct snd_pcm_substream *pcm, return 0; } +#if IS_ENABLED(CONFIG_SND_PCM_OSS) +#define is_oss_stream(substream) ((substream)->oss.oss) +#else +#define is_oss_stream(substream) false +#endif + static int snd_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_runtime *runtime; - int err, usecs; + int err = 0, usecs; unsigned int bits; snd_pcm_uframes_t frames; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; + mutex_lock(&runtime->buffer_mutex); snd_pcm_stream_lock_irq(substream); switch (runtime->status->state) { case SNDRV_PCM_STATE_OPEN: case SNDRV_PCM_STATE_SETUP: case SNDRV_PCM_STATE_PREPARED: + if (!is_oss_stream(substream) && + atomic_read(&substream->mmap_count)) + err = -EBADFD; break; default: - snd_pcm_stream_unlock_irq(substream); - return -EBADFD; + err = -EBADFD; + break; } snd_pcm_stream_unlock_irq(substream); -#if IS_ENABLED(CONFIG_SND_PCM_OSS) - if (!substream->oss.oss) -#endif - if (atomic_read(&substream->mmap_count)) - return -EBADFD; + if (err) + goto unlock; snd_pcm_sync_stop(substream, true); @@ -786,16 +793,21 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, if (usecs >= 0) cpu_latency_qos_add_request(&substream->latency_pm_qos_req, usecs); - return 0; + err = 0; _error: - /* hardware might be unusable from this time, - so we force application to retry to set - the correct hardware parameter settings */ - snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); - if (substream->ops->hw_free != NULL) - substream->ops->hw_free(substream); - if (substream->managed_buffer_alloc) - snd_pcm_lib_free_pages(substream); + if (err) { + /* hardware might be unusable from this time, + * so we force application to retry to set + * the correct hardware parameter settings + */ + snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); + if (substream->ops->hw_free != NULL) + substream->ops->hw_free(substream); + if (substream->managed_buffer_alloc) + snd_pcm_lib_free_pages(substream); + } + unlock: + mutex_unlock(&runtime->buffer_mutex); return err; } @@ -835,26 +847,31 @@ static int do_hw_free(struct snd_pcm_substream *substream) static int snd_pcm_hw_free(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; - int result; + int result = 0; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; + mutex_lock(&runtime->buffer_mutex); snd_pcm_stream_lock_irq(substream); switch (runtime->status->state) { case SNDRV_PCM_STATE_SETUP: case SNDRV_PCM_STATE_PREPARED: + if (atomic_read(&substream->mmap_count)) + result = -EBADFD; break; default: - snd_pcm_stream_unlock_irq(substream); - return -EBADFD; + result = -EBADFD; + break; } snd_pcm_stream_unlock_irq(substream); - if (atomic_read(&substream->mmap_count)) - return -EBADFD; + if (result) + goto unlock; result = do_hw_free(substream); snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); cpu_latency_qos_remove_request(&substream->latency_pm_qos_req); + unlock: + mutex_unlock(&runtime->buffer_mutex); return result; } -- cgit v1.2.3 From 890f78e54b74f2e3f778bfd71d41a62cf893a9dd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 26 Mar 2022 10:42:04 -0700 Subject: Revert "swiotlb: rework "fix info leak with DMA_FROM_DEVICE"" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bddac7c1e02ba47f0570e494c9289acea3062cc1 upstream. This reverts commit aa6f8dcbab473f3a3c7454b74caa46d36cdc5d13. It turns out this breaks at least the ath9k wireless driver, and possibly others. What the ath9k driver does on packet receive is to set up the DMA transfer with: int ath_rx_init(..) .. bf->bf_buf_addr = dma_map_single(sc->dev, skb->data, common->rx_bufsize, DMA_FROM_DEVICE); and then the receive logic (through ath_rx_tasklet()) will fetch incoming packets static bool ath_edma_get_buffers(..) .. dma_sync_single_for_cpu(sc->dev, bf->bf_buf_addr, common->rx_bufsize, DMA_FROM_DEVICE); ret = ath9k_hw_process_rxdesc_edma(ah, rs, skb->data); if (ret == -EINPROGRESS) { /*let device gain the buffer again*/ dma_sync_single_for_device(sc->dev, bf->bf_buf_addr, common->rx_bufsize, DMA_FROM_DEVICE); return false; } and it's worth noting how that first DMA sync: dma_sync_single_for_cpu(..DMA_FROM_DEVICE); is there to make sure the CPU can read the DMA buffer (possibly by copying it from the bounce buffer area, or by doing some cache flush). The iommu correctly turns that into a "copy from bounce bufer" so that the driver can look at the state of the packets. In the meantime, the device may continue to write to the DMA buffer, but we at least have a snapshot of the state due to that first DMA sync. But that _second_ DMA sync: dma_sync_single_for_device(..DMA_FROM_DEVICE); is telling the DMA mapping that the CPU wasn't interested in the area because the packet wasn't there. In the case of a DMA bounce buffer, that is a no-op. Note how it's not a sync for the CPU (the "for_device()" part), and it's not a sync for data written by the CPU (the "DMA_FROM_DEVICE" part). Or rather, it _should_ be a no-op. That's what commit aa6f8dcbab47 broke: it made the code bounce the buffer unconditionally, and changed the DMA_FROM_DEVICE to just unconditionally and illogically be DMA_TO_DEVICE. [ Side note: purely within the confines of the swiotlb driver it wasn't entirely illogical: The reason it did that odd DMA_FROM_DEVICE -> DMA_TO_DEVICE conversion thing is because inside the swiotlb driver, it uses just a swiotlb_bounce() helper that doesn't care about the whole distinction of who the sync is for - only which direction to bounce. So it took the "sync for device" to mean that the CPU must have been the one writing, and thought it meant DMA_TO_DEVICE. ] Also note how the commentary in that commit was wrong, probably due to that whole confusion, claiming that the commit makes the swiotlb code "bounce unconditionally (that is, also when dir == DMA_TO_DEVICE) in order do avoid synchronising back stale data from the swiotlb buffer" which is nonsensical for two reasons: - that "also when dir == DMA_TO_DEVICE" is nonsensical, as that was exactly when it always did - and should do - the bounce. - since this is a sync for the device (not for the CPU), we're clearly fundamentally not coping back stale data from the bounce buffers at all, because we'd be copying *to* the bounce buffers. So that commit was just very confused. It confused the direction of the synchronization (to the device, not the cpu) with the direction of the DMA (from the device). Reported-and-bisected-by: Oleksandr Natalenko Reported-by: Olha Cherevyk Cc: Halil Pasic Cc: Christoph Hellwig Cc: Kalle Valo Cc: Robin Murphy Cc: Toke Høiland-Jørgensen Cc: Maxime Bizon Cc: Johannes Berg Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- Documentation/core-api/dma-attributes.rst | 8 ++++++++ include/linux/dma-mapping.h | 8 ++++++++ kernel/dma/swiotlb.c | 23 ++++++++--------------- 3 files changed, 24 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..17706dc91ec9 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,11 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_OVERWRITE +------------------ + +This is a hint to the DMA-mapping subsystem that the device is expected to +overwrite the entire mapped size, thus the caller does not require any of the +previous buffer contents to be preserved. This allows bounce-buffering +implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index dca2b1355bb1..6150d11a607e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,6 +61,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index e58dce93c661..aca0690550e2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -578,14 +578,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; - /* - * When dir == DMA_FROM_DEVICE we could omit the copy from the orig - * to the tlb buffer, if we knew for sure the device will - * overwirte the entire current content. But we don't. Thus - * unconditional bounce may prevent leaking swiotlb content (i.e. - * kernel memory) to user-space. - */ - swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -652,13 +648,10 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - /* - * Unconditional bounce is necessary to avoid corruption on - * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite - * the whole lengt of the bounce buffer. - */ - swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); - BUG_ON(!valid_dma_direction(dir)); + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); } void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, -- cgit v1.2.3 From 303cd6173dce0a28d26526c77814eb90a41bd898 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 15 Mar 2022 18:34:06 +0300 Subject: NFSD: prevent integer overflow on 32 bit systems commit 23a9dbbe0faf124fc4c139615633b9d12a3a89ef upstream. On a 32 bit system, the "len * sizeof(*p)" operation can have an integer overflow. Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- include/linux/sunrpc/xdr.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index b519609af1d0..4417f667c757 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -731,6 +731,8 @@ xdr_stream_decode_uint32_array(struct xdr_stream *xdr, if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) return -EBADMSG; + if (len > SIZE_MAX / sizeof(*p)) + return -EBADMSG; p = xdr_inline_decode(xdr, len * sizeof(*p)); if (unlikely(!p)) return -EBADMSG; -- cgit v1.2.3 From 31d7d7f35045e52c0efe4bbf5beeea02dba6803b Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Tue, 8 Feb 2022 09:52:13 +0100 Subject: mtd: rawnand: protect access to rawnand devices while in suspend commit 8cba323437a49a45756d661f500b324fc2d486fe upstream. Prevent rawnand access while in a suspended state. Commit 013e6292aaf5 ("mtd: rawnand: Simplify the locking") allows the rawnand layer to return errors rather than waiting in a blocking wait. Tested on a iMX6ULL. Fixes: 013e6292aaf5 ("mtd: rawnand: Simplify the locking") Signed-off-by: Sean Nyekjaer Reviewed-by: Boris Brezillon Cc: stable@vger.kernel.org Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20220208085213.1838273-1-sean@geanix.com Signed-off-by: Greg Kroah-Hartman --- drivers/mtd/nand/raw/nand_base.c | 44 ++++++++++++++++++---------------------- include/linux/mtd/rawnand.h | 2 ++ 2 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index d5a2110eb38e..881e768f636f 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -335,16 +335,19 @@ static int nand_isbad_bbm(struct nand_chip *chip, loff_t ofs) * * Return: -EBUSY if the chip has been suspended, 0 otherwise */ -static int nand_get_device(struct nand_chip *chip) +static void nand_get_device(struct nand_chip *chip) { - mutex_lock(&chip->lock); - if (chip->suspended) { + /* Wait until the device is resumed. */ + while (1) { + mutex_lock(&chip->lock); + if (!chip->suspended) { + mutex_lock(&chip->controller->lock); + return; + } mutex_unlock(&chip->lock); - return -EBUSY; - } - mutex_lock(&chip->controller->lock); - return 0; + wait_event(chip->resume_wq, !chip->suspended); + } } /** @@ -573,9 +576,7 @@ static int nand_block_markbad_lowlevel(struct nand_chip *chip, loff_t ofs) nand_erase_nand(chip, &einfo, 0); /* Write bad block marker to OOB */ - ret = nand_get_device(chip); - if (ret) - return ret; + nand_get_device(chip); ret = nand_markbad_bbm(chip, ofs); nand_release_device(chip); @@ -3823,9 +3824,7 @@ static int nand_read_oob(struct mtd_info *mtd, loff_t from, ops->mode != MTD_OPS_RAW) return -ENOTSUPP; - ret = nand_get_device(chip); - if (ret) - return ret; + nand_get_device(chip); if (!ops->datbuf) ret = nand_do_read_oob(chip, from, ops); @@ -4412,13 +4411,11 @@ static int nand_write_oob(struct mtd_info *mtd, loff_t to, struct mtd_oob_ops *ops) { struct nand_chip *chip = mtd_to_nand(mtd); - int ret; + int ret = 0; ops->retlen = 0; - ret = nand_get_device(chip); - if (ret) - return ret; + nand_get_device(chip); switch (ops->mode) { case MTD_OPS_PLACE_OOB: @@ -4478,9 +4475,7 @@ int nand_erase_nand(struct nand_chip *chip, struct erase_info *instr, return -EIO; /* Grab the lock and see if the device is available */ - ret = nand_get_device(chip); - if (ret) - return ret; + nand_get_device(chip); /* Shift to get first page */ page = (int)(instr->addr >> chip->page_shift); @@ -4567,7 +4562,7 @@ static void nand_sync(struct mtd_info *mtd) pr_debug("%s: called\n", __func__); /* Grab the lock and see if the device is available */ - WARN_ON(nand_get_device(chip)); + nand_get_device(chip); /* Release it and go back */ nand_release_device(chip); } @@ -4584,9 +4579,7 @@ static int nand_block_isbad(struct mtd_info *mtd, loff_t offs) int ret; /* Select the NAND device */ - ret = nand_get_device(chip); - if (ret) - return ret; + nand_get_device(chip); nand_select_target(chip, chipnr); @@ -4657,6 +4650,8 @@ static void nand_resume(struct mtd_info *mtd) __func__); } mutex_unlock(&chip->lock); + + wake_up_all(&chip->resume_wq); } /** @@ -5434,6 +5429,7 @@ static int nand_scan_ident(struct nand_chip *chip, unsigned int maxchips, chip->cur_cs = -1; mutex_init(&chip->lock); + init_waitqueue_head(&chip->resume_wq); /* Enforce the right timings for reset/detection */ chip->current_interface_config = nand_get_reset_interface_config(); diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index 5b88cd51fadb..dcf90144d70b 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1240,6 +1240,7 @@ struct nand_secure_region { * @lock: Lock protecting the suspended field. Also used to serialize accesses * to the NAND device * @suspended: Set to 1 when the device is suspended, 0 when it's not + * @resume_wq: wait queue to sleep if rawnand is in suspended state. * @cur_cs: Currently selected target. -1 means no target selected, otherwise we * should always have cur_cs >= 0 && cur_cs < nanddev_ntargets(). * NAND Controller drivers should not modify this value, but they're @@ -1294,6 +1295,7 @@ struct nand_chip { /* Internals */ struct mutex lock; unsigned int suspended : 1; + wait_queue_head_t resume_wq; int cur_cs; int read_retries; struct nand_secure_region *secure_regions; -- cgit v1.2.3 From 6ade94e6afc6b7f6b0829c521cc3caaa7fbb82ab Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 28 Feb 2022 13:36:51 +0200 Subject: scsi: core: sd: Add silence_suspend flag to suppress some PM messages commit af4edb1d50c6d1044cb34bc43621411b7ba2cffe upstream. Kernel messages produced during runtime PM can cause a never-ending cycle because user space utilities (e.g. journald or rsyslog) write the messages back to storage, causing runtime resume, more messages, and so on. Messages that tell of things that are expected to happen are arguably unnecessary, so add a flag to suppress them. This flag is used by the UFS driver. Link: https://lore.kernel.org/r/20220228113652.970857-2-adrian.hunter@intel.com Cc: stable@vger.kernel.org Signed-off-by: Adrian Hunter Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_error.c | 9 +++++++-- drivers/scsi/sd.c | 6 ++++-- include/scsi/scsi_device.h | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 408d49c304b8..bb5a6e0fa49a 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -485,8 +485,13 @@ static void scsi_report_sense(struct scsi_device *sdev, if (sshdr->asc == 0x29) { evt_type = SDEV_EVT_POWER_ON_RESET_OCCURRED; - sdev_printk(KERN_WARNING, sdev, - "Power-on or device reset occurred\n"); + /* + * Do not print message if it is an expected side-effect + * of runtime PM. + */ + if (!sdev->silence_suspend) + sdev_printk(KERN_WARNING, sdev, + "Power-on or device reset occurred\n"); } if (sshdr->asc == 0x2a && sshdr->ascq == 0x01) { diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 564a21b5da9d..a713babaee0f 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3628,7 +3628,8 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) return 0; if (sdkp->WCE && sdkp->media_present) { - sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); + if (!sdkp->device->silence_suspend) + sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); ret = sd_sync_cache(sdkp, &sshdr); if (ret) { @@ -3650,7 +3651,8 @@ static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) } if (sdkp->device->manage_start_stop) { - sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); + if (!sdkp->device->silence_suspend) + sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); /* an error is not worth aborting a system sleep */ ret = sd_start_stop_device(sdkp, 0); if (ignore_stop_errors) diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index b97e142a7ca9..3b3dbc37653d 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -206,6 +206,7 @@ struct scsi_device { unsigned rpm_autosuspend:1; /* Enable runtime autosuspend at device * creation time */ unsigned ignore_media_change:1; /* Ignore MEDIA CHANGE on resume */ + unsigned silence_suspend:1; /* Do not print runtime PM related messages */ bool offline_already; /* Device offline message logged */ -- cgit v1.2.3 From 7777744e92a0b30e3e0cce2758d911837011ebd9 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 30 Mar 2022 14:09:03 +0200 Subject: ALSA: pcm: Fix potential AB/BA lock with buffer_mutex and mmap_lock commit bc55cfd5718c7c23e5524582e9fa70b4d10f2433 upstream. syzbot caught a potential deadlock between the PCM runtime->buffer_mutex and the mm->mmap_lock. It was brought by the recent fix to cover the racy read/write and other ioctls, and in that commit, I overlooked a (hopefully only) corner case that may take the revert lock, namely, the OSS mmap. The OSS mmap operation exceptionally allows to re-configure the parameters inside the OSS mmap syscall, where mm->mmap_mutex is already held. Meanwhile, the copy_from/to_user calls at read/write operations also take the mm->mmap_lock internally, hence it may lead to a AB/BA deadlock. A similar problem was already seen in the past and we fixed it with a refcount (in commit b248371628aa). The former fix covered only the call paths with OSS read/write and OSS ioctls, while we need to cover the concurrent access via both ALSA and OSS APIs now. This patch addresses the problem above by replacing the buffer_mutex lock in the read/write operations with a refcount similar as we've used for OSS. The new field, runtime->buffer_accessing, keeps the number of concurrent read/write operations. Unlike the former buffer_mutex protection, this protects only around the copy_from/to_user() calls; the other codes are basically protected by the PCM stream lock. The refcount can be a negative, meaning blocked by the ioctls. If a negative value is seen, the read/write aborts with -EBUSY. In the ioctl side, OTOH, they check this refcount, too, and set to a negative value for blocking unless it's already being accessed. Reported-by: syzbot+6e5c88838328e99c7e1c@syzkaller.appspotmail.com Fixes: dca947d4d26d ("ALSA: pcm: Fix races among concurrent read/write and buffer changes") Cc: Link: https://lore.kernel.org/r/000000000000381a0d05db622a81@google.com Link: https://lore.kernel.org/r/20220330120903.4738-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/sound/pcm.h | 1 + sound/core/pcm.c | 1 + sound/core/pcm_lib.c | 9 +++++---- sound/core/pcm_native.c | 39 ++++++++++++++++++++++++++++++++------- 4 files changed, 39 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 2018b7512d3d..e08bf475d02d 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -399,6 +399,7 @@ struct snd_pcm_runtime { struct fasync_struct *fasync; bool stop_operating; /* sync_stop will be called */ struct mutex buffer_mutex; /* protect for buffer changes */ + atomic_t buffer_accessing; /* >0: in r/w operation, <0: blocked */ /* -- private section -- */ void *private_data; diff --git a/sound/core/pcm.c b/sound/core/pcm.c index edd9849210f2..977d54320a5c 100644 --- a/sound/core/pcm.c +++ b/sound/core/pcm.c @@ -970,6 +970,7 @@ int snd_pcm_attach_substream(struct snd_pcm *pcm, int stream, runtime->status->state = SNDRV_PCM_STATE_OPEN; mutex_init(&runtime->buffer_mutex); + atomic_set(&runtime->buffer_accessing, 0); substream->runtime = runtime; substream->private_data = pcm->private_data; diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index d25b1b005429..491064f55515 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -1905,11 +1905,9 @@ static int wait_for_avail(struct snd_pcm_substream *substream, if (avail >= runtime->twake) break; snd_pcm_stream_unlock_irq(substream); - mutex_unlock(&runtime->buffer_mutex); tout = schedule_timeout(wait_time); - mutex_lock(&runtime->buffer_mutex); snd_pcm_stream_lock_irq(substream); set_current_state(TASK_INTERRUPTIBLE); switch (runtime->status->state) { @@ -2203,7 +2201,6 @@ snd_pcm_sframes_t __snd_pcm_lib_xfer(struct snd_pcm_substream *substream, nonblock = !!(substream->f_flags & O_NONBLOCK); - mutex_lock(&runtime->buffer_mutex); snd_pcm_stream_lock_irq(substream); err = pcm_accessible_state(runtime); if (err < 0) @@ -2258,10 +2255,15 @@ snd_pcm_sframes_t __snd_pcm_lib_xfer(struct snd_pcm_substream *substream, err = -EINVAL; goto _end_unlock; } + if (!atomic_inc_unless_negative(&runtime->buffer_accessing)) { + err = -EBUSY; + goto _end_unlock; + } snd_pcm_stream_unlock_irq(substream); err = writer(substream, appl_ofs, data, offset, frames, transfer); snd_pcm_stream_lock_irq(substream); + atomic_dec(&runtime->buffer_accessing); if (err < 0) goto _end_unlock; err = pcm_accessible_state(runtime); @@ -2291,7 +2293,6 @@ snd_pcm_sframes_t __snd_pcm_lib_xfer(struct snd_pcm_substream *substream, if (xfer > 0 && err >= 0) snd_pcm_update_state(substream, runtime); snd_pcm_stream_unlock_irq(substream); - mutex_unlock(&runtime->buffer_mutex); return xfer > 0 ? (snd_pcm_sframes_t)xfer : err; } EXPORT_SYMBOL(__snd_pcm_lib_xfer); diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 5cac630a141c..f38c2e5e9a29 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -672,6 +672,24 @@ static int snd_pcm_hw_params_choose(struct snd_pcm_substream *pcm, return 0; } +/* acquire buffer_mutex; if it's in r/w operation, return -EBUSY, otherwise + * block the further r/w operations + */ +static int snd_pcm_buffer_access_lock(struct snd_pcm_runtime *runtime) +{ + if (!atomic_dec_unless_positive(&runtime->buffer_accessing)) + return -EBUSY; + mutex_lock(&runtime->buffer_mutex); + return 0; /* keep buffer_mutex, unlocked by below */ +} + +/* release buffer_mutex and clear r/w access flag */ +static void snd_pcm_buffer_access_unlock(struct snd_pcm_runtime *runtime) +{ + mutex_unlock(&runtime->buffer_mutex); + atomic_inc(&runtime->buffer_accessing); +} + #if IS_ENABLED(CONFIG_SND_PCM_OSS) #define is_oss_stream(substream) ((substream)->oss.oss) #else @@ -682,14 +700,16 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_runtime *runtime; - int err = 0, usecs; + int err, usecs; unsigned int bits; snd_pcm_uframes_t frames; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; - mutex_lock(&runtime->buffer_mutex); + err = snd_pcm_buffer_access_lock(runtime); + if (err < 0) + return err; snd_pcm_stream_lock_irq(substream); switch (runtime->status->state) { case SNDRV_PCM_STATE_OPEN: @@ -807,7 +827,7 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, snd_pcm_lib_free_pages(substream); } unlock: - mutex_unlock(&runtime->buffer_mutex); + snd_pcm_buffer_access_unlock(runtime); return err; } @@ -852,7 +872,9 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream) if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; - mutex_lock(&runtime->buffer_mutex); + result = snd_pcm_buffer_access_lock(runtime); + if (result < 0) + return result; snd_pcm_stream_lock_irq(substream); switch (runtime->status->state) { case SNDRV_PCM_STATE_SETUP: @@ -871,7 +893,7 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream) snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); cpu_latency_qos_remove_request(&substream->latency_pm_qos_req); unlock: - mutex_unlock(&runtime->buffer_mutex); + snd_pcm_buffer_access_unlock(runtime); return result; } @@ -1356,12 +1378,15 @@ static int snd_pcm_action_nonatomic(const struct action_ops *ops, /* Guarantee the group members won't change during non-atomic action */ down_read(&snd_pcm_link_rwsem); - mutex_lock(&substream->runtime->buffer_mutex); + res = snd_pcm_buffer_access_lock(substream->runtime); + if (res < 0) + goto unlock; if (snd_pcm_stream_linked(substream)) res = snd_pcm_action_group(ops, substream, state, false); else res = snd_pcm_action_single(ops, substream, state); - mutex_unlock(&substream->runtime->buffer_mutex); + snd_pcm_buffer_access_unlock(substream->runtime); + unlock: up_read(&snd_pcm_link_rwsem); return res; } -- cgit v1.2.3 From 73fa1798233ca1bc94831b5f171c94caecc3d9f3 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sat, 12 Mar 2022 11:09:47 +0530 Subject: ext4: fix ext4_fc_stats trace point commit 7af1974af0a9ba8a8ed2e3e947d87dd4d9a78d27 upstream. ftrace's __print_symbolic() requires that any enum values used in the symbol to string translation table be wrapped in a TRACE_DEFINE_ENUM so that the enum value can be decoded from the ftrace ring buffer by user space tooling. This patch also fixes few other problems found in this trace point. e.g. dereferencing structures in TP_printk which should not be done at any cost. Also to avoid checkpatch warnings, this patch removes those whitespaces/tab stops issues. Cc: stable@kernel.org Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Reported-by: Steven Rostedt Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Reviewed-by: Steven Rostedt (Google) Reviewed-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/b4b9691414c35c62e570b723e661c80674169f9a.1647057583.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- include/trace/events/ext4.h | 78 ++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0ea36b2b0662..61a64d1b2bb6 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -95,6 +95,17 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B); { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) +TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA); +TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); + #define show_fc_reason(reason) \ __print_symbolic(reason, \ { EXT4_FC_REASON_XATTR, "XATTR"}, \ @@ -2723,41 +2734,50 @@ TRACE_EVENT(ext4_fc_commit_stop, #define FC_REASON_NAME_STAT(reason) \ show_fc_reason(reason), \ - __entry->sbi->s_fc_stats.fc_ineligible_reason_count[reason] + __entry->fc_ineligible_rc[reason] TRACE_EVENT(ext4_fc_stats, - TP_PROTO(struct super_block *sb), - - TP_ARGS(sb), + TP_PROTO(struct super_block *sb), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(struct ext4_sb_info *, sbi) - __field(int, count) - ), + TP_ARGS(sb), - TP_fast_assign( - __entry->dev = sb->s_dev; - __entry->sbi = EXT4_SB(sb); - ), + TP_STRUCT__entry( + __field(dev_t, dev) + __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX) + __field(unsigned long, fc_commits) + __field(unsigned long, fc_ineligible_commits) + __field(unsigned long, fc_numblks) + ), - TP_printk("dev %d:%d fc ineligible reasons:\n" - "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; " - "num_commits:%ld, ineligible: %ld, numblks: %ld", - MAJOR(__entry->dev), MINOR(__entry->dev), - FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), - FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), - FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), - FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), - FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), - FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), - FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), - FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), - FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), - __entry->sbi->s_fc_stats.fc_num_commits, - __entry->sbi->s_fc_stats.fc_ineligible_commits, - __entry->sbi->s_fc_stats.fc_numblks) + TP_fast_assign( + int i; + __entry->dev = sb->s_dev; + for (i = 0; i < EXT4_FC_REASON_MAX; i++) { + __entry->fc_ineligible_rc[i] = + EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i]; + } + __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits; + __entry->fc_ineligible_commits = + EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; + __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks; + ), + + TP_printk("dev %d,%d fc ineligible reasons:\n" + "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u " + "num_commits:%lu, ineligible: %lu, numblks: %lu", + MAJOR(__entry->dev), MINOR(__entry->dev), + FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), + FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), + FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), + FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), + FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), + FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), + FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), + FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), + FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), + __entry->fc_commits, __entry->fc_ineligible_commits, + __entry->fc_numblks) ); #define DEFINE_TRACE_DENTRY_EVENT(__type) \ -- cgit v1.2.3 From 2e76c69c85f9927ceddac0989238cadffc307f1b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 14 Mar 2022 14:30:11 -1000 Subject: block: don't merge across cgroup boundaries if blkcg is enabled commit 6b2b04590b51aa4cf395fcd185ce439cab5961dc upstream. blk-iocost and iolatency are cgroup aware rq-qos policies but they didn't disable merges across different cgroups. This obviously can lead to accounting and control errors but more importantly to priority inversions - e.g. an IO which belongs to a higher priority cgroup or IO class may end up getting throttled incorrectly because it gets merged to an IO issued from a low priority cgroup. Fix it by adding blk_cgroup_mergeable() which is called from merge paths and rejects cross-cgroup and cross-issue_as_root merges. Signed-off-by: Tejun Heo Fixes: d70675121546 ("block: introduce blk-iolatency io controller") Cc: stable@vger.kernel.org # v4.19+ Cc: Josef Bacik Link: https://lore.kernel.org/r/Yi/eE/6zFNyWJ+qd@slm.duckdns.org Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-merge.c | 11 +++++++++++ include/linux/blk-cgroup.h | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/block/blk-merge.c b/block/blk-merge.c index 7a5c81c02c80..bbe66a9010bf 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -561,6 +562,9 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq) static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { + if (!blk_cgroup_mergeable(req, bio)) + goto no_merge; + if (blk_integrity_merge_bio(req->q, req, bio) == false) goto no_merge; @@ -657,6 +661,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (total_phys_segments > blk_rq_get_max_segments(req)) return 0; + if (!blk_cgroup_mergeable(req, next->bio)) + return 0; + if (blk_integrity_merge_rq(q, req, next) == false) return 0; @@ -863,6 +870,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->rq_disk != bio->bi_bdev->bd_disk) return false; + /* don't merge across cgroup boundaries */ + if (!blk_cgroup_mergeable(rq, bio)) + return false; + /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b4de2010fba5..bc5c04d711bb 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -24,6 +24,7 @@ #include #include #include +#include /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) @@ -604,6 +605,21 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } +/** + * blk_cgroup_mergeable - Determine whether to allow or disallow merges + * @rq: request to merge into + * @bio: bio to merge + * + * @bio and @rq should belong to the same cgroup and their issue_as_root should + * match. The latter is necessary as we don't want to throttle e.g. a metadata + * update because it happens to be next to a regular IO. + */ +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) +{ + return rq->bio->bi_blkg == bio->bi_blkg && + bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); +} + void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); @@ -659,6 +675,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) { } static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline void blk_cgroup_bio_start(struct bio *bio) { } +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) -- cgit v1.2.3 From c894ac44786cfed383a6c6b20c1bfb12eb96018a Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 25 Jan 2022 10:12:18 +0100 Subject: fbdev: Hot-unplug firmware fb devices on forced removal commit 27599aacbaefcbf2af7b06b0029459bbf682000d upstream. Hot-unplug all firmware-framebuffer devices as part of removing them via remove_conflicting_framebuffers() et al. Releases all memory regions to be acquired by native drivers. Firmware, such as EFI, install a framebuffer while posting the computer. After removing the firmware-framebuffer device from fbdev, a native driver takes over the hardware and the firmware framebuffer becomes invalid. Firmware-framebuffer drivers, specifically simplefb, don't release their device from Linux' device hierarchy. It still owns the firmware framebuffer and blocks the native drivers from loading. This has been observed in the vmwgfx driver. [1] Initiating a device removal (i.e., hot unplug) as part of remove_conflicting_framebuffers() removes the underlying device and returns the memory range to the system. [1] https://lore.kernel.org/dri-devel/20220117180359.18114-1-zack@kde.org/ v2: * rename variable 'dev' to 'device' (Javier) Signed-off-by: Thomas Zimmermann Reported-by: Zack Rusin Reviewed-by: Javier Martinez Canillas Reviewed-by: Zack Rusin Reviewed-by: Hans de Goede CC: stable@vger.kernel.org # v5.11+ Link: https://patchwork.freedesktop.org/patch/msgid/20220125091222.21457-2-tzimmermann@suse.de Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/core/fbmem.c | 29 ++++++++++++++++++++++++++--- include/linux/fb.h | 1 + 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 7bd5e2a4a9da..91145d93990a 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -1557,18 +1558,36 @@ static void do_remove_conflicting_framebuffers(struct apertures_struct *a, /* check all firmware fbs and kick off if the base addr overlaps */ for_each_registered_fb(i) { struct apertures_struct *gen_aper; + struct device *device; if (!(registered_fb[i]->flags & FBINFO_MISC_FIRMWARE)) continue; gen_aper = registered_fb[i]->apertures; + device = registered_fb[i]->device; if (fb_do_apertures_overlap(gen_aper, a) || (primary && gen_aper && gen_aper->count && gen_aper->ranges[0].base == VGA_FB_PHYS)) { printk(KERN_INFO "fb%d: switching to %s from %s\n", i, name, registered_fb[i]->fix.id); - do_unregister_framebuffer(registered_fb[i]); + + /* + * If we kick-out a firmware driver, we also want to remove + * the underlying platform device, such as simple-framebuffer, + * VESA, EFI, etc. A native driver will then be able to + * allocate the memory range. + * + * If it's not a platform device, at least print a warning. A + * fix would add code to remove the device from the system. + */ + if (dev_is_platform(device)) { + registered_fb[i]->forced_out = true; + platform_device_unregister(to_platform_device(device)); + } else { + pr_warn("fb%d: cannot remove device\n", i); + do_unregister_framebuffer(registered_fb[i]); + } } } } @@ -1895,9 +1914,13 @@ EXPORT_SYMBOL(register_framebuffer); void unregister_framebuffer(struct fb_info *fb_info) { - mutex_lock(®istration_lock); + bool forced_out = fb_info->forced_out; + + if (!forced_out) + mutex_lock(®istration_lock); do_unregister_framebuffer(fb_info); - mutex_unlock(®istration_lock); + if (!forced_out) + mutex_unlock(®istration_lock); } EXPORT_SYMBOL(unregister_framebuffer); diff --git a/include/linux/fb.h b/include/linux/fb.h index 02f362c661c8..3d7306c9a706 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -502,6 +502,7 @@ struct fb_info { } *apertures; bool skip_vt_switch; /* no VT switch on suspend/resume required */ + bool forced_out; /* set when being removed by another driver */ }; static inline struct apertures_struct *alloc_apertures(unsigned int max_num) { -- cgit v1.2.3 From cdcaec46a6b22a7c2a44366115d424b38a5cee55 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 16 Mar 2022 21:27:51 +0100 Subject: rfkill: make new event layout opt-in commit 54f586a9153201c6cff55e1f561990c78bd99aa7 upstream. Again new complaints surfaced that we had broken the ABI here, although previously all the userspace tools had agreed that it was their mistake and fixed it. Yet now there are cases (e.g. RHEL) that want to run old userspace with newer kernels, and thus are broken. Since this is a bit of a whack-a-mole thing, change the whole extensibility scheme of rfkill to no longer just rely on the message lengths, but instead require userspace to opt in via a new ioctl to a given maximum event size that it is willing to understand. By default, set that to RFKILL_EVENT_SIZE_V1 (8), so that the behaviour for userspace not calling the ioctl will look as if it's just running on an older kernel. Fixes: 14486c82612a ("rfkill: add a reason to the HW rfkill state") Cc: stable@vger.kernel.org # 5.11+ Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220316212749.16491491b270.Ifcb1950998330a596f29a2a162e00b7546a1d6d0@changeid Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/rfkill.h | 14 +++++++++++-- net/rfkill/core.c | 48 ++++++++++++++++++++++++++++++++------------- 2 files changed, 46 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h index 9b77cfc42efa..283c5a7b3f2c 100644 --- a/include/uapi/linux/rfkill.h +++ b/include/uapi/linux/rfkill.h @@ -159,8 +159,16 @@ struct rfkill_event_ext { * old behaviour for all userspace, unless it explicitly opts in to the * rules outlined here by using the new &struct rfkill_event_ext. * - * Userspace using &struct rfkill_event_ext must adhere to the following - * rules + * Additionally, some other userspace (bluez, g-s-d) was reading with a + * large size but as streaming reads rather than message-based, or with + * too strict checks for the returned size. So eventually, we completely + * reverted this, and extended messages need to be opted in to by using + * an ioctl: + * + * ioctl(fd, RFKILL_IOCTL_MAX_SIZE, sizeof(struct rfkill_event_ext)); + * + * Userspace using &struct rfkill_event_ext and the ioctl must adhere to + * the following rules: * * 1. accept short writes, optionally using them to detect that it's * running on an older kernel; @@ -175,6 +183,8 @@ struct rfkill_event_ext { #define RFKILL_IOC_MAGIC 'R' #define RFKILL_IOC_NOINPUT 1 #define RFKILL_IOCTL_NOINPUT _IO(RFKILL_IOC_MAGIC, RFKILL_IOC_NOINPUT) +#define RFKILL_IOC_MAX_SIZE 2 +#define RFKILL_IOCTL_MAX_SIZE _IOW(RFKILL_IOC_MAGIC, RFKILL_IOC_EXT_SIZE, __u32) /* and that's all userspace gets */ diff --git a/net/rfkill/core.c b/net/rfkill/core.c index ac15a944573f..068c7bcd30c9 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -78,6 +78,7 @@ struct rfkill_data { struct mutex mtx; wait_queue_head_t read_wait; bool input_handler; + u8 max_size; }; @@ -1141,6 +1142,8 @@ static int rfkill_fop_open(struct inode *inode, struct file *file) if (!data) return -ENOMEM; + data->max_size = RFKILL_EVENT_SIZE_V1; + INIT_LIST_HEAD(&data->events); mutex_init(&data->mtx); init_waitqueue_head(&data->read_wait); @@ -1223,6 +1226,7 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf, list); sz = min_t(unsigned long, sizeof(ev->ev), count); + sz = min_t(unsigned long, sz, data->max_size); ret = sz; if (copy_to_user(buf, &ev->ev, sz)) ret = -EFAULT; @@ -1237,6 +1241,7 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf, static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { + struct rfkill_data *data = file->private_data; struct rfkill *rfkill; struct rfkill_event_ext ev; int ret; @@ -1251,6 +1256,7 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, * our API version even in a write() call, if it cares. */ count = min(count, sizeof(ev)); + count = min_t(size_t, count, data->max_size); if (copy_from_user(&ev, buf, count)) return -EFAULT; @@ -1310,31 +1316,47 @@ static int rfkill_fop_release(struct inode *inode, struct file *file) return 0; } -#ifdef CONFIG_RFKILL_INPUT static long rfkill_fop_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct rfkill_data *data = file->private_data; + int ret = -ENOSYS; + u32 size; if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC) return -ENOSYS; - if (_IOC_NR(cmd) != RFKILL_IOC_NOINPUT) - return -ENOSYS; - mutex_lock(&data->mtx); - - if (!data->input_handler) { - if (atomic_inc_return(&rfkill_input_disabled) == 1) - printk(KERN_DEBUG "rfkill: input handler disabled\n"); - data->input_handler = true; + switch (_IOC_NR(cmd)) { +#ifdef CONFIG_RFKILL_INPUT + case RFKILL_IOC_NOINPUT: + if (!data->input_handler) { + if (atomic_inc_return(&rfkill_input_disabled) == 1) + printk(KERN_DEBUG "rfkill: input handler disabled\n"); + data->input_handler = true; + } + ret = 0; + break; +#endif + case RFKILL_IOC_MAX_SIZE: + if (get_user(size, (__u32 __user *)arg)) { + ret = -EFAULT; + break; + } + if (size < RFKILL_EVENT_SIZE_V1 || size > U8_MAX) { + ret = -EINVAL; + break; + } + data->max_size = size; + ret = 0; + break; + default: + break; } - mutex_unlock(&data->mtx); - return 0; + return ret; } -#endif static const struct file_operations rfkill_fops = { .owner = THIS_MODULE, @@ -1343,10 +1365,8 @@ static const struct file_operations rfkill_fops = { .write = rfkill_fop_write, .poll = rfkill_fop_poll, .release = rfkill_fop_release, -#ifdef CONFIG_RFKILL_INPUT .unlocked_ioctl = rfkill_fop_ioctl, .compat_ioctl = compat_ptr_ioctl, -#endif .llseek = no_llseek, }; -- cgit v1.2.3 From bc5f440e1c5c86a09b4045049552fd31b5aea293 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 14 Mar 2022 19:59:53 +0100 Subject: pstore: Don't use semaphores in always-atomic-context code commit 8126b1c73108bc691f5643df19071a59a69d0bc6 upstream. pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior Signed-off-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/efi/efi-pstore.c | 2 +- fs/pstore/platform.c | 38 ++++++++++++++++++-------------------- include/linux/pstore.h | 6 +++--- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 0ef086e43090..7e771c56c13c 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -266,7 +266,7 @@ static int efi_pstore_write(struct pstore_record *record) efi_name[i] = name[i]; ret = efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES, - preemptible(), record->size, record->psi->buf); + false, record->size, record->psi->buf); if (record->reason == KMSG_DUMP_OOPS && try_module_get(THIS_MODULE)) if (!schedule_work(&efivar_work)) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b9614db48b1d..ad96ba97d8f9 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -143,21 +143,22 @@ static void pstore_timer_kick(void) mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); } -/* - * Should pstore_dump() wait for a concurrent pstore_dump()? If - * not, the current pstore_dump() will report a failure to dump - * and return. - */ -static bool pstore_cannot_wait(enum kmsg_dump_reason reason) +static bool pstore_cannot_block_path(enum kmsg_dump_reason reason) { - /* In NMI path, pstore shouldn't block regardless of reason. */ + /* + * In case of NMI path, pstore shouldn't be blocked + * regardless of reason. + */ if (in_nmi()) return true; switch (reason) { /* In panic case, other cpus are stopped by smp_send_stop(). */ case KMSG_DUMP_PANIC: - /* Emergency restart shouldn't be blocked. */ + /* + * Emergency restart shouldn't be blocked by spinning on + * pstore_info::buf_lock. + */ case KMSG_DUMP_EMERG: return true; default: @@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long total = 0; const char *why; unsigned int part = 1; + unsigned long flags = 0; int ret; why = kmsg_dump_reason_str(reason); - if (down_trylock(&psinfo->buf_lock)) { - /* Failed to acquire lock: give up if we cannot wait. */ - if (pstore_cannot_wait(reason)) { - pr_err("dump skipped in %s path: may corrupt error record\n", - in_nmi() ? "NMI" : why); - return; - } - if (down_interruptible(&psinfo->buf_lock)) { - pr_err("could not grab semaphore?!\n"); + if (pstore_cannot_block_path(reason)) { + if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) { + pr_err("dump skipped in %s path because of concurrent dump\n", + in_nmi() ? "NMI" : why); return; } + } else { + spin_lock_irqsave(&psinfo->buf_lock, flags); } kmsg_dump_rewind(&iter); @@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += record.size; part++; } - - up(&psinfo->buf_lock); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } static struct kmsg_dumper pstore_dumper = { @@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi) psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); - sema_init(&psinfo->buf_lock, 1); + spin_lock_init(&psinfo->buf_lock); if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index eb93a54cff31..e97a8188f0fd 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -87,7 +87,7 @@ struct pstore_record { * @owner: module which is responsible for this backend driver * @name: name of the backend driver * - * @buf_lock: semaphore to serialize access to @buf + * @buf_lock: spinlock to serialize access to @buf * @buf: preallocated crash dump buffer * @bufsize: size of @buf available for crash dump bytes (must match * smallest number of bytes available for writing to a @@ -178,7 +178,7 @@ struct pstore_info { struct module *owner; const char *name; - struct semaphore buf_lock; + spinlock_t buf_lock; char *buf; size_t bufsize; -- cgit v1.2.3 From 8550c9b846c5c9c10e053d2f8f1a04f5a8ea8a75 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 10:05:21 +0100 Subject: stack: Constrain and fix stack offset randomization with Clang builds [ Upstream commit efa90c11f62e6b7252fb75efe2787056872a627c ] All supported versions of Clang perform auto-init of __builtin_alloca() when stack auto-init is on (CONFIG_INIT_STACK_ALL_{ZERO,PATTERN}). add_random_kstack_offset() uses __builtin_alloca() to add a stack offset. This means, when CONFIG_INIT_STACK_ALL_{ZERO,PATTERN} is enabled, add_random_kstack_offset() will auto-init that unused portion of the stack used to add an offset. There are several problems with this: 1. These offsets can be as large as 1023 bytes. Performing memset() on them isn't exactly cheap, and this is done on every syscall entry. 2. Architectures adding add_random_kstack_offset() to syscall entry implemented in C require them to be 'noinstr' (e.g. see x86 and s390). The potential problem here is that a call to memset may occur, which is not noinstr. A x86_64 defconfig kernel with Clang 11 and CONFIG_VMLINUX_VALIDATION shows: | vmlinux.o: warning: objtool: do_syscall_64()+0x9d: call to memset() leaves .noinstr.text section | vmlinux.o: warning: objtool: do_int80_syscall_32()+0xab: call to memset() leaves .noinstr.text section | vmlinux.o: warning: objtool: __do_fast_syscall_32()+0xe2: call to memset() leaves .noinstr.text section | vmlinux.o: warning: objtool: fixup_bad_iret()+0x2f: call to memset() leaves .noinstr.text section Clang 14 (unreleased) will introduce a way to skip alloca initialization via __builtin_alloca_uninitialized() (https://reviews.llvm.org/D115440). Constrain RANDOMIZE_KSTACK_OFFSET to only be enabled if no stack auto-init is enabled, the compiler is GCC, or Clang is version 14+. Use __builtin_alloca_uninitialized() if the compiler provides it, as is done by Clang 14. Link: https://lkml.kernel.org/r/YbHTKUjEejZCLyhX@elver.google.com Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Signed-off-by: Marco Elver Reviewed-by: Nathan Chancellor Acked-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220131090521.1947110-2-elver@google.com Signed-off-by: Sasha Levin --- arch/Kconfig | 1 + include/linux/randomize_kstack.h | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index d1e69d6e8498..191589f26b1a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1141,6 +1141,7 @@ config HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET config RANDOMIZE_KSTACK_OFFSET_DEFAULT bool "Randomize kernel stack offset on syscall entry" depends on HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET + depends on INIT_STACK_NONE || !CC_IS_CLANG || CLANG_VERSION >= 140000 help The kernel stack offset can be randomized (after pt_regs) by roughly 5 bits of entropy, frustrating memory corruption diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index bebc911161b6..d373f1bcbf7c 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -16,8 +16,20 @@ DECLARE_PER_CPU(u32, kstack_offset); * alignment. Also, since this use is being explicitly masked to a max of * 10 bits, stack-clash style attacks are unlikely. For more details see * "VLAs" in Documentation/process/deprecated.rst + * + * The normal __builtin_alloca() is initialized with INIT_STACK_ALL (currently + * only with Clang and not GCC). Initializing the unused area on each syscall + * entry is expensive, and generating an implicit call to memset() may also be + * problematic (such as in noinstr functions). Therefore, if the compiler + * supports it (which it should if it initializes allocas), always use the + * "uninitialized" variant of the builtin. */ -void *__builtin_alloca(size_t size); +#if __has_builtin(__builtin_alloca_uninitialized) +#define __kstack_alloca __builtin_alloca_uninitialized +#else +#define __kstack_alloca __builtin_alloca +#endif + /* * Use, at most, 10 bits of entropy. We explicitly cap this to keep the * "VLA" from being unbounded (see above). 10 bits leaves enough room for @@ -36,7 +48,7 @@ void *__builtin_alloca(size_t size); if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ u32 offset = raw_cpu_read(kstack_offset); \ - u8 *ptr = __builtin_alloca(KSTACK_OFFSET_MAX(offset)); \ + u8 *ptr = __kstack_alloca(KSTACK_OFFSET_MAX(offset)); \ /* Keep allocation even after "ptr" loses scope. */ \ asm volatile("" :: "r"(ptr) : "memory"); \ } \ -- cgit v1.2.3 From 3bb11f3f6872a692759f653f90d10674deb330a4 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 27 Jan 2022 10:27:20 -0500 Subject: rseq: Remove broken uapi field layout on 32-bit little endian [ Upstream commit bfdf4e6208051ed7165b2e92035b4bf11f43eb63 ] The rseq rseq_cs.ptr.{ptr32,padding} uapi endianness handling is entirely wrong on 32-bit little endian: a preprocessor logic mistake wrongly uses the big endian field layout on 32-bit little endian architectures. Fortunately, those ptr32 accessors were never used within the kernel, and only meant as a convenience for user-space. Remove those and replace the whole rseq_cs union by a __u64 type, as this is the only thing really needed to express the ABI. Document how 32-bit architectures are meant to interact with this field. Fixes: ec9c82e03a74 ("rseq: uapi: Declare rseq_cs field as union, update includes") Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220127152720.25898-1-mathieu.desnoyers@efficios.com Signed-off-by: Sasha Levin --- include/uapi/linux/rseq.h | 20 ++++---------------- kernel/rseq.c | 8 ++++---- 2 files changed, 8 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 9a402fdb60e9..77ee207623a9 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -105,23 +105,11 @@ struct rseq { * Read and set by the kernel. Set by user-space with single-copy * atomicity semantics. This field should only be updated by the * thread which registered this data structure. Aligned on 64-bit. + * + * 32-bit architectures should update the low order bits of the + * rseq_cs field, leaving the high order bits initialized to 0. */ - union { - __u64 ptr64; -#ifdef __LP64__ - __u64 ptr; -#else - struct { -#if (defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN)) || defined(__BIG_ENDIAN) - __u32 padding; /* Initialized to zero. */ - __u32 ptr32; -#else /* LITTLE */ - __u32 ptr32; - __u32 padding; /* Initialized to zero. */ -#endif /* ENDIAN */ - } ptr; -#endif - } rseq_cs; + __u64 rseq_cs; /* * Restartable sequences flags field. diff --git a/kernel/rseq.c b/kernel/rseq.c index 6d45ac3dae7f..97ac20b4f738 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -128,10 +128,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) int ret; #ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs.ptr64)) + if (get_user(ptr, &t->rseq->rseq_cs)) return -EFAULT; #else - if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr))) + if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) return -EFAULT; #endif if (!ptr) { @@ -217,9 +217,9 @@ static int clear_rseq_cs(struct task_struct *t) * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs.ptr64); + return put_user(0UL, &t->rseq->rseq_cs); #else - if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64))) + if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) return -EFAULT; return 0; #endif -- cgit v1.2.3 From 42dab81e93094eb312a1482a4914d7f5addb8210 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 23 Dec 2021 17:23:00 +0100 Subject: firmware: ti_sci: Fix compilation failure when CONFIG_TI_SCI_PROTOCOL is not defined [ Upstream commit 043cfff99a18933fda2fb2e163daee73cc07910b ] Remove an extra ";" which breaks compilation. Fixes: 53bf2b0e4e4c ("firmware: ti_sci: Add support for getting resource with subtype") Signed-off-by: Christophe JAILLET Signed-off-by: Nishanth Menon Link: https://lore.kernel.org/r/e6c3cb793e1a6a2a0ae2528d5a5650dfe6a4b6ff.1640276505.git.christophe.jaillet@wanadoo.fr Signed-off-by: Sasha Levin --- include/linux/soc/ti/ti_sci_protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 0aad7009b50e..bd0d11af76c5 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -645,7 +645,7 @@ devm_ti_sci_get_of_resource(const struct ti_sci_handle *handle, static inline struct ti_sci_resource * devm_ti_sci_get_resource(const struct ti_sci_handle *handle, struct device *dev, - u32 dev_id, u32 sub_type); + u32 dev_id, u32 sub_type) { return ERR_PTR(-EINVAL); } -- cgit v1.2.3 From da491fc54e4e387419948840636df15f4a611ec4 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 20 Jan 2022 16:16:12 +0100 Subject: drm/edid: Split deep color modes between RGB and YUV444 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 4adc33f36d80489339f1b43dfeee96bb9ea8e459 ] The current code assumes that the RGB444 and YUV444 formats are the same, but the HDMI 2.0 specification states that: The three DC_XXbit bits above only indicate support for RGB 4:4:4 at that pixel size. Support for YCBCR 4:4:4 in Deep Color modes is indicated with the DC_Y444 bit. If DC_Y444 is set, then YCBCR 4:4:4 is supported for all modes indicated by the DC_XXbit flags. So if we have YUV444 support and any DC_XXbit flag set but the DC_Y444 flag isn't, we'll assume that we support that deep colour mode for YUV444 which breaks the specification. In order to fix this, let's split the edid_hdmi_dc_modes field in struct drm_display_info into two fields, one for RGB444 and one for YUV444. Suggested-by: Ville Syrjälä Fixes: d0c94692e0a3 ("drm/edid: Parse and handle HDMI deep color modes.") Signed-off-by: Maxime Ripard Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20220120151625.594595-4-maxime@cerno.tech Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c | 2 +- drivers/gpu/drm/drm_edid.c | 7 ++++--- drivers/gpu/drm/i915/display/intel_hdmi.c | 4 ++-- drivers/gpu/drm/radeon/radeon_connectors.c | 2 +- include/drm/drm_connector.h | 12 +++++++++--- 5 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c index df1f9b88a53f..a09876bb7ec8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c @@ -175,7 +175,7 @@ int amdgpu_connector_get_monitor_bpc(struct drm_connector *connector) /* Check if bpc is within clock limit. Try to degrade gracefully otherwise */ if ((bpc == 12) && (mode_clock * 3/2 > max_tmds_clock)) { - if ((connector->display_info.edid_hdmi_dc_modes & DRM_EDID_HDMI_DC_30) && + if ((connector->display_info.edid_hdmi_rgb444_dc_modes & DRM_EDID_HDMI_DC_30) && (mode_clock * 5/4 <= max_tmds_clock)) bpc = 10; else diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index cdf5789ab246..ee6f44f9a81c 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -5004,21 +5004,21 @@ static void drm_parse_hdmi_deep_color_info(struct drm_connector *connector, if (hdmi[6] & DRM_EDID_HDMI_DC_30) { dc_bpc = 10; - info->edid_hdmi_dc_modes |= DRM_EDID_HDMI_DC_30; + info->edid_hdmi_rgb444_dc_modes |= DRM_EDID_HDMI_DC_30; DRM_DEBUG("%s: HDMI sink does deep color 30.\n", connector->name); } if (hdmi[6] & DRM_EDID_HDMI_DC_36) { dc_bpc = 12; - info->edid_hdmi_dc_modes |= DRM_EDID_HDMI_DC_36; + info->edid_hdmi_rgb444_dc_modes |= DRM_EDID_HDMI_DC_36; DRM_DEBUG("%s: HDMI sink does deep color 36.\n", connector->name); } if (hdmi[6] & DRM_EDID_HDMI_DC_48) { dc_bpc = 16; - info->edid_hdmi_dc_modes |= DRM_EDID_HDMI_DC_48; + info->edid_hdmi_rgb444_dc_modes |= DRM_EDID_HDMI_DC_48; DRM_DEBUG("%s: HDMI sink does deep color 48.\n", connector->name); } @@ -5035,6 +5035,7 @@ static void drm_parse_hdmi_deep_color_info(struct drm_connector *connector, /* YCRCB444 is optional according to spec. */ if (hdmi[6] & DRM_EDID_HDMI_DC_Y444) { + info->edid_hdmi_ycbcr444_dc_modes = info->edid_hdmi_rgb444_dc_modes; DRM_DEBUG("%s: HDMI sink does YCRCB444 in deep color.\n", connector->name); } diff --git a/drivers/gpu/drm/i915/display/intel_hdmi.c b/drivers/gpu/drm/i915/display/intel_hdmi.c index c3787512295d..5bb2a42bf6b0 100644 --- a/drivers/gpu/drm/i915/display/intel_hdmi.c +++ b/drivers/gpu/drm/i915/display/intel_hdmi.c @@ -1892,7 +1892,7 @@ static bool intel_hdmi_bpc_possible(struct drm_connector *connector, if (ycbcr420_output) return hdmi->y420_dc_modes & DRM_EDID_YCBCR420_DC_36; else - return info->edid_hdmi_dc_modes & DRM_EDID_HDMI_DC_36; + return info->edid_hdmi_rgb444_dc_modes & DRM_EDID_HDMI_DC_36; case 10: if (DISPLAY_VER(i915) < 11) return false; @@ -1903,7 +1903,7 @@ static bool intel_hdmi_bpc_possible(struct drm_connector *connector, if (ycbcr420_output) return hdmi->y420_dc_modes & DRM_EDID_YCBCR420_DC_30; else - return info->edid_hdmi_dc_modes & DRM_EDID_HDMI_DC_30; + return info->edid_hdmi_rgb444_dc_modes & DRM_EDID_HDMI_DC_30; case 8: return true; default: diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c index 607ad5620bd9..1546abcadacf 100644 --- a/drivers/gpu/drm/radeon/radeon_connectors.c +++ b/drivers/gpu/drm/radeon/radeon_connectors.c @@ -204,7 +204,7 @@ int radeon_get_monitor_bpc(struct drm_connector *connector) /* Check if bpc is within clock limit. Try to degrade gracefully otherwise */ if ((bpc == 12) && (mode_clock * 3/2 > max_tmds_clock)) { - if ((connector->display_info.edid_hdmi_dc_modes & DRM_EDID_HDMI_DC_30) && + if ((connector->display_info.edid_hdmi_rgb444_dc_modes & DRM_EDID_HDMI_DC_30) && (mode_clock * 5/4 <= max_tmds_clock)) bpc = 10; else diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index 1647960c9e50..dbd0ccdec656 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -566,10 +566,16 @@ struct drm_display_info { bool rgb_quant_range_selectable; /** - * @edid_hdmi_dc_modes: Mask of supported hdmi deep color modes. Even - * more stuff redundant with @bus_formats. + * @edid_hdmi_dc_rgb444_modes: Mask of supported hdmi deep color modes + * in RGB 4:4:4. Even more stuff redundant with @bus_formats. */ - u8 edid_hdmi_dc_modes; + u8 edid_hdmi_rgb444_dc_modes; + + /** + * @edid_hdmi_dc_ycbcr444_modes: Mask of supported hdmi deep color + * modes in YCbCr 4:4:4. Even more stuff redundant with @bus_formats. + */ + u8 edid_hdmi_ycbcr444_dc_modes; /** * @cea_rev: CEA revision of the HDMI sink. -- cgit v1.2.3 From e37d269734ee9866bef3f965d4f545b86352f326 Mon Sep 17 00:00:00 2001 From: Mark Tomlinson Date: Thu, 6 Aug 2020 16:14:55 +1200 Subject: PCI: Reduce warnings on possible RW1C corruption [ Upstream commit 92c45b63ce22c8898aa41806e8d6692bcd577510 ] For hardware that only supports 32-bit writes to PCI there is the possibility of clearing RW1C (write-one-to-clear) bits. A rate-limited messages was introduced by fb2659230120, but rate-limiting is not the best choice here. Some devices may not show the warnings they should if another device has just produced a bunch of warnings. Also, the number of messages can be a nuisance on devices which are otherwise working fine. Change the ratelimit to a single warning per bus. This ensures no bus is 'starved' of emitting a warning and also that there isn't a continuous stream of warnings. It would be preferable to have a warning per device, but the pci_dev structure is not available here, and a lookup from devfn would be far too slow. Suggested-by: Bjorn Helgaas Fixes: fb2659230120 ("PCI: Warn on possible RW1C corruption for sub-32 bit config writes") Link: https://lore.kernel.org/r/20200806041455.11070-1-mark.tomlinson@alliedtelesis.co.nz Signed-off-by: Mark Tomlinson Signed-off-by: Bjorn Helgaas Reviewed-by: Florian Fainelli Reviewed-by: Rob Herring Acked-by: Scott Branden Signed-off-by: Sasha Levin --- drivers/pci/access.c | 9 ++++++--- include/linux/pci.h | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 46935695cfb9..8d0d1f61c650 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -160,9 +160,12 @@ int pci_generic_config_write32(struct pci_bus *bus, unsigned int devfn, * write happen to have any RW1C (write-one-to-clear) bits set, we * just inadvertently cleared something we shouldn't have. */ - dev_warn_ratelimited(&bus->dev, "%d-byte config write to %04x:%02x:%02x.%d offset %#x may corrupt adjacent RW1C bits\n", - size, pci_domain_nr(bus), bus->number, - PCI_SLOT(devfn), PCI_FUNC(devfn), where); + if (!bus->unsafe_warn) { + dev_warn(&bus->dev, "%d-byte config write to %04x:%02x:%02x.%d offset %#x may corrupt adjacent RW1C bits\n", + size, pci_domain_nr(bus), bus->number, + PCI_SLOT(devfn), PCI_FUNC(devfn), where); + bus->unsafe_warn = 1; + } mask = ~(((1 << (size * 8)) - 1) << ((where & 0x3) * 8)); tmp = readl(addr) & mask; diff --git a/include/linux/pci.h b/include/linux/pci.h index 152a4d74f87f..9d6e75222868 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -656,6 +656,7 @@ struct pci_bus { struct bin_attribute *legacy_io; /* Legacy I/O for this bus */ struct bin_attribute *legacy_mem; /* Legacy mem */ unsigned int is_added:1; + unsigned int unsafe_warn:1; /* warned about RW1C config write */ }; #define to_pci_bus(n) container_of(n, struct pci_bus, dev) -- cgit v1.2.3 From ef9785f429794567792561a584901faa9291d3ee Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 4 Mar 2022 16:11:42 +0800 Subject: bpf, sockmap: Fix memleak in sk_psock_queue_msg [ Upstream commit 938d3480b92fa5e454b7734294f12a7b75126f09 ] If tcp_bpf_sendmsg is running during a tear down operation we may enqueue data on the ingress msg queue while tear down is trying to free it. sk1 (redirect sk2) sk2 ------------------- --------------- tcp_bpf_sendmsg() tcp_bpf_send_verdict() tcp_bpf_sendmsg_redir() bpf_tcp_ingress() sock_map_close() lock_sock() lock_sock() ... blocking sk_psock_stop sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); release_sock(sk); lock_sock() sk_mem_charge() get_page() sk_psock_queue_msg() sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED); drop_sk_msg() release_sock() While drop_sk_msg(), the msg has charged memory form sk by sk_mem_charge and has sg pages need to put. To fix we use sk_msg_free() and then kfee() msg. This issue can cause the following info: WARNING: CPU: 0 PID: 9202 at net/core/stream.c:205 sk_stream_kill_queues+0xc8/0xe0 Call Trace: inet_csk_destroy_sock+0x55/0x110 tcp_rcv_state_process+0xe5f/0xe90 ? sk_filter_trim_cap+0x10d/0x230 ? tcp_v4_do_rcv+0x161/0x250 tcp_v4_do_rcv+0x161/0x250 tcp_v4_rcv+0xc3a/0xce0 ip_protocol_deliver_rcu+0x3d/0x230 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0xfd/0x110 ? ip_protocol_deliver_rcu+0x230/0x230 ip_rcv+0xd6/0x100 ? ip_local_deliver+0x110/0x110 __netif_receive_skb_one_core+0x85/0xa0 process_backlog+0xa4/0x160 __napi_poll+0x29/0x1b0 net_rx_action+0x287/0x300 __do_softirq+0xff/0x2fc do_softirq+0x79/0x90 WARNING: CPU: 0 PID: 531 at net/ipv4/af_inet.c:154 inet_sock_destruct+0x175/0x1b0 Call Trace: __sk_destruct+0x24/0x1f0 sk_psock_destroy+0x19b/0x1c0 process_one_work+0x1b3/0x3c0 ? process_one_work+0x3c0/0x3c0 worker_thread+0x30/0x350 ? process_one_work+0x3c0/0x3c0 kthread+0xe6/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 Fixes: 9635720b7c88 ("bpf, sockmap: Fix memleak on ingress msg enqueue") Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220304081145.2037182-2-wangyufen@huawei.com Signed-off-by: Sasha Levin --- include/linux/skmsg.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index b4256847c707..73bedd128d52 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -310,21 +310,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg) -{ - if (msg->skb) - sock_drop(psock->sk, msg->skb); - kfree(msg); -} - static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); - else - drop_sk_msg(psock, msg); + else { + sk_msg_free(psock->sk, msg); + kfree(msg); + } spin_unlock_bh(&psock->ingress_lock); } -- cgit v1.2.3 From 2a85c4cb3b1585adad7917dfa7d1889939bdec3e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Mar 2022 11:19:43 +0100 Subject: netfilter: flowtable: Fix QinQ and pppoe support for inet table [ Upstream commit 0492d857636e1c52cd71594a723c4b26a7b31978 ] nf_flow_offload_inet_hook() does not check for 802.1q and PPPoE. Fetch inner ethertype from these encapsulation protocols. Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Fixes: 4cd91f7c290f ("netfilter: flowtable: add vlan support") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_flow_table.h | 18 ++++++++++++++++++ net/netfilter/nf_flow_table_inet.c | 17 +++++++++++++++++ net/netfilter/nf_flow_table_ip.c | 18 ------------------ 3 files changed, 35 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index a3647fadf1cc..9f927c44087d 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include struct nf_flowtable; struct nf_flow_rule; @@ -313,4 +315,20 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); +static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +{ + __be16 proto; + + proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + + sizeof(struct pppoe_hdr))); + switch (proto) { + case htons(PPP_IP): + return htons(ETH_P_IP); + case htons(PPP_IPV6): + return htons(ETH_P_IPV6); + } + + return 0; +} + #endif /* _NF_FLOW_TABLE_H */ diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c index bc4126d8ef65..280fdd32965f 100644 --- a/net/netfilter/nf_flow_table_inet.c +++ b/net/netfilter/nf_flow_table_inet.c @@ -6,12 +6,29 @@ #include #include #include +#include static unsigned int nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct vlan_ethhdr *veth; + __be16 proto; + switch (skb->protocol) { + case htons(ETH_P_8021Q): + veth = (struct vlan_ethhdr *)skb_mac_header(skb); + proto = veth->h_vlan_encapsulated_proto; + break; + case htons(ETH_P_PPP_SES): + proto = nf_flow_pppoe_proto(skb); + break; + default: + proto = skb->protocol; + break; + } + + switch (proto) { case htons(ETH_P_IP): return nf_flow_offload_ip_hook(priv, skb, state); case htons(ETH_P_IPV6): diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 889cf88d3dba..6257d87c3a56 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -8,8 +8,6 @@ #include #include #include -#include -#include #include #include #include @@ -239,22 +237,6 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) -{ - __be16 proto; - - proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + - sizeof(struct pppoe_hdr))); - switch (proto) { - case htons(PPP_IP): - return htons(ETH_P_IP); - case htons(PPP_IPV6): - return htons(ETH_P_IPV6); - } - - return 0; -} - static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto, u32 *offset) { -- cgit v1.2.3 From 0ee072f91326d4a616098fb55c9e0e8e6a399642 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 2 Mar 2022 22:02:55 +0100 Subject: netfilter: conntrack: Add and use nf_ct_set_auto_assign_helper_warned() [ Upstream commit 31d0bb9763efad30377505f3467f958d1ebe1e3d ] The function sets the pernet boolean to avoid the spurious warning from nf_ct_lookup_helper() when assigning conntrack helpers via nftables. Fixes: 1a64edf54f55 ("netfilter: nft_ct: add helper set support") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/netfilter/nf_conntrack_helper.h | 1 + net/netfilter/nf_conntrack_helper.c | 6 ++++++ net/netfilter/nft_ct.c | 3 +++ 3 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 37f0fbefb060..9939c366f720 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -177,4 +177,5 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat); int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_nat_helper_put(struct nf_conntrack_helper *helper); +void nf_ct_set_auto_assign_helper_warned(struct net *net); #endif /*_NF_CONNTRACK_HELPER_H*/ diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index ae4488a13c70..ceb38a7b37cb 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -556,6 +556,12 @@ static const struct nf_ct_ext_type helper_extend = { .id = NF_CT_EXT_HELPER, }; +void nf_ct_set_auto_assign_helper_warned(struct net *net) +{ + nf_ct_pernet(net)->auto_assign_helper_warned = true; +} +EXPORT_SYMBOL_GPL(nf_ct_set_auto_assign_helper_warned); + void nf_conntrack_helper_pernet_init(struct net *net) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 99b1de14ff7e..54ecb9fbf2de 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1040,6 +1040,9 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (err < 0) goto err_put_helper; + /* Avoid the bogus warning, helper will be assigned after CT init */ + nf_ct_set_auto_assign_helper_warned(ctx->net); + return 0; err_put_helper: -- cgit v1.2.3 From d6c4fc0d903fbcc8e1190ed12bfd80f5d031c41f Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Thu, 10 Feb 2022 16:42:03 -0800 Subject: serial: 8250_aspeed_vuart: add PORT_ASPEED_VUART port type [ Upstream commit a603ca60cebff8589882427a67f870ed946b3fc8 ] Commit 54da3e381c2b ("serial: 8250_aspeed_vuart: use UPF_IOREMAP to set up register mapping") fixed a bug that had, as a side-effect, prevented the 8250_aspeed_vuart driver from enabling the VUART's FIFOs. However, fixing that (and hence enabling the FIFOs) has in turn revealed what appears to be a hardware bug in the ASPEED VUART in which the host-side THRE bit doesn't get if the BMC-side receive FIFO trigger level is set to anything but one byte. This causes problems for polled-mode writes from the host -- for example, Linux kernel console writes proceed at a glacial pace (less than 100 bytes per second) because the write path waits for a 10ms timeout to expire after every character instead of being able to continue on to the next character upon seeing THRE asserted. (GRUB behaves similarly.) As a workaround, introduce a new port type for the ASPEED VUART that's identical to PORT_16550A as it had previously been using, but with UART_FCR_R_TRIG_00 instead to set the receive FIFO trigger level to one byte, which (experimentally) seems to avoid the problematic THRE behavior. Fixes: 54da3e381c2b ("serial: 8250_aspeed_vuart: use UPF_IOREMAP to set up register mapping") Tested-by: Konstantin Aladyshev Reviewed-by: Andy Shevchenko Signed-off-by: Zev Weiss Link: https://lore.kernel.org/r/20220211004203.14915-1-zev@bewilderbeest.net Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/8250/8250_aspeed_vuart.c | 2 +- drivers/tty/serial/8250/8250_port.c | 8 ++++++++ include/uapi/linux/serial_core.h | 3 +++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c index 2350fb3bb5e4..c2cecc6f47db 100644 --- a/drivers/tty/serial/8250/8250_aspeed_vuart.c +++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c @@ -487,7 +487,7 @@ static int aspeed_vuart_probe(struct platform_device *pdev) port.port.irq = irq_of_parse_and_map(np, 0); port.port.handle_irq = aspeed_vuart_handle_irq; port.port.iotype = UPIO_MEM; - port.port.type = PORT_16550A; + port.port.type = PORT_ASPEED_VUART; port.port.uartclk = clk; port.port.flags = UPF_SHARE_IRQ | UPF_BOOT_AUTOCONF | UPF_IOREMAP | UPF_FIXED_PORT | UPF_FIXED_TYPE | UPF_NO_THRE_TEST; diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index ec88b706e882..b470bc747b99 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -307,6 +307,14 @@ static const struct serial8250_config uart_config[] = { .rxtrig_bytes = {1, 32, 64, 112}, .flags = UART_CAP_FIFO | UART_CAP_SLEEP, }, + [PORT_ASPEED_VUART] = { + .name = "ASPEED VUART", + .fifo_size = 16, + .tx_loadsz = 16, + .fcr = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00, + .rxtrig_bytes = {1, 4, 8, 14}, + .flags = UART_CAP_FIFO, + }, }; /* Uart divisor latch read */ diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index c4042dcfdc0c..8885e69178bd 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -68,6 +68,9 @@ /* NVIDIA Tegra Combined UART */ #define PORT_TEGRA_TCU 41 +/* ASPEED AST2x00 virtual UART */ +#define PORT_ASPEED_VUART 42 + /* Intel EG20 */ #define PORT_PCH_8LINE 44 #define PORT_PCH_2LINE 45 -- cgit v1.2.3 From 16e3238047322c3967e601b2813a10837fb2acb9 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Mon, 14 Mar 2022 11:14:32 +0200 Subject: serial: 8250: fix XOFF/XON sending when DMA is used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit f58c252e30cf74f68b0054293adc03b5923b9f0e ] When 8250 UART is using DMA, x_char (XON/XOFF) is never sent to the wire. After this change, x_char is injected correctly. Create uart_xchar_out() helper for sending the x_char out and accounting related to it. It seems that almost every driver does these same steps with x_char. Except for 8250, however, almost all currently lack .serial_out so they cannot immediately take advantage of this new helper. The downside of this patch is that it might reintroduce the problems some devices faced with mixed DMA/non-DMA transfer which caused revert f967fc8f165f (Revert "serial: 8250_dma: don't bother DMA with small transfers"). However, the impact should be limited to cases with XON/XOFF (that didn't work with DMA capable devices to begin with so this problem is not very likely to cause a major issue, if any at all). Fixes: 9ee4b83e51f74 ("serial: 8250: Add support for dmaengine") Reported-by: Gilles Buloz Tested-by: Gilles Buloz Reviewed-by: Andy Shevchenko Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20220314091432.4288-2-ilpo.jarvinen@linux.intel.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/8250/8250_dma.c | 11 ++++++++++- drivers/tty/serial/8250/8250_port.c | 4 +--- drivers/tty/serial/serial_core.c | 14 ++++++++++++++ include/linux/serial_core.h | 2 ++ 4 files changed, 27 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/8250/8250_dma.c b/drivers/tty/serial/8250/8250_dma.c index 890fa7ddaa7f..b3c3f7e5851a 100644 --- a/drivers/tty/serial/8250/8250_dma.c +++ b/drivers/tty/serial/8250/8250_dma.c @@ -64,10 +64,19 @@ int serial8250_tx_dma(struct uart_8250_port *p) struct uart_8250_dma *dma = p->dma; struct circ_buf *xmit = &p->port.state->xmit; struct dma_async_tx_descriptor *desc; + struct uart_port *up = &p->port; int ret; - if (dma->tx_running) + if (dma->tx_running) { + if (up->x_char) { + dmaengine_pause(dma->txchan); + uart_xchar_out(up, UART_TX); + dmaengine_resume(dma->txchan); + } return 0; + } else if (up->x_char) { + uart_xchar_out(up, UART_TX); + } if (uart_tx_stopped(&p->port) || uart_circ_empty(xmit)) { /* We have been called from __dma_tx_complete() */ diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 868ccb3e16cf..723ec0806799 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1819,9 +1819,7 @@ void serial8250_tx_chars(struct uart_8250_port *up) int count; if (port->x_char) { - serial_out(up, UART_TX, port->x_char); - port->icount.tx++; - port->x_char = 0; + uart_xchar_out(port, UART_TX); return; } if (uart_tx_stopped(port)) { diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index dc6129ddef85..eb15423f935a 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -652,6 +652,20 @@ static void uart_flush_buffer(struct tty_struct *tty) tty_port_tty_wakeup(&state->port); } +/* + * This function performs low-level write of high-priority XON/XOFF + * character and accounting for it. + * + * Requires uart_port to implement .serial_out(). + */ +void uart_xchar_out(struct uart_port *uport, int offset) +{ + serial_port_out(uport, offset, uport->x_char); + uport->icount.tx++; + uport->x_char = 0; +} +EXPORT_SYMBOL_GPL(uart_xchar_out); + /* * This function is used to send a high-priority XON/XOFF character to * the device diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index c58cc142d23f..8c32935e1059 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -458,6 +458,8 @@ extern void uart_handle_cts_change(struct uart_port *uport, extern void uart_insert_char(struct uart_port *port, unsigned int status, unsigned int overrun, unsigned int ch, unsigned int flag); +void uart_xchar_out(struct uart_port *uport, int offset); + #ifdef CONFIG_MAGIC_SYSRQ_SERIAL #define SYSRQ_TIMEOUT (HZ * 5) -- cgit v1.2.3 From 95bc0ba6bef8f23242ef82a42deebb3ebf58b05b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 7 Feb 2022 10:19:43 +0000 Subject: atomics: Fix atomic64_{read_acquire,set_release} fallbacks [ Upstream commit dc1b4df09acdca7a89806b28f235cd6d8dcd3d24 ] Arnd reports that on 32-bit architectures, the fallbacks for atomic64_read_acquire() and atomic64_set_release() are broken as they use smp_load_acquire() and smp_store_release() respectively, which do not work on types larger than the native word size. Since those contain compiletime_assert_atomic_type(), any attempt to use those fallbacks will result in a build-time error. e.g. with the following added to arch/arm/kernel/setup.c: | void test_atomic64(atomic64_t *v) | { | atomic64_set_release(v, 5); | atomic64_read_acquire(v); | } The compiler will complain as follows: | In file included from : | In function 'arch_atomic64_set_release', | inlined from 'test_atomic64' at ./include/linux/atomic/atomic-instrumented.h:669:2: | ././include/linux/compiler_types.h:346:38: error: call to '__compiletime_assert_9' declared with attribute error: Need native word sized stores/loads for atomicity. | 346 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | | ^ | ././include/linux/compiler_types.h:327:4: note: in definition of macro '__compiletime_assert' | 327 | prefix ## suffix(); \ | | ^~~~~~ | ././include/linux/compiler_types.h:346:2: note: in expansion of macro '_compiletime_assert' | 346 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | | ^~~~~~~~~~~~~~~~~~~ | ././include/linux/compiler_types.h:349:2: note: in expansion of macro 'compiletime_assert' | 349 | compiletime_assert(__native_word(t), \ | | ^~~~~~~~~~~~~~~~~~ | ./include/asm-generic/barrier.h:133:2: note: in expansion of macro 'compiletime_assert_atomic_type' | 133 | compiletime_assert_atomic_type(*p); \ | | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ./include/asm-generic/barrier.h:164:55: note: in expansion of macro '__smp_store_release' | 164 | #define smp_store_release(p, v) do { kcsan_release(); __smp_store_release(p, v); } while (0) | | ^~~~~~~~~~~~~~~~~~~ | ./include/linux/atomic/atomic-arch-fallback.h:1270:2: note: in expansion of macro 'smp_store_release' | 1270 | smp_store_release(&(v)->counter, i); | | ^~~~~~~~~~~~~~~~~ | make[2]: *** [scripts/Makefile.build:288: arch/arm/kernel/setup.o] Error 1 | make[1]: *** [scripts/Makefile.build:550: arch/arm/kernel] Error 2 | make: *** [Makefile:1831: arch/arm] Error 2 Fix this by only using smp_load_acquire() and smp_store_release() for native atomic types, and otherwise falling back to the regular barriers necessary for acquire/release semantics, as we do in the more generic acquire and release fallbacks. Since the fallback templates are used to generate the atomic64_*() and atomic_*() operations, the __native_word() check is added to both. For the atomic_*() operations, which are always 32-bit, the __native_word() check is redundant but not harmful, as it is always true. For the example above this works as expected on 32-bit, e.g. for arm multi_v7_defconfig: | : | push {r4, r5} | dmb ish | pldw [r0] | mov r2, #5 | mov r3, #0 | ldrexd r4, [r0] | strexd r4, r2, [r0] | teq r4, #0 | bne 484 | ldrexd r2, [r0] | dmb ish | pop {r4, r5} | bx lr ... and also on 64-bit, e.g. for arm64 defconfig: | : | bti c | paciasp | mov x1, #0x5 | stlr x1, [x0] | ldar x0, [x0] | autiasp | ret Reported-by: Arnd Bergmann Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ard Biesheuvel Reviewed-by: Boqun Feng Link: https://lore.kernel.org/r/20220207101943.439825-1-mark.rutland@arm.com Signed-off-by: Sasha Levin --- include/linux/atomic/atomic-arch-fallback.h | 38 +++++++++++++++++++++++++---- scripts/atomic/fallbacks/read_acquire | 11 ++++++++- scripts/atomic/fallbacks/set_release | 7 +++++- 3 files changed, 49 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index a3dba31df01e..6db58d180866 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -151,7 +151,16 @@ static __always_inline int arch_atomic_read_acquire(const atomic_t *v) { - return smp_load_acquire(&(v)->counter); + int ret; + + if (__native_word(atomic_t)) { + ret = smp_load_acquire(&(v)->counter); + } else { + ret = arch_atomic_read(v); + __atomic_acquire_fence(); + } + + return ret; } #define arch_atomic_read_acquire arch_atomic_read_acquire #endif @@ -160,7 +169,12 @@ arch_atomic_read_acquire(const atomic_t *v) static __always_inline void arch_atomic_set_release(atomic_t *v, int i) { - smp_store_release(&(v)->counter, i); + if (__native_word(atomic_t)) { + smp_store_release(&(v)->counter, i); + } else { + __atomic_release_fence(); + arch_atomic_set(v, i); + } } #define arch_atomic_set_release arch_atomic_set_release #endif @@ -1258,7 +1272,16 @@ arch_atomic_dec_if_positive(atomic_t *v) static __always_inline s64 arch_atomic64_read_acquire(const atomic64_t *v) { - return smp_load_acquire(&(v)->counter); + s64 ret; + + if (__native_word(atomic64_t)) { + ret = smp_load_acquire(&(v)->counter); + } else { + ret = arch_atomic64_read(v); + __atomic_acquire_fence(); + } + + return ret; } #define arch_atomic64_read_acquire arch_atomic64_read_acquire #endif @@ -1267,7 +1290,12 @@ arch_atomic64_read_acquire(const atomic64_t *v) static __always_inline void arch_atomic64_set_release(atomic64_t *v, s64 i) { - smp_store_release(&(v)->counter, i); + if (__native_word(atomic64_t)) { + smp_store_release(&(v)->counter, i); + } else { + __atomic_release_fence(); + arch_atomic64_set(v, i); + } } #define arch_atomic64_set_release arch_atomic64_set_release #endif @@ -2358,4 +2386,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v) #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// cca554917d7ea73d5e3e7397dd70c484cad9b2c4 +// 8e2cc06bc0d2c0967d2f8424762bd48555ee40ae diff --git a/scripts/atomic/fallbacks/read_acquire b/scripts/atomic/fallbacks/read_acquire index 803ba7561076..a0ea1d26e6b2 100755 --- a/scripts/atomic/fallbacks/read_acquire +++ b/scripts/atomic/fallbacks/read_acquire @@ -2,6 +2,15 @@ cat <counter); + ${int} ret; + + if (__native_word(${atomic}_t)) { + ret = smp_load_acquire(&(v)->counter); + } else { + ret = arch_${atomic}_read(v); + __atomic_acquire_fence(); + } + + return ret; } EOF diff --git a/scripts/atomic/fallbacks/set_release b/scripts/atomic/fallbacks/set_release index 86ede759f24e..05cdb7f42477 100755 --- a/scripts/atomic/fallbacks/set_release +++ b/scripts/atomic/fallbacks/set_release @@ -2,6 +2,11 @@ cat <counter, i); + if (__native_word(${atomic}_t)) { + smp_store_release(&(v)->counter, i); + } else { + __atomic_release_fence(); + arch_${atomic}_set(v, i); + } } EOF -- cgit v1.2.3 From 7cca463f103068ba2532aaf57a564f21f2b55f15 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Thu, 20 Jan 2022 16:25:20 +0000 Subject: sched/tracing: Report TASK_RTLOCK_WAIT tasks as TASK_UNINTERRUPTIBLE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 25795ef6299f07ce3838f3253a9cb34f64efcfae ] TASK_RTLOCK_WAIT currently isn't part of TASK_REPORT, thus a task blocking on an rtlock will appear as having a task state == 0, IOW TASK_RUNNING. The actual state is saved in p->saved_state, but reading it after reading p->__state has a few issues: o that could still be TASK_RUNNING in the case of e.g. rt_spin_lock o ttwu_state_match() might have changed that to TASK_RUNNING As pointed out by Eric, adding TASK_RTLOCK_WAIT to TASK_REPORT implies exposing a new state to userspace tools which way not know what to do with them. The only information that needs to be conveyed here is that a task is waiting on an rt_mutex, which matches TASK_UNINTERRUPTIBLE - there's no need for a new state. Reported-by: Uwe Kleine-König Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20220120162520.570782-3-valentin.schneider@arm.com Signed-off-by: Sasha Levin --- include/linux/sched.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 76e869550646..9016bbacedf3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1626,6 +1626,14 @@ static inline unsigned int task_state_index(struct task_struct *tsk) if (tsk_state == TASK_IDLE) state = TASK_REPORT_IDLE; + /* + * We're lying here, but rather than expose a completely new task state + * to userspace, we can make this appear as if the task has gone through + * a regular rt_mutex_lock() call. + */ + if (tsk_state == TASK_RTLOCK_WAIT) + state = TASK_UNINTERRUPTIBLE; + return fls(state); } -- cgit v1.2.3 From e912d697400c5f4bc8c1c8090416dad6c59cb28c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 24 Feb 2022 19:56:09 -0800 Subject: drm/dp: Fix off-by-one in register cache size commit d4da1f27396fb1dde079447a3612f4f512caed07 upstream. The pcon_dsc_dpcd array holds 13 registers (0x92 through 0x9E). Fix the math to calculate the max size. Found from a -Warray-bounds build: drivers/gpu/drm/drm_dp_helper.c: In function 'drm_dp_pcon_dsc_bpp_incr': drivers/gpu/drm/drm_dp_helper.c:3130:28: error: array subscript 12 is outside array bounds of 'const u8[12]' {aka 'const unsigned char[12]'} [-Werror=array-bounds] 3130 | buf = pcon_dsc_dpcd[DP_PCON_DSC_BPP_INCR - DP_PCON_DSC_ENCODER]; | ~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/drm_dp_helper.c:3126:39: note: while referencing 'pcon_dsc_dpcd' 3126 | int drm_dp_pcon_dsc_bpp_incr(const u8 pcon_dsc_dpcd[DP_PCON_DSC_ENCODER_CAP_SIZE]) | ~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Cc: Daniel Vetter Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: dri-devel@lists.freedesktop.org Fixes: e2e16da398d9 ("drm/dp_helper: Add support for Configuring DSC for HDMI2.1 Pcon") Cc: stable@vger.kernel.org Reviewed-by: Gustavo A. R. Silva Link: https://lore.kernel.org/lkml/20211214001849.GA62559@embeddedor/ Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220105173310.2420598-1-keescook@chromium.org Signed-off-by: Thierry Reding Link: https://patchwork.freedesktop.org/patch/msgid/20220225035610.2552144-2-keescook@chromium.org Signed-off-by: Greg Kroah-Hartman --- include/drm/drm_dp_helper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h index 1d5b3dbb6e56..dfb46915015b 100644 --- a/include/drm/drm_dp_helper.h +++ b/include/drm/drm_dp_helper.h @@ -455,7 +455,7 @@ struct drm_panel; # define DP_FEC_BIT_ERROR_COUNT_CAP (1 << 3) /* DP-HDMI2.1 PCON DSC ENCODER SUPPORT */ -#define DP_PCON_DSC_ENCODER_CAP_SIZE 0xC /* 0x9E - 0x92 */ +#define DP_PCON_DSC_ENCODER_CAP_SIZE 0xD /* 0x92 through 0x9E */ #define DP_PCON_DSC_ENCODER 0x092 # define DP_PCON_DSC_ENCODER_SUPPORTED (1 << 0) # define DP_PCON_DSC_PPS_ENC_OVERRIDE (1 << 1) -- cgit v1.2.3 From 7d18d6c71372bde4d62eceb819de631cebd44367 Mon Sep 17 00:00:00 2001 From: Sungup Moon Date: Mon, 14 Mar 2022 20:05:45 +0900 Subject: nvme: allow duplicate NSIDs for private namespaces commit 5974ea7ce0f9a5987fc8cf5e08ad6e3e70bb542e upstream. A NVMe subsystem with multiple controller can have private namespaces that use the same NSID under some conditions: "If Namespace Management, ANA Reporting, or NVM Sets are supported, the NSIDs shall be unique within the NVM subsystem. If the Namespace Management, ANA Reporting, and NVM Sets are not supported, then NSIDs: a) for shared namespace shall be unique; and b) for private namespace are not required to be unique." Reference: Section 6.1.6 NSID and Namespace Usage; NVM Express 1.4c spec. Make sure this specific setup is supported in Linux. Fixes: 9ad1927a3bc2 ("nvme: always search for namespace head") Signed-off-by: Sungup Moon [hch: refactored and fixed the controller vs subsystem based naming conflict] Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/core.c | 15 ++++++++++----- drivers/nvme/host/multipath.c | 7 ++++--- drivers/nvme/host/nvme.h | 19 +++++++++++++++++++ include/linux/nvme.h | 1 + 4 files changed, 34 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b349e19e745b..916f55bb7662 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3510,15 +3510,20 @@ static const struct attribute_group *nvme_dev_attr_groups[] = { NULL, }; -static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, +static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns_head *h; - lockdep_assert_held(&subsys->lock); + lockdep_assert_held(&ctrl->subsys->lock); - list_for_each_entry(h, &subsys->nsheads, entry) { - if (h->ns_id != nsid) + list_for_each_entry(h, &ctrl->subsys->nsheads, entry) { + /* + * Private namespaces can share NSIDs under some conditions. + * In that case we can't use the same ns_head for namespaces + * with the same NSID. + */ + if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h)) continue; if (!list_empty(&h->list) && nvme_tryget_ns_head(h)) return h; @@ -3686,7 +3691,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, int ret = 0; mutex_lock(&ctrl->subsys->lock); - head = nvme_find_ns_head(ctrl->subsys, nsid); + head = nvme_find_ns_head(ctrl, nsid); if (!head) { head = nvme_alloc_ns_head(ctrl, nsid, ids); if (IS_ERR(head)) { diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 727520c39710..e9301b51db76 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -462,10 +462,11 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) /* * Add a multipath node if the subsystems supports multiple controllers. - * We also do this for private namespaces as the namespace sharing data could - * change after a rescan. + * We also do this for private namespaces as the namespace sharing flag + * could change after a rescan. */ - if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) + if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || + !nvme_is_unique_nsid(ctrl, head) || !multipath) return 0; head->disk = blk_alloc_disk(ctrl->numa_node); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ed79a6c7e804..0628e2d802e7 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -693,6 +693,25 @@ static inline bool nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, return true; return __nvme_check_ready(ctrl, rq, queue_live); } + +/* + * NSID shall be unique for all shared namespaces, or if at least one of the + * following conditions is met: + * 1. Namespace Management is supported by the controller + * 2. ANA is supported by the controller + * 3. NVM Set are supported by the controller + * + * In other case, private namespace are not required to report a unique NSID. + */ +static inline bool nvme_is_unique_nsid(struct nvme_ctrl *ctrl, + struct nvme_ns_head *head) +{ + return head->shared || + (ctrl->oacs & NVME_CTRL_OACS_NS_MNGT_SUPP) || + (ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA) || + (ctrl->ctratt & NVME_CTRL_CTRATT_NVM_SETS); +} + int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/include/linux/nvme.h b/include/linux/nvme.h index b7c4c4130b65..039f59ee8f43 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -322,6 +322,7 @@ enum { NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_NS_MNGT_SUPP = 1 << 3, NVME_CTRL_OACS_DIRECTIVES = 1 << 5, NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8, NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1, -- cgit v1.2.3 From 8cbf4ae7a2833767d63114573e5f9a45740cc975 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 30 Mar 2022 15:39:16 +0100 Subject: rxrpc: Fix call timer start racing with call destruction commit 4a7f62f91933c8ae5308f9127fd8ea48188b6bc3 upstream. The rxrpc_call struct has a timer used to handle various timed events relating to a call. This timer can get started from the packet input routines that are run in softirq mode with just the RCU read lock held. Unfortunately, because only the RCU read lock is held - and neither ref or other lock is taken - the call can start getting destroyed at the same time a packet comes in addressed to that call. This causes the timer - which was already stopped - to get restarted. Later, the timer dispatch code may then oops if the timer got deallocated first. Fix this by trying to take a ref on the rxrpc_call struct and, if successful, passing that ref along to the timer. If the timer was already running, the ref is discarded. The timer completion routine can then pass the ref along to the call's work item when it queues it. If the timer or work item where already queued/running, the extra ref is discarded. Fixes: a158bdd3247b ("rxrpc: Fix call timeouts") Reported-by: Marc Dionne Signed-off-by: David Howells Reviewed-by: Marc Dionne Tested-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: http://lists.infradead.org/pipermail/linux-afs/2022-March/005073.html Link: https://lore.kernel.org/r/164865115696.2943015.11097991776647323586.stgit@warthog.procyon.org.uk Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman --- include/trace/events/rxrpc.h | 8 +++++++- net/rxrpc/ar-internal.h | 15 +++++++-------- net/rxrpc/call_event.c | 2 +- net/rxrpc/call_object.c | 40 +++++++++++++++++++++++++++++++++++----- 4 files changed, 50 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e70c90116eda..4a3ab0ed6e06 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -83,12 +83,15 @@ enum rxrpc_call_trace { rxrpc_call_error, rxrpc_call_got, rxrpc_call_got_kernel, + rxrpc_call_got_timer, rxrpc_call_got_userid, rxrpc_call_new_client, rxrpc_call_new_service, rxrpc_call_put, rxrpc_call_put_kernel, rxrpc_call_put_noqueue, + rxrpc_call_put_notimer, + rxrpc_call_put_timer, rxrpc_call_put_userid, rxrpc_call_queued, rxrpc_call_queued_ref, @@ -278,12 +281,15 @@ enum rxrpc_tx_point { EM(rxrpc_call_error, "*E*") \ EM(rxrpc_call_got, "GOT") \ EM(rxrpc_call_got_kernel, "Gke") \ + EM(rxrpc_call_got_timer, "GTM") \ EM(rxrpc_call_got_userid, "Gus") \ EM(rxrpc_call_new_client, "NWc") \ EM(rxrpc_call_new_service, "NWs") \ EM(rxrpc_call_put, "PUT") \ EM(rxrpc_call_put_kernel, "Pke") \ - EM(rxrpc_call_put_noqueue, "PNQ") \ + EM(rxrpc_call_put_noqueue, "PnQ") \ + EM(rxrpc_call_put_notimer, "PnT") \ + EM(rxrpc_call_put_timer, "PTM") \ EM(rxrpc_call_put_userid, "Pus") \ EM(rxrpc_call_queued, "QUE") \ EM(rxrpc_call_queued_ref, "QUR") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7bd6f8a66a3e..969e532f77a9 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -777,14 +777,12 @@ void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool, bool, enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); -static inline void rxrpc_reduce_call_timer(struct rxrpc_call *call, - unsigned long expire_at, - unsigned long now, - enum rxrpc_timer_trace why) -{ - trace_rxrpc_timer(call, why, now); - timer_reduce(&call->timer, expire_at); -} +void rxrpc_reduce_call_timer(struct rxrpc_call *call, + unsigned long expire_at, + unsigned long now, + enum rxrpc_timer_trace why); + +void rxrpc_delete_call_timer(struct rxrpc_call *call); /* * call_object.c @@ -808,6 +806,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *); bool __rxrpc_queue_call(struct rxrpc_call *); bool rxrpc_queue_call(struct rxrpc_call *); void rxrpc_see_call(struct rxrpc_call *); +bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_cleanup_call(struct rxrpc_call *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index df864e692267..22e05de5d1ca 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -310,7 +310,7 @@ recheck_state: } if (call->state == RXRPC_CALL_COMPLETE) { - del_timer_sync(&call->timer); + rxrpc_delete_call_timer(call); goto out_put; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 4eb91d958a48..043508fd8d8a 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -53,10 +53,30 @@ static void rxrpc_call_timer_expired(struct timer_list *t) if (call->state < RXRPC_CALL_COMPLETE) { trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); - rxrpc_queue_call(call); + __rxrpc_queue_call(call); + } else { + rxrpc_put_call(call, rxrpc_call_put); + } +} + +void rxrpc_reduce_call_timer(struct rxrpc_call *call, + unsigned long expire_at, + unsigned long now, + enum rxrpc_timer_trace why) +{ + if (rxrpc_try_get_call(call, rxrpc_call_got_timer)) { + trace_rxrpc_timer(call, why, now); + if (timer_reduce(&call->timer, expire_at)) + rxrpc_put_call(call, rxrpc_call_put_notimer); } } +void rxrpc_delete_call_timer(struct rxrpc_call *call) +{ + if (del_timer_sync(&call->timer)) + rxrpc_put_call(call, rxrpc_call_put_timer); +} + static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; /* @@ -463,6 +483,17 @@ void rxrpc_see_call(struct rxrpc_call *call) } } +bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) +{ + const void *here = __builtin_return_address(0); + int n = atomic_fetch_add_unless(&call->usage, 1, 0); + + if (n == 0) + return false; + trace_rxrpc_call(call->debug_id, op, n, here, NULL); + return true; +} + /* * Note the addition of a ref on a call. */ @@ -510,8 +541,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) spin_unlock_bh(&call->lock); rxrpc_put_call_slot(call); - - del_timer_sync(&call->timer); + rxrpc_delete_call_timer(call); /* Make sure we don't get any more notifications */ write_lock_bh(&rx->recvmsg_lock); @@ -618,6 +648,8 @@ static void rxrpc_destroy_call(struct work_struct *work) struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); struct rxrpc_net *rxnet = call->rxnet; + rxrpc_delete_call_timer(call); + rxrpc_put_connection(call->conn); rxrpc_put_peer(call->peer); kfree(call->rxtx_buffer); @@ -652,8 +684,6 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); - del_timer_sync(&call->timer); - ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); -- cgit v1.2.3 From b9cf1208af36f6552e6f3b8fdf364983e3cc8e60 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 2 Feb 2022 10:43:40 +0100 Subject: drm/connector: Fix typo in documentation commit dca384a3bf5af1c781cfa6aec63904bdb5018c36 upstream. Commit 4adc33f36d80 ("drm/edid: Split deep color modes between RGB and YUV444") introduced two new variables in struct drm_display_info and their documentation, but the documentation part had a typo resulting in a doc build warning. Fixes: 4adc33f36d80 ("drm/edid: Split deep color modes between RGB and YUV444") Reported-by: Stephen Rothwell Signed-off-by: Maxime Ripard Reviewed-by: Simon Ser Link: https://patchwork.freedesktop.org/patch/msgid/20220202094340.875190-1-maxime@cerno.tech Signed-off-by: Greg Kroah-Hartman --- include/drm/drm_connector.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index dbd0ccdec656..1f43d7c6724a 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -566,13 +566,13 @@ struct drm_display_info { bool rgb_quant_range_selectable; /** - * @edid_hdmi_dc_rgb444_modes: Mask of supported hdmi deep color modes + * @edid_hdmi_rgb444_dc_modes: Mask of supported hdmi deep color modes * in RGB 4:4:4. Even more stuff redundant with @bus_formats. */ u8 edid_hdmi_rgb444_dc_modes; /** - * @edid_hdmi_dc_ycbcr444_modes: Mask of supported hdmi deep color + * @edid_hdmi_ycbcr444_dc_modes: Mask of supported hdmi deep color * modes in YCbCr 4:4:4. Even more stuff redundant with @bus_formats. */ u8 edid_hdmi_ycbcr444_dc_modes; -- cgit v1.2.3 From 7007c894631cf43041dcfa0da7142bbaa7eb673c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Mar 2022 11:37:05 -0700 Subject: Reinstate some of "swiotlb: rework "fix info leak with DMA_FROM_DEVICE"" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 901c7280ca0d5e2b4a8929fbe0bfb007ac2a6544 upstream. Halil Pasic points out [1] that the full revert of that commit (revert in bddac7c1e02b), and that a partial revert that only reverts the problematic case, but still keeps some of the cleanups is probably better.  And that partial revert [2] had already been verified by Oleksandr Natalenko to also fix the issue, I had just missed that in the long discussion. So let's reinstate the cleanups from commit aa6f8dcbab47 ("swiotlb: rework "fix info leak with DMA_FROM_DEVICE""), and effectively only revert the part that caused problems. Link: https://lore.kernel.org/all/20220328013731.017ae3e3.pasic@linux.ibm.com/ [1] Link: https://lore.kernel.org/all/20220324055732.GB12078@lst.de/ [2] Link: https://lore.kernel.org/all/4386660.LvFx2qVVIh@natalenko.name/ [3] Suggested-by: Halil Pasic Tested-by: Oleksandr Natalenko Cc: Christoph Hellwig Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- Documentation/core-api/dma-attributes.rst | 8 -------- include/linux/dma-mapping.h | 8 -------- kernel/dma/swiotlb.c | 12 ++++++++---- 3 files changed, 8 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 17706dc91ec9..1887d92e8e92 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,11 +130,3 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). - -DMA_ATTR_OVERWRITE ------------------- - -This is a hint to the DMA-mapping subsystem that the device is expected to -overwrite the entire mapped size, thus the caller does not require any of the -previous buffer contents to be preserved. This allows bounce-buffering -implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 6150d11a607e..dca2b1355bb1 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,14 +61,6 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) -/* - * This is a hint to the DMA-mapping subsystem that the device is expected - * to overwrite the entire mapped size, thus the caller does not require any - * of the previous buffer contents to be preserved. This allows - * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. - */ -#define DMA_ATTR_OVERWRITE (1UL << 10) - /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index aca0690550e2..912397dbc2a8 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -578,10 +578,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || - dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -- cgit v1.2.3 From 51e458fc0ca63a0a8e8946dfad19535ae7bb2772 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Feb 2022 19:21:12 -0800 Subject: net: add skb_set_end_offset() helper commit 763087dab97547230a6807c865a6a5ae53a59247 upstream. We have multiple places where this helper is convenient, and plan using it in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/linux/skbuff.h | 10 ++++++++++ net/core/skbuff.c | 19 +++++-------------- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 532f5d402f06..615588971328 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1436,6 +1436,11 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } + +static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) +{ + skb->end = offset; +} #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { @@ -1446,6 +1451,11 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } + +static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) +{ + skb->end = skb->head + offset; +} #endif /* Internal */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6cb7ec85c9a1..105530fac890 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -203,7 +203,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb->head = data; skb->data = data; skb_reset_tail_pointer(skb); - skb->end = skb->tail + size; + skb_set_end_offset(skb, size); skb->mac_header = (typeof(skb->mac_header))~0U; skb->transport_header = (typeof(skb->transport_header))~0U; @@ -1738,11 +1738,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->head = data; skb->head_frag = 0; skb->data += off; + + skb_set_end_offset(skb, size); #ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; off = nhead; -#else - skb->end = skb->head + size; #endif skb->tail += off; skb_headers_offset_update(skb, nhead); @@ -6159,11 +6158,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb->head = data; skb->data = data; skb->head_frag = 0; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_set_tail_pointer(skb, skb_headlen(skb)); skb_headers_offset_update(skb, 0); skb->cloned = 0; @@ -6301,11 +6296,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb->head = data; skb->head_frag = 0; skb->data = data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - skb->end = size; -#else - skb->end = skb->head + size; -#endif + skb_set_end_offset(skb, size); skb_reset_tail_pointer(skb); skb_headers_offset_update(skb, 0); skb->cloned = 0; -- cgit v1.2.3 From 23629b673b780d967b88a850b1518cf0f0ffc6aa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Feb 2022 19:21:13 -0800 Subject: net: preserve skb_end_offset() in skb_unclone_keeptruesize() commit 2b88cba55883eaafbc9b7cbff0b2c7cdba71ed01 upstream. syzbot found another way to trigger the infamous WARN_ON_ONCE(delta < len) in skb_try_coalesce() [1] I was able to root cause the issue to kfence. When kfence is in action, the following assertion is no longer true: int size = xxxx; void *ptr1 = kmalloc(size, gfp); void *ptr2 = kmalloc(size, gfp); if (ptr1 && ptr2) ASSERT(ksize(ptr1) == ksize(ptr2)); We attempted to fix these issues in the blamed commits, but forgot that TCP was possibly shifting data after skb_unclone_keeptruesize() has been used, notably from tcp_retrans_try_collapse(). So we not only need to keep same skb->truesize value, we also need to make sure TCP wont fill new tailroom that pskb_expand_head() was able to get from a addr = kmalloc(...) followed by ksize(addr) Split skb_unclone_keeptruesize() into two parts: 1) Inline skb_unclone_keeptruesize() for the common case, when skb is not cloned. 2) Out of line __skb_unclone_keeptruesize() for the 'slow path'. WARNING: CPU: 1 PID: 6490 at net/core/skbuff.c:5295 skb_try_coalesce+0x1235/0x1560 net/core/skbuff.c:5295 Modules linked in: CPU: 1 PID: 6490 Comm: syz-executor161 Not tainted 5.17.0-rc4-syzkaller-00229-g4f12b742eb2b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_try_coalesce+0x1235/0x1560 net/core/skbuff.c:5295 Code: bf 01 00 00 00 0f b7 c0 89 c6 89 44 24 20 e8 62 24 4e fa 8b 44 24 20 83 e8 01 0f 85 e5 f0 ff ff e9 87 f4 ff ff e8 cb 20 4e fa <0f> 0b e9 06 f9 ff ff e8 af b2 95 fa e9 69 f0 ff ff e8 95 b2 95 fa RSP: 0018:ffffc900063af268 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 00000000ffffffd5 RCX: 0000000000000000 RDX: ffff88806fc05700 RSI: ffffffff872abd55 RDI: 0000000000000003 RBP: ffff88806e675500 R08: 00000000ffffffd5 R09: 0000000000000000 R10: ffffffff872ab659 R11: 0000000000000000 R12: ffff88806dd554e8 R13: ffff88806dd9bac0 R14: ffff88806dd9a2c0 R15: 0000000000000155 FS: 00007f18014f9700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020002000 CR3: 000000006be7a000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_try_coalesce net/ipv4/tcp_input.c:4651 [inline] tcp_try_coalesce+0x393/0x920 net/ipv4/tcp_input.c:4630 tcp_queue_rcv+0x8a/0x6e0 net/ipv4/tcp_input.c:4914 tcp_data_queue+0x11fd/0x4bb0 net/ipv4/tcp_input.c:5025 tcp_rcv_established+0x81e/0x1ff0 net/ipv4/tcp_input.c:5947 tcp_v4_do_rcv+0x65e/0x980 net/ipv4/tcp_ipv4.c:1719 sk_backlog_rcv include/net/sock.h:1037 [inline] __release_sock+0x134/0x3b0 net/core/sock.c:2779 release_sock+0x54/0x1b0 net/core/sock.c:3311 sk_wait_data+0x177/0x450 net/core/sock.c:2821 tcp_recvmsg_locked+0xe28/0x1fd0 net/ipv4/tcp.c:2457 tcp_recvmsg+0x137/0x610 net/ipv4/tcp.c:2572 inet_recvmsg+0x11b/0x5e0 net/ipv4/af_inet.c:850 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_recvmsg net/socket.c:962 [inline] ____sys_recvmsg+0x2c4/0x600 net/socket.c:2632 ___sys_recvmsg+0x127/0x200 net/socket.c:2674 __sys_recvmsg+0xe2/0x1a0 net/socket.c:2704 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: c4777efa751d ("net: add and use skb_unclone_keeptruesize() helper") Fixes: 097b9146c0e2 ("net: fix up truesize of cloned skb in skb_prepare_for_shift()") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Marco Elver Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/linux/skbuff.h | 18 +++++++++--------- net/core/skbuff.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 615588971328..e213acaa91ec 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1684,19 +1684,19 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) return 0; } -/* This variant of skb_unclone() makes sure skb->truesize is not changed */ +/* This variant of skb_unclone() makes sure skb->truesize + * and skb_end_offset() are not changed, whenever a new skb->head is needed. + * + * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) + * when various debugging features are in place. + */ +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); - if (skb_cloned(skb)) { - unsigned int save = skb->truesize; - int res; - - res = pskb_expand_head(skb, 0, 0, pri); - skb->truesize = save; - return res; - } + if (skb_cloned(skb)) + return __skb_unclone_keeptruesize(skb, pri); return 0; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 105530fac890..5956c84cb274 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1789,6 +1789,38 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) } EXPORT_SYMBOL(skb_realloc_headroom); +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) +{ + unsigned int saved_end_offset, saved_truesize; + struct skb_shared_info *shinfo; + int res; + + saved_end_offset = skb_end_offset(skb); + saved_truesize = skb->truesize; + + res = pskb_expand_head(skb, 0, 0, pri); + if (res) + return res; + + skb->truesize = saved_truesize; + + if (likely(skb_end_offset(skb) == saved_end_offset)) + return 0; + + shinfo = skb_shinfo(skb); + + /* We are about to change back skb->end, + * we need to move skb_shinfo() to its new location. + */ + memmove(skb->head + saved_end_offset, + shinfo, + offsetof(struct skb_shared_info, frags[shinfo->nr_frags])); + + skb_set_end_offset(skb, saved_end_offset); + + return 0; +} + /** * skb_expand_head - reallocate header of &sk_buff * @skb: buffer to reallocate -- cgit v1.2.3 From 398ac11f4425d1e52aaf0d05d4fc90524e1a5b5e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 14 Mar 2022 11:20:41 -0700 Subject: bpf: Adjust BPF stack helper functions to accommodate skip > 0 commit ee2a098851bfbe8bcdd964c0121f4246f00ff41e upstream. Let's say that the caller has storage for num_elem stack frames. Then, the BPF stack helper functions walk the stack for only num_elem frames. This means that if skip > 0, one keeps only 'num_elem - skip' frames. This is because it sets init_nr in the perf_callchain_entry to the end of the buffer to save num_elem entries only. I believe it was because the perf callchain code unwound the stack frames until it reached the global max size (sysctl_perf_event_max_stack). However it now has perf_callchain_entry_ctx.max_stack to limit the iteration locally. This simplifies the code to handle init_nr in the BPF callstack entries and removes the confusion with the perf_event's __PERF_SAMPLE_CALLCHAIN_EARLY which sets init_nr to 0. Also change the comment on bpf_get_stack() in the header file to be more explicit what the return value means. Fixes: c195651e565a ("bpf: add bpf_get_stack helper") Signed-off-by: Namhyung Kim Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/30a7b5d5-6726-1cc2-eaee-8da2828a9a9c@oracle.com Link: https://lore.kernel.org/bpf/20220314182042.71025-1-namhyung@kernel.org Signed-off-by: Greg Kroah-Hartman Based-on-patch-by: Eugene Loh --- include/uapi/linux/bpf.h | 8 +++---- kernel/bpf/stackmap.c | 56 +++++++++++++++++++++--------------------------- 2 files changed, 28 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 791f31dd0abe..7b107ab8af3a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2965,8 +2965,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. * * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -4269,8 +4269,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. * * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 0dcaed4d3f4c..fc0f77f91224 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -219,7 +219,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } static struct perf_callchain_entry * -get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; @@ -230,9 +230,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) if (!entry) return NULL; - entry->nr = init_nr + - stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), - sysctl_perf_event_max_stack - init_nr, 0); + entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, + max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. For 32-bit systems, it is @@ -244,7 +243,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) int i; /* copy data from the end to avoid using extra buffer */ - for (i = entry->nr - 1; i >= (int)init_nr; i--) + for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } @@ -261,27 +260,19 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; - /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth - */ - trace_nr = trace->nr - init_nr; - - if (trace_nr <= skip) + if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); - ips = trace->ip + skip + init_nr; + ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); @@ -338,8 +329,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; @@ -348,8 +338,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + max_depth += skip; + if (max_depth > sysctl_perf_event_max_stack) + max_depth = sysctl_perf_event_max_stack; + + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -440,7 +434,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { - u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; @@ -465,30 +459,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto err_fault; num_elem = size / elem_size; - if (sysctl_perf_event_max_stack < num_elem) - init_nr = 0; - else - init_nr = sysctl_perf_event_max_stack - num_elem; + max_depth = num_elem + skip; + if (sysctl_perf_event_max_stack < max_depth) + max_depth = sysctl_perf_event_max_stack; if (trace_in) trace = trace_in; else if (kernel && task) - trace = get_callchain_entry_for_task(task, init_nr); + trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) goto err_fault; - trace_nr = trace->nr - init_nr; - if (trace_nr < skip) + if (trace->nr < skip) goto err_fault; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; - ips = trace->ip + skip + init_nr; + + ips = trace->ip + skip; if (user && user_build_id) stack_map_get_build_id_offset(buf, ips, trace_nr, user); else -- cgit v1.2.3 From 50c906a6977f1a2261be37cbf49762000acbf508 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Thu, 10 Mar 2022 23:53:35 +0800 Subject: bpf: Fix comment for helper bpf_current_task_under_cgroup() commit 58617014405ad5c9f94f464444f4972dabb71ca7 upstream. Fix the descriptions of the return values of helper bpf_current_task_under_cgroup(). Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Hengqi Chen Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220310155335.1278783-1-hengqi.chen@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/bpf.h | 4 ++-- tools/include/uapi/linux/bpf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7b107ab8af3a..e3fb5e520511 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2276,8 +2276,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if current task belongs to the cgroup2. - * * 1, if current task does not belong to the cgroup2. + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 791f31dd0abe..e2c8f946c541 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2276,8 +2276,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if current task belongs to the cgroup2. - * * 1, if current task does not belong to the cgroup2. + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) -- cgit v1.2.3 From 3e44e136560cb66dd5156889f811b870789b9499 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 29 Sep 2021 11:32:59 +0900 Subject: swiotlb: Support aligned swiotlb buffers commit e81e99bacc9f9347bda7808a949c1ce9fcc2bbf4 upstream. Add an argument to swiotlb_tbl_map_single that specifies the desired alignment of the allocated buffer. This is used by dma-iommu to ensure the buffer is aligned to the iova granule size when using swiotlb with untrusted sub-granule mappings. This addresses an issue where adjacent slots could be exposed to the untrusted device if IO_TLB_SIZE < iova granule < PAGE_SIZE. Signed-off-by: David Stevens Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210929023300.335969-7-stevensd@google.com Signed-off-by: Joerg Roedel Cc: Mario Limonciello Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/dma-iommu.c | 4 ++-- drivers/xen/swiotlb-xen.c | 2 +- include/linux/swiotlb.h | 3 ++- kernel/dma/swiotlb.c | 13 ++++++++----- 4 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 3857aa46dff4..26f754b87469 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -818,8 +818,8 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, size_t padding_size; aligned_size = iova_align(iovad, size); - phys = swiotlb_tbl_map_single(dev, phys, size, - aligned_size, dir, attrs); + phys = swiotlb_tbl_map_single(dev, phys, size, aligned_size, + iova_mask(iovad), dir, attrs); if (phys == DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index e56a5faac395..cbdff8979980 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -380,7 +380,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - map = swiotlb_tbl_map_single(dev, phys, size, size, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, size, 0, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b0cb2a9973f4..569272871375 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,7 +45,8 @@ extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs); + unsigned int alloc_aligned_mask, enum dma_data_direction dir, + unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 912397dbc2a8..2ee5419649ed 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -459,7 +459,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) * allocate a buffer from that IO TLB pool. */ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, - size_t alloc_size) + size_t alloc_size, unsigned int alloc_align_mask) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); @@ -483,6 +483,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; if (alloc_size >= PAGE_SIZE) stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); + stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1); spin_lock_irqsave(&mem->lock, flags); if (unlikely(nslots > mem->nslabs - mem->used)) @@ -541,7 +542,8 @@ found: phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs) + unsigned int alloc_align_mask, enum dma_data_direction dir, + unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); @@ -561,7 +563,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return (phys_addr_t)DMA_MAPPING_ERROR; } - index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset); + index = swiotlb_find_slots(dev, orig_addr, + alloc_size + offset, alloc_align_mask); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, @@ -680,7 +683,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, swiotlb_force); - swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; @@ -764,7 +767,7 @@ struct page *swiotlb_alloc(struct device *dev, size_t size) if (!mem) return NULL; - index = swiotlb_find_slots(dev, 0, size); + index = swiotlb_find_slots(dev, 0, size, 0); if (index == -1) return NULL; -- cgit v1.2.3 From f6ca862806df3762170cd3251852330304e781c9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 8 Mar 2022 12:55:29 -0600 Subject: coredump: Snapshot the vmas in do_coredump commit 95c5436a4883841588dae86fb0b9325f47ba5ad3 upstream. Move the call of dump_vma_snapshot and kvfree(vma_meta) out of the individual coredump routines into do_coredump itself. This makes the code less error prone and easier to maintain. Make the vma snapshot available to the coredump routines in struct coredump_params. This makes it easier to change and update what is captures in the vma snapshot and will be needed for fixing fill_file_notes. Reviewed-by: Jann Horn Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_elf.c | 20 +++++++------------- fs/binfmt_elf_fdpic.c | 18 ++++++------------ fs/coredump.c | 41 +++++++++++++++++++++++------------------ include/linux/binfmts.h | 3 +++ include/linux/coredump.h | 3 --- 5 files changed, 39 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3f6a7cac68fd..bd6189f55ddb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2168,8 +2168,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs, i; - size_t vma_data_size; + int segs, i; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2177,16 +2176,12 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - struct core_vma_metadata *vma_meta; - - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - return 0; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2225,7 +2220,7 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2245,8 +2240,8 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; @@ -2283,8 +2278,8 @@ static int elf_core_dump(struct coredump_params *cprm) /* Align to page */ dump_skip_to(cprm, dataoff); - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; @@ -2301,7 +2296,6 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_meta); kfree(phdr4note); return has_dumped; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 6d8fd6030cbb..830a6a876ffe 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm, static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs; + int segs; int i; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; @@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - struct core_vma_metadata *vma_meta = NULL; - size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1491,9 +1489,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - goto end_coredump; - for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; size_t sz; @@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) dump_skip_to(cprm, dataoff); - if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) + if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1652,7 +1647,6 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } - kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/coredump.c b/fs/coredump.c index bb66a4e3819f..64e2ce897420 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -53,6 +53,8 @@ #include +static bool dump_vma_snapshot(struct coredump_params *cprm); + int core_uses_pid; unsigned int core_pipe_limit; char core_pattern[CORENAME_MAX_SIZE] = "core"; @@ -601,6 +603,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) * by any locks. */ .mm_flags = mm->flags, + .vma_meta = NULL, }; audit_core_dumps(siginfo->si_signo); @@ -815,6 +818,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) pr_info("Core dump to |%s disabled\n", cn.corename); goto close_fail; } + if (!dump_vma_snapshot(&cprm)) + goto close_fail; + file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); /* @@ -828,6 +834,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) dump_emit(&cprm, "", 1); } file_end_write(cprm.file); + kvfree(cprm.vma_meta); } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -1108,14 +1115,11 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, * Under the mmap_lock, take a snapshot of relevant information about the task's * VMAs. */ -int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, - struct core_vma_metadata **vma_meta, - size_t *vma_data_size_ptr) +static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *vma, *gate_vma; struct mm_struct *mm = current->mm; int i; - size_t vma_data_size = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1123,20 +1127,21 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, * mmap_lock in read mode. */ if (mmap_write_lock_killable(mm)) - return -EINTR; + return false; + cprm->vma_data_size = 0; gate_vma = get_gate_vma(mm); - *vma_count = mm->map_count + (gate_vma ? 1 : 0); + cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0); - *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); - if (!*vma_meta) { + cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL); + if (!cprm->vma_meta) { mmap_write_unlock(mm); - return -ENOMEM; + return false; } for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma), i++) { - struct core_vma_metadata *m = (*vma_meta) + i; + struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; m->end = vma->vm_end; @@ -1146,13 +1151,14 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, mmap_write_unlock(mm); - if (WARN_ON(i != *vma_count)) { - kvfree(*vma_meta); - return -EFAULT; + if (WARN_ON(i != cprm->vma_count)) { + kvfree(cprm->vma_meta); + return false; } - for (i = 0; i < *vma_count; i++) { - struct core_vma_metadata *m = (*vma_meta) + i; + + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = cprm->vma_meta + i; if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { char elfmag[SELFMAG]; @@ -1165,9 +1171,8 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, } } - vma_data_size += m->dump_size; + cprm->vma_data_size += m->dump_size; } - *vma_data_size_ptr = vma_data_size; - return 0; + return true; } diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 049cf9421d83..f821b7243361 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -87,6 +87,9 @@ struct coredump_params { loff_t written; loff_t pos; loff_t to_skip; + int vma_count; + size_t vma_data_size; + struct core_vma_metadata *vma_meta; }; /* diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 78fcd776b185..cc1aee2cb46a 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -29,9 +29,6 @@ extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); -int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, - struct core_vma_metadata **vma_meta, - size_t *vma_data_size_ptr); extern void do_coredump(const kernel_siginfo_t *siginfo); #else static inline void do_coredump(const kernel_siginfo_t *siginfo) {} -- cgit v1.2.3 From 39fd0cc079c98dafcf355997ada7b5e67f0bb10a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 8 Mar 2022 13:04:19 -0600 Subject: coredump: Use the vma snapshot in fill_files_note commit 390031c942116d4733310f0684beb8db19885fe6 upstream. Matthew Wilcox reported that there is a missing mmap_lock in file_files_note that could possibly lead to a user after free. Solve this by using the existing vma snapshot for consistency and to avoid the need to take the mmap_lock anywhere in the coredump code except for dump_vma_snapshot. Update the dump_vma_snapshot to capture vm_pgoff and vm_file that are neeeded by fill_files_note. Add free_vma_snapshot to free the captured values of vm_file. Reported-by: Matthew Wilcox Link: https://lkml.kernel.org/r/20220131153740.2396974-1-willy@infradead.org Cc: stable@vger.kernel.org Fixes: a07279c9a8cd ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot") Fixes: 2aa362c49c31 ("coredump: extend core dump note section to contain file names of mapped files") Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/binfmt_elf.c | 24 ++++++++++++------------ fs/coredump.c | 22 +++++++++++++++++++++- include/linux/coredump.h | 2 ++ 3 files changed, 35 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1ae3fe3eddf1..c93150f36a52 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1618,17 +1618,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, * long file_ofs * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... */ -static int fill_files_note(struct memelfnote *note) +static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; user_long_t *start_end_ofs; char *name_base, *name_curpos; + int i; /* *Estimated* file count and total data size needed */ - count = mm->map_count; + count = cprm->vma_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1650,11 +1649,12 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = &cprm->vma_meta[i]; struct file *file; const char *filename; - file = vma->vm_file; + file = m->file; if (!file) continue; filename = file_path(file, name_curpos, remaining); @@ -1674,9 +1674,9 @@ static int fill_files_note(struct memelfnote *note) memmove(name_curpos, filename, n); name_curpos += n; - *start_end_ofs++ = vma->vm_start; - *start_end_ofs++ = vma->vm_end; - *start_end_ofs++ = vma->vm_pgoff; + *start_end_ofs++ = m->start; + *start_end_ofs++ = m->end; + *start_end_ofs++ = m->pgoff; count++; } @@ -1687,7 +1687,7 @@ static int fill_files_note(struct memelfnote *note) * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = mm->map_count - count; + n = cprm->vma_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -1886,7 +1886,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_auxv_note(&info->auxv, current->mm); info->size += notesize(&info->auxv); - if (fill_files_note(&info->files) == 0) + if (fill_files_note(&info->files, cprm) == 0) info->size += notesize(&info->files); return 1; @@ -2075,7 +2075,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_auxv_note(info->notes + 3, current->mm); info->numnote = 4; - if (fill_files_note(info->notes + info->numnote) == 0) { + if (fill_files_note(info->notes + info->numnote, cprm) == 0) { info->notes_files = info->notes + info->numnote; info->numnote++; } diff --git a/fs/coredump.c b/fs/coredump.c index e95fea3beaeb..26eb5a095832 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -54,6 +54,7 @@ #include static bool dump_vma_snapshot(struct coredump_params *cprm); +static void free_vma_snapshot(struct coredump_params *cprm); int core_uses_pid; unsigned int core_pipe_limit; @@ -834,7 +835,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) dump_emit(&cprm, "", 1); } file_end_write(cprm.file); - kvfree(cprm.vma_meta); + free_vma_snapshot(&cprm); } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -1111,6 +1112,20 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, return gate_vma; } +static void free_vma_snapshot(struct coredump_params *cprm) +{ + if (cprm->vma_meta) { + int i; + for (i = 0; i < cprm->vma_count; i++) { + struct file *file = cprm->vma_meta[i].file; + if (file) + fput(file); + } + kvfree(cprm->vma_meta); + cprm->vma_meta = NULL; + } +} + /* * Under the mmap_lock, take a snapshot of relevant information about the task's * VMAs. @@ -1147,6 +1162,11 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); + m->pgoff = vma->vm_pgoff; + + m->file = vma->vm_file; + if (m->file) + get_file(m->file); } mmap_write_unlock(mm); diff --git a/include/linux/coredump.h b/include/linux/coredump.h index cc1aee2cb46a..4b95e46d215f 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -12,6 +12,8 @@ struct core_vma_metadata { unsigned long start, end; unsigned long flags; unsigned long dump_size; + unsigned long pgoff; + struct file *file; }; extern int core_uses_pid; -- cgit v1.2.3 From 1c6ffdf4cc45525e5736a1cb3e46cd232c121837 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 30 Jan 2022 12:55:17 +0100 Subject: bpf: Make dst_port field in struct bpf_sock 16-bit wide [ Upstream commit 4421a582718ab81608d8486734c18083b822390d ] Menglong Dong reports that the documentation for the dst_port field in struct bpf_sock is inaccurate and confusing. From the BPF program PoV, the field is a zero-padded 16-bit integer in network byte order. The value appears to the BPF user as if laid out in memory as so: offsetof(struct bpf_sock, dst_port) + 0 + 8 +16 0x00 +24 0x00 32-, 16-, and 8-bit wide loads from the field are all allowed, but only if the offset into the field is 0. 32-bit wide loads from dst_port are especially confusing. The loaded value, after converting to host byte order with bpf_ntohl(dst_port), contains the port number in the upper 16-bits. Remove the confusion by splitting the field into two 16-bit fields. For backward compatibility, allow 32-bit wide loads from offsetof(struct bpf_sock, dst_port). While at it, allow loads 8-bit loads at offset [0] and [1] from dst_port. Reported-by: Menglong Dong Signed-off-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220130115518.213259-2-jakub@cloudflare.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- include/uapi/linux/bpf.h | 3 ++- net/core/filter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e3fb5e520511..2136e45656ab 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5347,7 +5347,8 @@ struct bpf_sock { __u32 src_ip4; __u32 src_ip6[4]; __u32 src_port; /* host byte order */ - __u32 dst_port; /* network byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; diff --git a/net/core/filter.c b/net/core/filter.c index 76e406965b6f..a65de7ac60aa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7966,6 +7966,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); + int field_size; if (off < 0 || off >= sizeof(struct bpf_sock)) return false; @@ -7977,7 +7978,6 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, family): case offsetof(struct bpf_sock, type): case offsetof(struct bpf_sock, protocol): - case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): @@ -7986,6 +7986,14 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case bpf_ctx_range(struct bpf_sock, dst_port): + field_size = size == size_default ? + size_default : sizeof_field(struct bpf_sock, dst_port); + bpf_ctx_record_field_size(info, field_size); + return bpf_ctx_narrow_access_ok(off, size, field_size); + case offsetofend(struct bpf_sock, dst_port) ... + offsetof(struct bpf_sock, dst_ip4) - 1: + return false; } return size == size_default; -- cgit v1.2.3 From 4790998fdd0d0c682512af9b4a875bc80da73836 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 12:15:45 -0800 Subject: ipv6: make mc_forwarding atomic [ Upstream commit 145c7a793838add5e004e7d49a67654dc7eba147 ] This fixes minor data-races in ip6_mc_input() and batadv_mcast_mla_rtr_flags_softif_get_ipv6() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/linux/ipv6.h | 2 +- net/batman-adv/multicast.c | 2 +- net/ipv6/addrconf.c | 4 ++-- net/ipv6/ip6_input.c | 2 +- net/ipv6/ip6mr.c | 8 ++++---- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 07cba0b3496d..d1f386430795 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -51,7 +51,7 @@ struct ipv6_devconf { __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE - __s32 mc_forwarding; + atomic_t mc_forwarding; #endif __s32 disable_ipv6; __s32 drop_unicast_in_l2_multicast; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 6e3419beca09..2853634a3979 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -134,7 +134,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) { struct inet6_dev *in6_dev = __in6_dev_get(dev); - if (in6_dev && in6_dev->cnf.mc_forwarding) + if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR6; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 1fe27807e471..3a8838b79bb6 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -552,7 +552,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, #ifdef CONFIG_IPV6_MROUTE if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, - devconf->mc_forwarding) < 0) + atomic_read(&devconf->mc_forwarding)) < 0) goto nla_put_failure; #endif if ((all || type == NETCONFA_PROXY_NEIGH) && @@ -5537,7 +5537,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE - array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding; + array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding); #endif array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6; array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad; diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 80256717868e..d4b1e2c5aa76 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -508,7 +508,7 @@ int ip6_mc_input(struct sk_buff *skb) /* * IPv6 multicast router mode is now supported ;) */ - if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding && + if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) && !(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 6a4065d81aa9..91f1c5f56d5f 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -739,7 +739,7 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, in6_dev = __in6_dev_get(dev); if (in6_dev) { - in6_dev->cnf.mc_forwarding--; + atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); @@ -907,7 +907,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt, in6_dev = __in6_dev_get(dev); if (in6_dev) { - in6_dev->cnf.mc_forwarding++; + atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); @@ -1557,7 +1557,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); - net->ipv6.devconf_all->mc_forwarding++; + atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } write_unlock_bh(&mrt_lock); @@ -1590,7 +1590,7 @@ int ip6mr_sk_done(struct sock *sk) * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ - net->ipv6.devconf_all->mc_forwarding--; + atomic_dec(&net->ipv6.devconf_all->mc_forwarding); write_unlock_bh(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, -- cgit v1.2.3 From 9c1ace066f22fe490d240dba8e13aeee9a88c589 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 5 Feb 2022 09:01:25 -0800 Subject: net: initialize init_net earlier [ Upstream commit 9c1be1935fb68b2413796cdc03d019b8cf35ab51 ] While testing a patch that will follow later ("net: add netns refcount tracker to struct nsproxy") I found that devtmpfs_init() was called before init_net was initialized. This is a bug, because devtmpfs_setup() calls ksys_unshare(CLONE_NEWNS); This has the effect of increasing init_net refcount, which will be later overwritten to 1, as part of setup_net(&init_net) We had too many prior patches [1] trying to work around the root cause. Really, make sure init_net is in BSS section, and that net_ns_init() is called earlier at boot time. Note that another patch ("vfs: add netns refcount tracker to struct fs_context") also will need net_ns_init() being called before vfs_caches_init() As a bonus, this patch saves around 4KB in .data section. [1] f8c46cb39079 ("netns: do not call pernet ops for not yet set up init_net namespace") b5082df8019a ("net: Initialise init_net.count to 1") 734b65417b24 ("net: Statically initialize init_net.dev_base_head") v2: fixed a build error reported by kernel build bots (CONFIG_NET=n) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/net_namespace.h | 6 ++++++ init/main.c | 2 ++ net/core/dev.c | 3 +-- net/core/net_namespace.c | 17 +++++------------ 4 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index bb5fa5914032..2ba326f9e004 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -479,4 +479,10 @@ static inline void fnhe_genid_bump(struct net *net) atomic_inc(&net->fnhe_genid); } +#ifdef CONFIG_NET +void net_ns_init(void); +#else +static inline void net_ns_init(void) {} +#endif + #endif /* __NET_NET_NAMESPACE_H */ diff --git a/init/main.c b/init/main.c index bcd132d4e7bd..b340d990d77c 100644 --- a/init/main.c +++ b/init/main.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include @@ -1122,6 +1123,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) key_init(); security_init(); dbg_late_init(); + net_ns_init(); vfs_caches_init(); pagecache_init(); signals_init(); diff --git a/net/core/dev.c b/net/core/dev.c index 33dc2a3ff7d7..804aba2228c2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11378,8 +11378,7 @@ static int __net_init netdev_init(struct net *net) BUILD_BUG_ON(GRO_HASH_BUCKETS > 8 * sizeof_field(struct napi_struct, gro_bitmask)); - if (net != &init_net) - INIT_LIST_HEAD(&net->dev_base_head); + INIT_LIST_HEAD(&net->dev_base_head); net->dev_name_head = netdev_create_hash(); if (net->dev_name_head == NULL) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9702d2b0d920..9745cb6fdf51 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -44,13 +44,7 @@ EXPORT_SYMBOL_GPL(net_rwsem); static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) }; #endif -struct net init_net = { - .ns.count = REFCOUNT_INIT(1), - .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), -#ifdef CONFIG_KEYS - .key_domain = &init_net_key_domain, -#endif -}; +struct net init_net; EXPORT_SYMBOL(init_net); static bool init_net_initialized; @@ -1081,7 +1075,7 @@ out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } -static int __init net_ns_init(void) +void __init net_ns_init(void) { struct net_generic *ng; @@ -1102,6 +1096,9 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); +#ifdef CONFIG_KEYS + init_net.key_domain = &init_net_key_domain; +#endif down_write(&pernet_ops_rwsem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); @@ -1116,12 +1113,8 @@ static int __init net_ns_init(void) RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, RTNL_FLAG_DOIT_UNLOCKED); - - return 0; } -pure_initcall(net_ns_init); - static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) { ops_pre_exit_list(ops, net_exit_list); -- cgit v1.2.3 From 049072749a5e1fe277ee64746bbe9fca5c55f3f9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 19 Feb 2022 17:45:19 +0200 Subject: ipv4: Invalidate neighbour for broadcast address upon address addition [ Upstream commit 0c51e12e218f20b7d976158fdc18019627326f7a ] In case user space sends a packet destined to a broadcast address when a matching broadcast route is not configured, the kernel will create a unicast neighbour entry that will never be resolved [1]. When the broadcast route is configured, the unicast neighbour entry will not be invalidated and continue to linger, resulting in packets being dropped. Solve this by invalidating unresolved neighbour entries for broadcast addresses after routes for these addresses are internally configured by the kernel. This allows the kernel to create a broadcast neighbour entry following the next route lookup. Another possible solution that is more generic but also more complex is to have the ARP code register a listener to the FIB notification chain and invalidate matching neighbour entries upon the addition of broadcast routes. It is also possible to wave off the issue as a user space problem, but it seems a bit excessive to expect user space to be that intimately familiar with the inner workings of the FIB/neighbour kernel code. [1] https://lore.kernel.org/netdev/55a04a8f-56f3-f73c-2aea-2195923f09d1@huawei.com/ Reported-by: Wang Hai Signed-off-by: Ido Schimmel Tested-by: Wang Hai Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/arp.h | 1 + net/ipv4/arp.c | 9 +++++++-- net/ipv4/fib_frontend.c | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/arp.h b/include/net/arp.h index 4950191f6b2b..4a23a97195f3 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -71,6 +71,7 @@ void arp_send(int type, int ptype, __be32 dest_ip, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); +int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 922dd73e5740..83a47998c4b1 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1116,13 +1116,18 @@ static int arp_req_get(struct arpreq *r, struct net_device *dev) return err; } -static int arp_invalidate(struct net_device *dev, __be32 ip) +int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { + if ((neigh->nud_state & NUD_VALID) && !force) { + neigh_release(neigh); + return 0; + } + if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| @@ -1169,7 +1174,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, if (!dev) return -EINVAL; } - return arp_invalidate(dev, ip); + return arp_invalidate(dev, ip, true); } /* diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4d61ddd8a0ec..1eb7795edb9d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1112,9 +1112,11 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) return; /* Add broadcast address, if it is explicitly assigned. */ - if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) + if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim, 0); + arp_invalidate(dev, ifa->ifa_broadcast, false); + } if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { @@ -1128,6 +1130,7 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 32, prim, 0); + arp_invalidate(dev, prefix | ~mask, false); } } } -- cgit v1.2.3 From 5e96bb81eddf0551a826043c97b0175978eee504 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 24 Jan 2022 16:11:37 -0700 Subject: vfio/pci: Stub vfio_pci_vga_rw when !CONFIG_VFIO_PCI_VGA [ Upstream commit 6e031ec0e5a2dda53e12e0d2a7e9b15b47a3c502 ] Resolve build errors reported against UML build for undefined ioport_map() and ioport_unmap() functions. Without this config option a device cannot have vfio_pci_core_device.has_vga set, so the existing function would always return -EINVAL anyway. Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220123125737.2658758-1-geert@linux-m68k.org Link: https://lore.kernel.org/r/164306582968.3758255.15192949639574660648.stgit@omen Signed-off-by: Alex Williamson Signed-off-by: Sasha Levin --- drivers/vfio/pci/vfio_pci_rdwr.c | 2 ++ include/linux/vfio_pci_core.h | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 57d3b2cbbd8e..82ac1569deb0 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -288,6 +288,7 @@ out: return done; } +#ifdef CONFIG_VFIO_PCI_VGA ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -355,6 +356,7 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, return done; } +#endif static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, bool test_mem) diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index ef9a44b6cf5d..ae6f4838ab75 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -159,8 +159,17 @@ extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +#ifdef CONFIG_VFIO_PCI_VGA extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +#else +static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite) +{ + return -EINVAL; +} +#endif extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, uint64_t data, int count, int fd); -- cgit v1.2.3 From 0000de40b9f35c300e8e2328f30113145ad04796 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 3 Mar 2022 13:11:57 -0800 Subject: Bluetooth: Fix not checking for valid hdev on bt_dev_{info,warn,err,dbg} [ Upstream commit 9b392e0e0b6d026da5a62bb79a08f32e27af858e ] This fixes attemting to print hdev->name directly which causes them to print an error: kernel: read_version:367: (efault): sock 000000006a3008f2 Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann Signed-off-by: Sasha Levin --- include/net/bluetooth/bluetooth.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 9125effbf448..3fecc4a411a1 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -180,19 +180,21 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif +#define bt_dev_name(hdev) ((hdev) ? (hdev)->name : "null") + #define bt_dev_info(hdev, fmt, ...) \ - BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_INFO("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_warn(hdev, fmt, ...) \ - BT_WARN("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_WARN("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_err(hdev, fmt, ...) \ - BT_ERR("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_ERR("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_dbg(hdev, fmt, ...) \ - BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_DBG("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_warn_ratelimited(hdev, fmt, ...) \ - bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_warn_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ - bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) /* Connection and socket states */ enum { -- cgit v1.2.3 From db9a140a858545f8cf9c40ecfe8e8e772b3d98b7 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 9 Mar 2022 13:04:13 +0100 Subject: can: isotp: set default value for N_As to 50 micro seconds [ Upstream commit 530e0d46c61314c59ecfdb8d3bcb87edbc0f85d3 ] The N_As value describes the time a CAN frame needs on the wire when transmitted by the CAN controller. Even very short CAN FD frames need arround 100 usecs (bitrate 1Mbit/s, data bitrate 8Mbit/s). Having N_As to be zero (the former default) leads to 'no CAN frame separation' when STmin is set to zero by the receiving node. This 'burst mode' should not be enabled by default as it could potentially dump a high number of CAN frames into the netdev queue from the soft hrtimer context. This does not affect the system stability but is just not nice and cooperative. With this N_As/frame_txtime value the 'burst mode' is disabled by default. As user space applications usually do not set the frame_txtime element of struct can_isotp_options the new in-kernel default is very likely overwritten with zero when the sockopt() CAN_ISOTP_OPTS is invoked. To make sure that a N_As value of zero is only set intentional the value '0' is now interpreted as 'do not change the current value'. When a frame_txtime of zero is required for testing purposes this CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime. Link: https://lore.kernel.org/all/20220309120416.83514-2-socketcan@hartkopp.net Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde Signed-off-by: Sasha Levin --- include/uapi/linux/can/isotp.h | 28 ++++++++++++++++++++++------ net/can/isotp.c | 12 +++++++++++- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h index c55935b64ccc..590f8aea2b6d 100644 --- a/include/uapi/linux/can/isotp.h +++ b/include/uapi/linux/can/isotp.h @@ -137,20 +137,16 @@ struct can_isotp_ll_options { #define CAN_ISOTP_WAIT_TX_DONE 0x400 /* wait for tx completion */ #define CAN_ISOTP_SF_BROADCAST 0x800 /* 1-to-N functional addressing */ -/* default values */ +/* protocol machine default values */ #define CAN_ISOTP_DEFAULT_FLAGS 0 #define CAN_ISOTP_DEFAULT_EXT_ADDRESS 0x00 #define CAN_ISOTP_DEFAULT_PAD_CONTENT 0xCC /* prevent bit-stuffing */ -#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 0 +#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 50000 /* 50 micro seconds */ #define CAN_ISOTP_DEFAULT_RECV_BS 0 #define CAN_ISOTP_DEFAULT_RECV_STMIN 0x00 #define CAN_ISOTP_DEFAULT_RECV_WFTMAX 0 -#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU -#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN -#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 - /* * Remark on CAN_ISOTP_DEFAULT_RECV_* values: * @@ -162,4 +158,24 @@ struct can_isotp_ll_options { * consistency and copied directly into the flow control (FC) frame. */ +/* link layer default values => make use of Classical CAN frames */ + +#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU +#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN +#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 + +/* + * The CAN_ISOTP_DEFAULT_FRAME_TXTIME has become a non-zero value as + * it only makes sense for isotp implementation tests to run without + * a N_As value. As user space applications usually do not set the + * frame_txtime element of struct can_isotp_options the new in-kernel + * default is very likely overwritten with zero when the sockopt() + * CAN_ISOTP_OPTS is invoked. + * To make sure that a N_As value of zero is only set intentional the + * value '0' is now interpreted as 'do not change the current value'. + * When a frame_txtime of zero is required for testing purposes this + * CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime. + */ +#define CAN_ISOTP_FRAME_TXTIME_ZERO 0xFFFFFFFF + #endif /* !_UAPI_CAN_ISOTP_H */ diff --git a/net/can/isotp.c b/net/can/isotp.c index a95d171b3a64..5bce7c66c121 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -141,6 +141,7 @@ struct isotp_sock { struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; + u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; struct tpcon rx, tx; @@ -360,7 +361,7 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) so->tx_gap = ktime_set(0, 0); /* add transmission time for CAN frame N_As */ - so->tx_gap = ktime_add_ns(so->tx_gap, so->opt.frame_txtime); + so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, @@ -1247,6 +1248,14 @@ static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, /* no separate rx_ext_address is given => use ext_address */ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; + + /* check for frame_txtime changes (0 => no changes) */ + if (so->opt.frame_txtime) { + if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) + so->frame_txtime = 0; + else + so->frame_txtime = so->opt.frame_txtime; + } break; case CAN_ISOTP_RECV_FC: @@ -1448,6 +1457,7 @@ static int isotp_init(struct sock *sk) so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; + so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; -- cgit v1.2.3 From 24d28d9b0fd5c544a56804cda4458cf6b53afb84 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 7 Mar 2022 10:41:44 +1100 Subject: NFS: swap IO handling is slightly different for O_DIRECT IO [ Upstream commit 64158668ac8b31626a8ce48db4cad08496eb8340 ] 1/ Taking the i_rwsem for swap IO triggers lockdep warnings regarding possible deadlocks with "fs_reclaim". These deadlocks could, I believe, eventuate if a buffered read on the swapfile was attempted. We don't need coherence with the page cache for a swap file, and buffered writes are forbidden anyway. There is no other need for i_rwsem during direct IO. So never take it for swap_rw() 2/ generic_write_checks() explicitly forbids writes to swap, and performs checks that are not needed for swap. So bypass it for swap_rw(). Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/direct.c | 42 ++++++++++++++++++++++++++++-------------- fs/nfs/file.c | 4 ++-- include/linux/nfs_fs.h | 8 ++++---- 3 files changed, 34 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 3c0335c15a73..28afc315ec0c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -172,8 +172,8 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter); - return nfs_file_direct_write(iocb, iter); + return nfs_file_direct_read(iocb, iter, true); + return nfs_file_direct_write(iocb, iter, true); } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -424,6 +424,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -439,7 +440,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -481,12 +483,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) if (iter_is_iovec(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - nfs_start_io_direct(inode); + if (!swap) + nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); - nfs_end_io_direct(inode); + if (!swap) + nfs_end_io_direct(inode); if (requested > 0) { result = nfs_direct_wait(dreq); @@ -875,6 +879,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data + * @swap: flag indicating this is swap IO, not O_DIRECT IO * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode @@ -891,7 +896,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. */ -ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, + bool swap) { ssize_t result, requested; size_t count; @@ -905,7 +911,11 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, iov_iter_count(iter), (long long) iocb->ki_pos); - result = generic_write_checks(iocb, iter); + if (swap) + /* bypass generic checks */ + result = iov_iter_count(iter); + else + result = generic_write_checks(iocb, iter); if (result <= 0) return result; count = result; @@ -936,16 +946,20 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) dreq->iocb = iocb; pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); - nfs_start_io_direct(inode); + if (swap) { + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + } else { + nfs_start_io_direct(inode); - requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + } - nfs_end_io_direct(inode); + nfs_end_io_direct(inode); + } if (requested > 0) { result = nfs_direct_wait(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index aa353fd58240..42a16993913a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -161,7 +161,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to); + return nfs_file_direct_read(iocb, to, false); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -616,7 +616,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_write(iocb, from); + return nfs_file_direct_write(iocb, from, false); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, iov_iter_count(from), (long long) iocb->ki_pos); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4a733f140939..41102e03512f 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -494,10 +494,10 @@ static inline const struct cred *nfs_file_cred(struct file *file) * linux/fs/nfs/direct.c */ extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *); -extern ssize_t nfs_file_direct_read(struct kiocb *iocb, - struct iov_iter *iter); -extern ssize_t nfs_file_direct_write(struct kiocb *iocb, - struct iov_iter *iter); +ssize_t nfs_file_direct_read(struct kiocb *iocb, + struct iov_iter *iter, bool swap); +ssize_t nfs_file_direct_write(struct kiocb *iocb, + struct iov_iter *iter, bool swap); /* * linux/fs/nfs/dir.c -- cgit v1.2.3 From da747de6859948e75bcaaa1019ef0f7fdaf0b867 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Mar 2022 12:34:19 -0400 Subject: NFS: nfsiod should not block forever in mempool_alloc() [ Upstream commit 515dcdcd48736576c6f5c197814da6f81c60a21e ] The concern is that since nfsiod is sometimes required to kick off a commit, it can get locked up waiting forever in mempool_alloc() instead of failing gracefully and leaving the commit until later. Try to allocate from the slab first, with GFP_KERNEL | __GFP_NORETRY, then fall back to a non-blocking attempt to allocate from the memory pool. Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/internal.h | 7 +++++++ fs/nfs/pnfs_nfs.c | 8 ++++++-- fs/nfs/write.c | 24 +++++++++--------------- include/linux/nfs_fs.h | 2 +- 4 files changed, 23 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 66fc936834f2..7239118d98a3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -580,6 +580,13 @@ nfs_write_match_verf(const struct nfs_writeverf *verf, !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } +static inline gfp_t nfs_io_gfp_mask(void) +{ + if (current->flags & PF_WQ_WORKER) + return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + return GFP_KERNEL; +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 316f68f96e57..657c242a18ff 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -419,7 +419,7 @@ static struct nfs_commit_data * pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo) { - struct nfs_commit_data *data = nfs_commitdata_alloc(false); + struct nfs_commit_data *data = nfs_commitdata_alloc(); if (!data) return NULL; @@ -515,7 +515,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(mds_pages, NULL, cinfo, -1); + return -ENOMEM; + } data->ds_commit_index = -1; list_splice_init(mds_pages, &data->pages); list_add_tail(&data->list, &list); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0691b0b02147..a296e504e4aa 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -70,27 +70,17 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) +struct nfs_commit_data *nfs_commitdata_alloc(void) { struct nfs_commit_data *p; - if (never_fail) - p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); - else { - /* It is OK to do some reclaim, not no safe to wait - * for anything to be returned to the pool. - * mempool_alloc() cannot handle that particular combination, - * so we need two separate attempts. - */ + p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask()); + if (!p) { p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); - if (!p) - p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | - __GFP_NOWARN | __GFP_NORETRY); if (!p) return NULL; + memset(p, 0, sizeof(*p)); } - - memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); return p; } @@ -1809,7 +1799,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(true); + data = nfs_commitdata_alloc(); + if (!data) { + nfs_retry_commit(head, NULL, cinfo, -1); + return -ENOMEM; + } /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 41102e03512f..5ffcde9ac413 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -567,7 +567,7 @@ extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page *page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); +extern struct nfs_commit_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_commit_data *data); bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); -- cgit v1.2.3 From 985d87e6a36551b292443c52a4ae9b82296d1f51 Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:26 +0100 Subject: rtc: mc146818-lib: fix RTC presence check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ea6fa4961aab8f90a8aa03575a98b4bda368d4b6 ] To prevent an infinite loop in mc146818_get_time(), commit 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs") added a check for RTC availability. Together with a later fix, it checked if bit 6 in register 0x0d is cleared. This, however, caused a false negative on a motherboard with an AMD SB710 southbridge; according to the specification [1], bit 6 of register 0x0d of this chipset is a scratchbit. This caused a regression in Linux 5.11 - the RTC was determined broken by the kernel and not used by rtc-cmos.c [3]. This problem was also reported in Fedora [4]. As a better alternative, check whether the UIP ("Update-in-progress") bit is set for longer then 10ms. If that is the case, then apparently the RTC is either absent (and all register reads return 0xff) or broken. Also limit the number of loop iterations in mc146818_get_time() to 10 to prevent an infinite loop there. The functions mc146818_get_time() and mc146818_does_rtc_work() will be refactored later in this patch series, in order to fix a separate problem with reading / setting the RTC alarm time. This is done so to avoid a confusion about what is being fixed when. In a previous approach to this problem, I implemented a check whether the RTC_HOURS register contains a value <= 24. This, however, sometimes did not work correctly on my Intel Kaby Lake laptop. According to Intel's documentation [2], "the time and date RAM locations (0-9) are disconnected from the external bus" during the update cycle so reading this register without checking the UIP bit is incorrect. [1] AMD SB700/710/750 Register Reference Guide, page 308, https://developer.amd.com/wordpress/media/2012/10/43009_sb7xx_rrg_pub_1.00.pdf [2] 7th Generation Intel ® Processor Family I/O for U/Y Platforms [...] Datasheet Volume 1 of 2, page 209 Intel's Document Number: 334658-006, https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/7th-and-8th-gen-core-family-mobile-u-y-processor-lines-i-o-datasheet-vol-1.pdf [3] Functions in arch/x86/kernel/rtc.c apparently were using it. [4] https://bugzilla.redhat.com/show_bug.cgi?id=1936688 Fixes: 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs") Fixes: ebb22a059436 ("rtc: mc146818: Dont test for bit 0-5 in Register D") Signed-off-by: Mateusz Jończyk Cc: Thomas Gleixner Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-5-mat.jonczyk@o2.pl Signed-off-by: Sasha Levin --- drivers/rtc/rtc-cmos.c | 10 ++++------ drivers/rtc/rtc-mc146818-lib.c | 34 ++++++++++++++++++++++++++++++---- include/linux/mc146818rtc.h | 1 + 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index d0f58cca5c20..b90a603d6b12 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -800,16 +800,14 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) rename_region(ports, dev_name(&cmos_rtc.rtc->dev)); - spin_lock_irq(&rtc_lock); - - /* Ensure that the RTC is accessible. Bit 6 must be 0! */ - if ((CMOS_READ(RTC_VALID) & 0x40) != 0) { - spin_unlock_irq(&rtc_lock); - dev_warn(dev, "not accessible\n"); + if (!mc146818_does_rtc_work()) { + dev_warn(dev, "broken or not accessible\n"); retval = -ENXIO; goto cleanup1; } + spin_lock_irq(&rtc_lock); + if (!(flags & CMOS_RTC_FLAGS_NOFREQ)) { /* force periodic irq to CMOS reset default of 1024Hz; * diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index 97e3cebb4da9..70137566981e 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -8,10 +8,36 @@ #include #endif +/* + * If the UIP (Update-in-progress) bit of the RTC is set for more then + * 10ms, the RTC is apparently broken or not present. + */ +bool mc146818_does_rtc_work(void) +{ + int i; + unsigned char val; + unsigned long flags; + + for (i = 0; i < 10; i++) { + spin_lock_irqsave(&rtc_lock, flags); + val = CMOS_READ(RTC_FREQ_SELECT); + spin_unlock_irqrestore(&rtc_lock, flags); + + if ((val & RTC_UIP) == 0) + return true; + + mdelay(1); + } + + return false; +} +EXPORT_SYMBOL_GPL(mc146818_does_rtc_work); + unsigned int mc146818_get_time(struct rtc_time *time) { unsigned char ctrl; unsigned long flags; + unsigned int iter_count = 0; unsigned char century = 0; bool retry; @@ -20,13 +46,13 @@ unsigned int mc146818_get_time(struct rtc_time *time) #endif again: - spin_lock_irqsave(&rtc_lock, flags); - /* Ensure that the RTC is accessible. Bit 6 must be 0! */ - if (WARN_ON_ONCE((CMOS_READ(RTC_VALID) & 0x40) != 0)) { - spin_unlock_irqrestore(&rtc_lock, flags); + if (iter_count > 10) { memset(time, 0, sizeof(*time)); return -EIO; } + iter_count++; + + spin_lock_irqsave(&rtc_lock, flags); /* * Check whether there is an update in progress during which the diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 0661af17a758..69c80c4325bf 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -123,6 +123,7 @@ struct cmos_rtc_board_info { #define RTC_IO_EXTENT_USED RTC_IO_EXTENT #endif /* ARCH_RTC_LOCATION */ +bool mc146818_does_rtc_work(void); unsigned int mc146818_get_time(struct rtc_time *time); int mc146818_set_time(struct rtc_time *time); -- cgit v1.2.3 From 0912cf021fb5749372b3782611d8b1de4986c13a Mon Sep 17 00:00:00 2001 From: Shreeya Patel Date: Mon, 21 Mar 2022 19:02:41 +0530 Subject: gpio: Restrict usage of GPIO chip irq members before initialization commit 5467801f1fcbdc46bc7298a84dbf3ca1ff2a7320 upstream. GPIO chip irq members are exposed before they could be completely initialized and this leads to race conditions. One such issue was observed for the gc->irq.domain variable which was accessed through the I2C interface in gpiochip_to_irq() before it could be initialized by gpiochip_add_irqchip(). This resulted in Kernel NULL pointer dereference. Following are the logs for reference :- kernel: Call Trace: kernel: gpiod_to_irq+0x53/0x70 kernel: acpi_dev_gpio_irq_get_by+0x113/0x1f0 kernel: i2c_acpi_get_irq+0xc0/0xd0 kernel: i2c_device_probe+0x28a/0x2a0 kernel: really_probe+0xf2/0x460 kernel: RIP: 0010:gpiochip_to_irq+0x47/0xc0 To avoid such scenarios, restrict usage of GPIO chip irq members before they are completely initialized. Signed-off-by: Shreeya Patel Cc: stable@vger.kernel.org Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpiolib.c | 19 +++++++++++++++++++ include/linux/gpio/driver.h | 9 +++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 358f0ad9d0f8..91628edad2c6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1368,6 +1368,16 @@ static int gpiochip_to_irq(struct gpio_chip *gc, unsigned int offset) { struct irq_domain *domain = gc->irq.domain; +#ifdef CONFIG_GPIOLIB_IRQCHIP + /* + * Avoid race condition with other code, which tries to lookup + * an IRQ before the irqchip has been properly registered, + * i.e. while gpiochip is still being brought up. + */ + if (!gc->irq.initialized) + return -EPROBE_DEFER; +#endif + if (!gpiochip_irqchip_irq_valid(gc, offset)) return -ENXIO; @@ -1552,6 +1562,15 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, acpi_gpiochip_request_interrupts(gc); + /* + * Using barrier() here to prevent compiler from reordering + * gc->irq.initialized before initialization of above + * GPIO chip irq members. + */ + barrier(); + + gc->irq.initialized = true; + return 0; } diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index a0f9901dcae6..e3c29d2e6826 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -224,6 +224,15 @@ struct gpio_irq_chip { unsigned long *valid_mask, unsigned int ngpios); + /** + * @initialized: + * + * Flag to track GPIO chip irq member's initialization. + * This flag will make sure GPIO chip irq members are not used + * before they are initialized. + */ + bool initialized; + /** * @valid_mask: * -- cgit v1.2.3 From 0c64645e6373761f1f8f292dc122742827f1240e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 9 Feb 2022 19:43:32 +0100 Subject: bpf: Make remote_port field in struct bpf_sk_lookup 16-bit wide commit 9a69e2b385f443f244a7e8b8bcafe5ccfb0866b4 upstream. remote_port is another case of a BPF context field documented as a 32-bit value in network byte order for which the BPF context access converter generates a load of a zero-padded 16-bit integer in network byte order. First such case was dst_port in bpf_sock which got addressed in commit 4421a582718a ("bpf: Make dst_port field in struct bpf_sock 16-bit wide"). Loading 4-bytes from the remote_port offset and converting the value with bpf_ntohl() leads to surprising results, as the expected value is shifted by 16 bits. Reduce the confusion by splitting the field in two - a 16-bit field holding a big-endian integer, and a 16-bit zero-padding anonymous field that follows it. Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220209184333.654927-2-jakub@cloudflare.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/bpf.h | 3 ++- net/bpf/test_run.c | 4 ++-- net/core/filter.c | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2136e45656ab..a887e582f0e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6223,7 +6223,8 @@ struct bpf_sk_lookup { __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ __u32 remote_ip4; /* Network byte order */ __u32 remote_ip6[4]; /* Network byte order */ - __u32 remote_port; /* Network byte order */ + __be16 remote_port; /* Network byte order */ + __u16 :16; /* Zero padding */ __u32 local_ip4; /* Network byte order */ __u32 local_ip6[4]; /* Network byte order */ __u32 local_port; /* Host byte order */ diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index b5f4ef35357c..655ee0e2de86 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -954,7 +954,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) goto out; - if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + if (user_ctx->local_port > U16_MAX) { ret = -ERANGE; goto out; } @@ -962,7 +962,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat ctx.family = (u16)user_ctx->family; ctx.protocol = (u16)user_ctx->protocol; ctx.dport = (u16)user_ctx->local_port; - ctx.sport = (__force __be16)user_ctx->remote_port; + ctx.sport = user_ctx->remote_port; switch (ctx.family) { case AF_INET: diff --git a/net/core/filter.c b/net/core/filter.c index fbde862e3e82..cdd7e92db303 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10540,7 +10540,8 @@ static bool sk_lookup_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sk_lookup, local_ip4): case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]): - case bpf_ctx_range(struct bpf_sk_lookup, remote_port): + case offsetof(struct bpf_sk_lookup, remote_port) ... + offsetof(struct bpf_sk_lookup, local_ip4) - 1: case bpf_ctx_range(struct bpf_sk_lookup, local_port): bpf_ctx_record_field_size(info, sizeof(__u32)); return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32)); -- cgit v1.2.3 From 22174e8ce8ac32a841e5c63a8b75fdae85ffef8a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jan 2022 10:19:22 +0300 Subject: rtc: mc146818-lib: fix signedness bug in mc146818_get_time() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7372971c1be5b7d4fdd8ad237798bdc1d1d54162 upstream. The mc146818_get_time() function returns zero on success or negative a error code on failure. It needs to be type int. Fixes: d35786b3a28d ("rtc: mc146818-lib: change return values of mc146818_get_time()") Signed-off-by: Dan Carpenter Reviewed-by: Mateusz Jończyk Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220111071922.GE11243@kili Signed-off-by: Greg Kroah-Hartman --- drivers/rtc/rtc-mc146818-lib.c | 2 +- include/linux/mc146818rtc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index 70137566981e..8abac672f3cb 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -33,7 +33,7 @@ bool mc146818_does_rtc_work(void) } EXPORT_SYMBOL_GPL(mc146818_does_rtc_work); -unsigned int mc146818_get_time(struct rtc_time *time) +int mc146818_get_time(struct rtc_time *time) { unsigned char ctrl; unsigned long flags; diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 69c80c4325bf..37c74e25f53d 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -124,7 +124,7 @@ struct cmos_rtc_board_info { #endif /* ARCH_RTC_LOCATION */ bool mc146818_does_rtc_work(void); -unsigned int mc146818_get_time(struct rtc_time *time); +int mc146818_get_time(struct rtc_time *time); int mc146818_set_time(struct rtc_time *time); #endif /* _MC146818RTC_H */ -- cgit v1.2.3 From 5d69622eb0361e5ad266d49f6bf312e0bdb3c654 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Mar 2022 19:10:43 -0400 Subject: SUNRPC: Don't call connect() more than once on a TCP socket commit 89f42494f92f448747bd8a7ab1ae8b5d5520577d upstream. Avoid socket state races due to repeated calls to ->connect() using the same socket. If connect() returns 0 due to the connection having completed, but we are in fact in a closing state, then we may leave the XPRT_CONNECTING flag set on the transport. Reported-by: Enrico Scholz Fixes: 3be232f11a3c ("SUNRPC: Prevent immediate close+reconnect") Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- include/linux/sunrpc/xprtsock.h | 1 + net/sunrpc/xprtsock.c | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 8c2a712cb242..689062afdd61 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -89,5 +89,6 @@ struct sock_xprt { #define XPRT_SOCK_WAKE_WRITE (5) #define XPRT_SOCK_WAKE_PENDING (6) #define XPRT_SOCK_WAKE_DISCONNECT (7) +#define XPRT_SOCK_CONNECT_SENT (8) #endif /* _LINUX_SUNRPC_XPRTSOCK_H */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8bc58302e550..c2f7819827b6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2257,6 +2257,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) fallthrough; case -EINPROGRESS: /* SYN_SENT! */ + set_bit(XPRT_SOCK_CONNECT_SENT, &transport->sock_state); if (xprt->reestablish_timeout < XS_TCP_INIT_REEST_TO) xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; break; @@ -2282,10 +2283,14 @@ static void xs_tcp_setup_socket(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; int status = -EIO; - if (!sock) { - sock = xs_create_sock(xprt, transport, - xs_addr(xprt)->sa_family, SOCK_STREAM, - IPPROTO_TCP, true); + if (xprt_connected(xprt)) + goto out; + if (test_and_clear_bit(XPRT_SOCK_CONNECT_SENT, + &transport->sock_state) || + !sock) { + xs_reset_transport(transport); + sock = xs_create_sock(xprt, transport, xs_addr(xprt)->sa_family, + SOCK_STREAM, IPPROTO_TCP, true); if (IS_ERR(sock)) { status = PTR_ERR(sock); goto out; @@ -2365,13 +2370,9 @@ static void xs_connect(struct rpc_xprt *xprt, struct rpc_task *task) WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport)); - if (transport->sock != NULL && !xprt_connecting(xprt)) { + if (transport->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p for %lu " - "seconds\n", - xprt, xprt->reestablish_timeout / HZ); - - /* Start by resetting any existing state */ - xs_reset_transport(transport); + "seconds\n", xprt, xprt->reestablish_timeout / HZ); delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, XS_TCP_INIT_REEST_TO); -- cgit v1.2.3 From 429f413ed83f6b9f81e8c4362bcc19e817a4d208 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 8 Apr 2022 13:09:01 -0700 Subject: mm/sparsemem: fix 'mem_section' will never be NULL gcc 12 warning commit a431dbbc540532b7465eae4fc8b56a85a9fc7d17 upstream. The gcc 12 compiler reports a "'mem_section' will never be NULL" warning on the following code: static inline struct mem_section *__nr_to_section(unsigned long nr) { #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section) return NULL; #endif if (!mem_section[SECTION_NR_TO_ROOT(nr)]) return NULL; : It happens with CONFIG_SPARSEMEM_EXTREME off. The mem_section definition is #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif In the !CONFIG_SPARSEMEM_EXTREME case, mem_section is a static 2-dimensional array and so the check "!mem_section[SECTION_NR_TO_ROOT(nr)]" doesn't make sense. Fix this warning by moving the "!mem_section[SECTION_NR_TO_ROOT(nr)]" check up inside the CONFIG_SPARSEMEM_EXTREME block and adding an explicit NR_SECTION_ROOTS check to make sure that there is no out-of-bound array access. Link: https://lkml.kernel.org/r/20220331180246.2746210-1-longman@redhat.com Fixes: 3e347261a80b ("sparsemem extreme implementation") Signed-off-by: Waiman Long Reported-by: Justin Forbes Cc: "Kirill A . Shutemov" Cc: Ingo Molnar Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mmzone.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fa1ef98614bc..6ba100216530 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1351,13 +1351,16 @@ static inline unsigned long *section_to_usemap(struct mem_section *ms) static inline struct mem_section *__nr_to_section(unsigned long nr) { + unsigned long root = SECTION_NR_TO_ROOT(nr); + + if (unlikely(root >= NR_SECTION_ROOTS)) + return NULL; + #ifdef CONFIG_SPARSEMEM_EXTREME - if (!mem_section) + if (!mem_section || !mem_section[root]) return NULL; #endif - if (!mem_section[SECTION_NR_TO_ROOT(nr)]) - return NULL; - return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; + return &mem_section[root][nr & SECTION_ROOT_MASK]; } extern size_t mem_section_usage_size(void); -- cgit v1.2.3 From 68cea1e243b81e23e09b6822be6193321d9bf977 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 14 Mar 2022 12:49:36 +0100 Subject: static_call: Don't make __static_call_return0 static commit 8fd4ddda2f49a66bf5dd3d0c01966c4b1971308b upstream. System.map shows that vmlinux contains several instances of __static_call_return0(): c0004fc0 t __static_call_return0 c0011518 t __static_call_return0 c00d8160 t __static_call_return0 arch_static_call_transform() uses the middle one to check whether we are setting a call to __static_call_return0 or not: c0011520 : c0011520: 3d 20 c0 01 lis r9,-16383 <== r9 = 0xc001 << 16 c0011524: 39 29 15 18 addi r9,r9,5400 <== r9 += 0x1518 c0011528: 7c 05 48 00 cmpw r5,r9 <== r9 has value 0xc0011518 here So if static_call_update() is called with one of the other instances of __static_call_return0(), arch_static_call_transform() won't recognise it. In order to work properly, global single instance of __static_call_return0() is required. Fixes: 3f2a8fc4b15d ("static_call/x86: Add __static_call_return0()") Signed-off-by: Christophe Leroy Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/30821468a0e7d28251954b578e5051dc09300d04.1647258493.git.christophe.leroy@csgroup.eu Signed-off-by: Greg Kroah-Hartman --- include/linux/static_call.h | 5 +- kernel/Makefile | 3 +- kernel/static_call.c | 542 +------------------------------------------ kernel/static_call_inline.c | 543 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 547 insertions(+), 546 deletions(-) create mode 100644 kernel/static_call_inline.c (limited to 'include') diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 3e56a9751c06..fcc5b48989b3 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -248,10 +248,7 @@ static inline int static_call_text_reserved(void *start, void *end) return 0; } -static inline long __static_call_return0(void) -{ - return 0; -} +extern long __static_call_return0(void); #define EXPORT_STATIC_CALL(name) \ EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \ diff --git a/kernel/Makefile b/kernel/Makefile index 4df609be42d0..0e119c52a2cd 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -113,7 +113,8 @@ obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o -obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o +obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o +obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/static_call.c b/kernel/static_call.c index 43ba0b1e0edb..e9c3e69f3837 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -1,548 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 -#include #include -#include -#include -#include -#include -#include -#include -#include -#include - -extern struct static_call_site __start_static_call_sites[], - __stop_static_call_sites[]; -extern struct static_call_tramp_key __start_static_call_tramp_key[], - __stop_static_call_tramp_key[]; - -static bool static_call_initialized; - -/* mutex to protect key modules/sites */ -static DEFINE_MUTEX(static_call_mutex); - -static void static_call_lock(void) -{ - mutex_lock(&static_call_mutex); -} - -static void static_call_unlock(void) -{ - mutex_unlock(&static_call_mutex); -} - -static inline void *static_call_addr(struct static_call_site *site) -{ - return (void *)((long)site->addr + (long)&site->addr); -} - -static inline unsigned long __static_call_key(const struct static_call_site *site) -{ - return (long)site->key + (long)&site->key; -} - -static inline struct static_call_key *static_call_key(const struct static_call_site *site) -{ - return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS); -} - -/* These assume the key is word-aligned. */ -static inline bool static_call_is_init(struct static_call_site *site) -{ - return __static_call_key(site) & STATIC_CALL_SITE_INIT; -} - -static inline bool static_call_is_tail(struct static_call_site *site) -{ - return __static_call_key(site) & STATIC_CALL_SITE_TAIL; -} - -static inline void static_call_set_init(struct static_call_site *site) -{ - site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) - - (long)&site->key; -} - -static int static_call_site_cmp(const void *_a, const void *_b) -{ - const struct static_call_site *a = _a; - const struct static_call_site *b = _b; - const struct static_call_key *key_a = static_call_key(a); - const struct static_call_key *key_b = static_call_key(b); - - if (key_a < key_b) - return -1; - - if (key_a > key_b) - return 1; - - return 0; -} - -static void static_call_site_swap(void *_a, void *_b, int size) -{ - long delta = (unsigned long)_a - (unsigned long)_b; - struct static_call_site *a = _a; - struct static_call_site *b = _b; - struct static_call_site tmp = *a; - - a->addr = b->addr - delta; - a->key = b->key - delta; - - b->addr = tmp.addr + delta; - b->key = tmp.key + delta; -} - -static inline void static_call_sort_entries(struct static_call_site *start, - struct static_call_site *stop) -{ - sort(start, stop - start, sizeof(struct static_call_site), - static_call_site_cmp, static_call_site_swap); -} - -static inline bool static_call_key_has_mods(struct static_call_key *key) -{ - return !(key->type & 1); -} - -static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) -{ - if (!static_call_key_has_mods(key)) - return NULL; - - return key->mods; -} - -static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) -{ - if (static_call_key_has_mods(key)) - return NULL; - - return (struct static_call_site *)(key->type & ~1); -} - -void __static_call_update(struct static_call_key *key, void *tramp, void *func) -{ - struct static_call_site *site, *stop; - struct static_call_mod *site_mod, first; - - cpus_read_lock(); - static_call_lock(); - - if (key->func == func) - goto done; - - key->func = func; - - arch_static_call_transform(NULL, tramp, func, false); - - /* - * If uninitialized, we'll not update the callsites, but they still - * point to the trampoline and we just patched that. - */ - if (WARN_ON_ONCE(!static_call_initialized)) - goto done; - - first = (struct static_call_mod){ - .next = static_call_key_next(key), - .mod = NULL, - .sites = static_call_key_sites(key), - }; - - for (site_mod = &first; site_mod; site_mod = site_mod->next) { - bool init = system_state < SYSTEM_RUNNING; - struct module *mod = site_mod->mod; - - if (!site_mod->sites) { - /* - * This can happen if the static call key is defined in - * a module which doesn't use it. - * - * It also happens in the has_mods case, where the - * 'first' entry has no sites associated with it. - */ - continue; - } - - stop = __stop_static_call_sites; - - if (mod) { -#ifdef CONFIG_MODULES - stop = mod->static_call_sites + - mod->num_static_call_sites; - init = mod->state == MODULE_STATE_COMING; -#endif - } - - for (site = site_mod->sites; - site < stop && static_call_key(site) == key; site++) { - void *site_addr = static_call_addr(site); - - if (!init && static_call_is_init(site)) - continue; - - if (!kernel_text_address((unsigned long)site_addr)) { - /* - * This skips patching built-in __exit, which - * is part of init_section_contains() but is - * not part of kernel_text_address(). - * - * Skipping built-in __exit is fine since it - * will never be executed. - */ - WARN_ONCE(!static_call_is_init(site), - "can't patch static call site at %pS", - site_addr); - continue; - } - - arch_static_call_transform(site_addr, NULL, func, - static_call_is_tail(site)); - } - } - -done: - static_call_unlock(); - cpus_read_unlock(); -} -EXPORT_SYMBOL_GPL(__static_call_update); - -static int __static_call_init(struct module *mod, - struct static_call_site *start, - struct static_call_site *stop) -{ - struct static_call_site *site; - struct static_call_key *key, *prev_key = NULL; - struct static_call_mod *site_mod; - - if (start == stop) - return 0; - - static_call_sort_entries(start, stop); - - for (site = start; site < stop; site++) { - void *site_addr = static_call_addr(site); - - if ((mod && within_module_init((unsigned long)site_addr, mod)) || - (!mod && init_section_contains(site_addr, 1))) - static_call_set_init(site); - - key = static_call_key(site); - if (key != prev_key) { - prev_key = key; - - /* - * For vmlinux (!mod) avoid the allocation by storing - * the sites pointer in the key itself. Also see - * __static_call_update()'s @first. - * - * This allows architectures (eg. x86) to call - * static_call_init() before memory allocation works. - */ - if (!mod) { - key->sites = site; - key->type |= 1; - goto do_transform; - } - - site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); - if (!site_mod) - return -ENOMEM; - - /* - * When the key has a direct sites pointer, extract - * that into an explicit struct static_call_mod, so we - * can have a list of modules. - */ - if (static_call_key_sites(key)) { - site_mod->mod = NULL; - site_mod->next = NULL; - site_mod->sites = static_call_key_sites(key); - - key->mods = site_mod; - - site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); - if (!site_mod) - return -ENOMEM; - } - - site_mod->mod = mod; - site_mod->sites = site; - site_mod->next = static_call_key_next(key); - key->mods = site_mod; - } - -do_transform: - arch_static_call_transform(site_addr, NULL, key->func, - static_call_is_tail(site)); - } - - return 0; -} - -static int addr_conflict(struct static_call_site *site, void *start, void *end) -{ - unsigned long addr = (unsigned long)static_call_addr(site); - - if (addr <= (unsigned long)end && - addr + CALL_INSN_SIZE > (unsigned long)start) - return 1; - - return 0; -} - -static int __static_call_text_reserved(struct static_call_site *iter_start, - struct static_call_site *iter_stop, - void *start, void *end, bool init) -{ - struct static_call_site *iter = iter_start; - - while (iter < iter_stop) { - if (init || !static_call_is_init(iter)) { - if (addr_conflict(iter, start, end)) - return 1; - } - iter++; - } - - return 0; -} - -#ifdef CONFIG_MODULES - -static int __static_call_mod_text_reserved(void *start, void *end) -{ - struct module *mod; - int ret; - - preempt_disable(); - mod = __module_text_address((unsigned long)start); - WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); - if (!try_module_get(mod)) - mod = NULL; - preempt_enable(); - - if (!mod) - return 0; - - ret = __static_call_text_reserved(mod->static_call_sites, - mod->static_call_sites + mod->num_static_call_sites, - start, end, mod->state == MODULE_STATE_COMING); - - module_put(mod); - - return ret; -} - -static unsigned long tramp_key_lookup(unsigned long addr) -{ - struct static_call_tramp_key *start = __start_static_call_tramp_key; - struct static_call_tramp_key *stop = __stop_static_call_tramp_key; - struct static_call_tramp_key *tramp_key; - - for (tramp_key = start; tramp_key != stop; tramp_key++) { - unsigned long tramp; - - tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp; - if (tramp == addr) - return (long)tramp_key->key + (long)&tramp_key->key; - } - - return 0; -} - -static int static_call_add_module(struct module *mod) -{ - struct static_call_site *start = mod->static_call_sites; - struct static_call_site *stop = start + mod->num_static_call_sites; - struct static_call_site *site; - - for (site = start; site != stop; site++) { - unsigned long s_key = __static_call_key(site); - unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; - unsigned long key; - - /* - * Is the key is exported, 'addr' points to the key, which - * means modules are allowed to call static_call_update() on - * it. - * - * Otherwise, the key isn't exported, and 'addr' points to the - * trampoline so we need to lookup the key. - * - * We go through this dance to prevent crazy modules from - * abusing sensitive static calls. - */ - if (!kernel_text_address(addr)) - continue; - - key = tramp_key_lookup(addr); - if (!key) { - pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n", - static_call_addr(site)); - return -EINVAL; - } - - key |= s_key & STATIC_CALL_SITE_FLAGS; - site->key = key - (long)&site->key; - } - - return __static_call_init(mod, start, stop); -} - -static void static_call_del_module(struct module *mod) -{ - struct static_call_site *start = mod->static_call_sites; - struct static_call_site *stop = mod->static_call_sites + - mod->num_static_call_sites; - struct static_call_key *key, *prev_key = NULL; - struct static_call_mod *site_mod, **prev; - struct static_call_site *site; - - for (site = start; site < stop; site++) { - key = static_call_key(site); - if (key == prev_key) - continue; - - prev_key = key; - - for (prev = &key->mods, site_mod = key->mods; - site_mod && site_mod->mod != mod; - prev = &site_mod->next, site_mod = site_mod->next) - ; - - if (!site_mod) - continue; - - *prev = site_mod->next; - kfree(site_mod); - } -} - -static int static_call_module_notify(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct module *mod = data; - int ret = 0; - - cpus_read_lock(); - static_call_lock(); - - switch (val) { - case MODULE_STATE_COMING: - ret = static_call_add_module(mod); - if (ret) { - WARN(1, "Failed to allocate memory for static calls"); - static_call_del_module(mod); - } - break; - case MODULE_STATE_GOING: - static_call_del_module(mod); - break; - } - - static_call_unlock(); - cpus_read_unlock(); - - return notifier_from_errno(ret); -} - -static struct notifier_block static_call_module_nb = { - .notifier_call = static_call_module_notify, -}; - -#else - -static inline int __static_call_mod_text_reserved(void *start, void *end) -{ - return 0; -} - -#endif /* CONFIG_MODULES */ - -int static_call_text_reserved(void *start, void *end) -{ - bool init = system_state < SYSTEM_RUNNING; - int ret = __static_call_text_reserved(__start_static_call_sites, - __stop_static_call_sites, start, end, init); - - if (ret) - return ret; - - return __static_call_mod_text_reserved(start, end); -} - -int __init static_call_init(void) -{ - int ret; - - if (static_call_initialized) - return 0; - - cpus_read_lock(); - static_call_lock(); - ret = __static_call_init(NULL, __start_static_call_sites, - __stop_static_call_sites); - static_call_unlock(); - cpus_read_unlock(); - - if (ret) { - pr_err("Failed to allocate memory for static_call!\n"); - BUG(); - } - - static_call_initialized = true; - -#ifdef CONFIG_MODULES - register_module_notifier(&static_call_module_nb); -#endif - return 0; -} -early_initcall(static_call_init); long __static_call_return0(void) { return 0; } - -#ifdef CONFIG_STATIC_CALL_SELFTEST - -static int func_a(int x) -{ - return x+1; -} - -static int func_b(int x) -{ - return x+2; -} - -DEFINE_STATIC_CALL(sc_selftest, func_a); - -static struct static_call_data { - int (*func)(int); - int val; - int expect; -} static_call_data [] __initdata = { - { NULL, 2, 3 }, - { func_b, 2, 4 }, - { func_a, 2, 3 } -}; - -static int __init test_static_call_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { - struct static_call_data *scd = &static_call_data[i]; - - if (scd->func) - static_call_update(sc_selftest, scd->func); - - WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); - } - - return 0; -} -early_initcall(test_static_call_init); - -#endif /* CONFIG_STATIC_CALL_SELFTEST */ +EXPORT_SYMBOL_GPL(__static_call_return0); diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c new file mode 100644 index 000000000000..dc5665b62814 --- /dev/null +++ b/kernel/static_call_inline.c @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern struct static_call_site __start_static_call_sites[], + __stop_static_call_sites[]; +extern struct static_call_tramp_key __start_static_call_tramp_key[], + __stop_static_call_tramp_key[]; + +static bool static_call_initialized; + +/* mutex to protect key modules/sites */ +static DEFINE_MUTEX(static_call_mutex); + +static void static_call_lock(void) +{ + mutex_lock(&static_call_mutex); +} + +static void static_call_unlock(void) +{ + mutex_unlock(&static_call_mutex); +} + +static inline void *static_call_addr(struct static_call_site *site) +{ + return (void *)((long)site->addr + (long)&site->addr); +} + +static inline unsigned long __static_call_key(const struct static_call_site *site) +{ + return (long)site->key + (long)&site->key; +} + +static inline struct static_call_key *static_call_key(const struct static_call_site *site) +{ + return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS); +} + +/* These assume the key is word-aligned. */ +static inline bool static_call_is_init(struct static_call_site *site) +{ + return __static_call_key(site) & STATIC_CALL_SITE_INIT; +} + +static inline bool static_call_is_tail(struct static_call_site *site) +{ + return __static_call_key(site) & STATIC_CALL_SITE_TAIL; +} + +static inline void static_call_set_init(struct static_call_site *site) +{ + site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) - + (long)&site->key; +} + +static int static_call_site_cmp(const void *_a, const void *_b) +{ + const struct static_call_site *a = _a; + const struct static_call_site *b = _b; + const struct static_call_key *key_a = static_call_key(a); + const struct static_call_key *key_b = static_call_key(b); + + if (key_a < key_b) + return -1; + + if (key_a > key_b) + return 1; + + return 0; +} + +static void static_call_site_swap(void *_a, void *_b, int size) +{ + long delta = (unsigned long)_a - (unsigned long)_b; + struct static_call_site *a = _a; + struct static_call_site *b = _b; + struct static_call_site tmp = *a; + + a->addr = b->addr - delta; + a->key = b->key - delta; + + b->addr = tmp.addr + delta; + b->key = tmp.key + delta; +} + +static inline void static_call_sort_entries(struct static_call_site *start, + struct static_call_site *stop) +{ + sort(start, stop - start, sizeof(struct static_call_site), + static_call_site_cmp, static_call_site_swap); +} + +static inline bool static_call_key_has_mods(struct static_call_key *key) +{ + return !(key->type & 1); +} + +static inline struct static_call_mod *static_call_key_next(struct static_call_key *key) +{ + if (!static_call_key_has_mods(key)) + return NULL; + + return key->mods; +} + +static inline struct static_call_site *static_call_key_sites(struct static_call_key *key) +{ + if (static_call_key_has_mods(key)) + return NULL; + + return (struct static_call_site *)(key->type & ~1); +} + +void __static_call_update(struct static_call_key *key, void *tramp, void *func) +{ + struct static_call_site *site, *stop; + struct static_call_mod *site_mod, first; + + cpus_read_lock(); + static_call_lock(); + + if (key->func == func) + goto done; + + key->func = func; + + arch_static_call_transform(NULL, tramp, func, false); + + /* + * If uninitialized, we'll not update the callsites, but they still + * point to the trampoline and we just patched that. + */ + if (WARN_ON_ONCE(!static_call_initialized)) + goto done; + + first = (struct static_call_mod){ + .next = static_call_key_next(key), + .mod = NULL, + .sites = static_call_key_sites(key), + }; + + for (site_mod = &first; site_mod; site_mod = site_mod->next) { + bool init = system_state < SYSTEM_RUNNING; + struct module *mod = site_mod->mod; + + if (!site_mod->sites) { + /* + * This can happen if the static call key is defined in + * a module which doesn't use it. + * + * It also happens in the has_mods case, where the + * 'first' entry has no sites associated with it. + */ + continue; + } + + stop = __stop_static_call_sites; + + if (mod) { +#ifdef CONFIG_MODULES + stop = mod->static_call_sites + + mod->num_static_call_sites; + init = mod->state == MODULE_STATE_COMING; +#endif + } + + for (site = site_mod->sites; + site < stop && static_call_key(site) == key; site++) { + void *site_addr = static_call_addr(site); + + if (!init && static_call_is_init(site)) + continue; + + if (!kernel_text_address((unsigned long)site_addr)) { + /* + * This skips patching built-in __exit, which + * is part of init_section_contains() but is + * not part of kernel_text_address(). + * + * Skipping built-in __exit is fine since it + * will never be executed. + */ + WARN_ONCE(!static_call_is_init(site), + "can't patch static call site at %pS", + site_addr); + continue; + } + + arch_static_call_transform(site_addr, NULL, func, + static_call_is_tail(site)); + } + } + +done: + static_call_unlock(); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(__static_call_update); + +static int __static_call_init(struct module *mod, + struct static_call_site *start, + struct static_call_site *stop) +{ + struct static_call_site *site; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod; + + if (start == stop) + return 0; + + static_call_sort_entries(start, stop); + + for (site = start; site < stop; site++) { + void *site_addr = static_call_addr(site); + + if ((mod && within_module_init((unsigned long)site_addr, mod)) || + (!mod && init_section_contains(site_addr, 1))) + static_call_set_init(site); + + key = static_call_key(site); + if (key != prev_key) { + prev_key = key; + + /* + * For vmlinux (!mod) avoid the allocation by storing + * the sites pointer in the key itself. Also see + * __static_call_update()'s @first. + * + * This allows architectures (eg. x86) to call + * static_call_init() before memory allocation works. + */ + if (!mod) { + key->sites = site; + key->type |= 1; + goto do_transform; + } + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + + /* + * When the key has a direct sites pointer, extract + * that into an explicit struct static_call_mod, so we + * can have a list of modules. + */ + if (static_call_key_sites(key)) { + site_mod->mod = NULL; + site_mod->next = NULL; + site_mod->sites = static_call_key_sites(key); + + key->mods = site_mod; + + site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL); + if (!site_mod) + return -ENOMEM; + } + + site_mod->mod = mod; + site_mod->sites = site; + site_mod->next = static_call_key_next(key); + key->mods = site_mod; + } + +do_transform: + arch_static_call_transform(site_addr, NULL, key->func, + static_call_is_tail(site)); + } + + return 0; +} + +static int addr_conflict(struct static_call_site *site, void *start, void *end) +{ + unsigned long addr = (unsigned long)static_call_addr(site); + + if (addr <= (unsigned long)end && + addr + CALL_INSN_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +static int __static_call_text_reserved(struct static_call_site *iter_start, + struct static_call_site *iter_stop, + void *start, void *end, bool init) +{ + struct static_call_site *iter = iter_start; + + while (iter < iter_stop) { + if (init || !static_call_is_init(iter)) { + if (addr_conflict(iter, start, end)) + return 1; + } + iter++; + } + + return 0; +} + +#ifdef CONFIG_MODULES + +static int __static_call_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + int ret; + + preempt_disable(); + mod = __module_text_address((unsigned long)start); + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + if (!try_module_get(mod)) + mod = NULL; + preempt_enable(); + + if (!mod) + return 0; + + ret = __static_call_text_reserved(mod->static_call_sites, + mod->static_call_sites + mod->num_static_call_sites, + start, end, mod->state == MODULE_STATE_COMING); + + module_put(mod); + + return ret; +} + +static unsigned long tramp_key_lookup(unsigned long addr) +{ + struct static_call_tramp_key *start = __start_static_call_tramp_key; + struct static_call_tramp_key *stop = __stop_static_call_tramp_key; + struct static_call_tramp_key *tramp_key; + + for (tramp_key = start; tramp_key != stop; tramp_key++) { + unsigned long tramp; + + tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp; + if (tramp == addr) + return (long)tramp_key->key + (long)&tramp_key->key; + } + + return 0; +} + +static int static_call_add_module(struct module *mod) +{ + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = start + mod->num_static_call_sites; + struct static_call_site *site; + + for (site = start; site != stop; site++) { + unsigned long s_key = __static_call_key(site); + unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS; + unsigned long key; + + /* + * Is the key is exported, 'addr' points to the key, which + * means modules are allowed to call static_call_update() on + * it. + * + * Otherwise, the key isn't exported, and 'addr' points to the + * trampoline so we need to lookup the key. + * + * We go through this dance to prevent crazy modules from + * abusing sensitive static calls. + */ + if (!kernel_text_address(addr)) + continue; + + key = tramp_key_lookup(addr); + if (!key) { + pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n", + static_call_addr(site)); + return -EINVAL; + } + + key |= s_key & STATIC_CALL_SITE_FLAGS; + site->key = key - (long)&site->key; + } + + return __static_call_init(mod, start, stop); +} + +static void static_call_del_module(struct module *mod) +{ + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = mod->static_call_sites + + mod->num_static_call_sites; + struct static_call_key *key, *prev_key = NULL; + struct static_call_mod *site_mod, **prev; + struct static_call_site *site; + + for (site = start; site < stop; site++) { + key = static_call_key(site); + if (key == prev_key) + continue; + + prev_key = key; + + for (prev = &key->mods, site_mod = key->mods; + site_mod && site_mod->mod != mod; + prev = &site_mod->next, site_mod = site_mod->next) + ; + + if (!site_mod) + continue; + + *prev = site_mod->next; + kfree(site_mod); + } +} + +static int static_call_module_notify(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + cpus_read_lock(); + static_call_lock(); + + switch (val) { + case MODULE_STATE_COMING: + ret = static_call_add_module(mod); + if (ret) { + WARN(1, "Failed to allocate memory for static calls"); + static_call_del_module(mod); + } + break; + case MODULE_STATE_GOING: + static_call_del_module(mod); + break; + } + + static_call_unlock(); + cpus_read_unlock(); + + return notifier_from_errno(ret); +} + +static struct notifier_block static_call_module_nb = { + .notifier_call = static_call_module_notify, +}; + +#else + +static inline int __static_call_mod_text_reserved(void *start, void *end) +{ + return 0; +} + +#endif /* CONFIG_MODULES */ + +int static_call_text_reserved(void *start, void *end) +{ + bool init = system_state < SYSTEM_RUNNING; + int ret = __static_call_text_reserved(__start_static_call_sites, + __stop_static_call_sites, start, end, init); + + if (ret) + return ret; + + return __static_call_mod_text_reserved(start, end); +} + +int __init static_call_init(void) +{ + int ret; + + if (static_call_initialized) + return 0; + + cpus_read_lock(); + static_call_lock(); + ret = __static_call_init(NULL, __start_static_call_sites, + __stop_static_call_sites); + static_call_unlock(); + cpus_read_unlock(); + + if (ret) { + pr_err("Failed to allocate memory for static_call!\n"); + BUG(); + } + + static_call_initialized = true; + +#ifdef CONFIG_MODULES + register_module_notifier(&static_call_module_nb); +#endif + return 0; +} +early_initcall(static_call_init); + +#ifdef CONFIG_STATIC_CALL_SELFTEST + +static int func_a(int x) +{ + return x+1; +} + +static int func_b(int x) +{ + return x+2; +} + +DEFINE_STATIC_CALL(sc_selftest, func_a); + +static struct static_call_data { + int (*func)(int); + int val; + int expect; +} static_call_data [] __initdata = { + { NULL, 2, 3 }, + { func_b, 2, 4 }, + { func_a, 2, 3 } +}; + +static int __init test_static_call_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(static_call_data); i++ ) { + struct static_call_data *scd = &static_call_data[i]; + + if (scd->func) + static_call_update(sc_selftest, scd->func); + + WARN_ON(static_call(sc_selftest)(scd->val) != scd->expect); + } + + return 0; +} +early_initcall(test_static_call_init); + +#endif /* CONFIG_STATIC_CALL_SELFTEST */ -- cgit v1.2.3 From c9ea4fb1f3f356d7a04ebe15ac867ba3192c6d4f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 5 Nov 2021 13:45:25 -0700 Subject: stacktrace: move filter_irq_stacks() to kernel/stacktrace.c commit f39f21b3ddc7fc0f87eb6dc75ddc81b5bbfb7672 upstream. filter_irq_stacks() has little to do with the stackdepot implementation, except that it is usually used by users (such as KASAN) of stackdepot to reduce the stack trace. However, filter_irq_stacks() itself is not useful without a stack trace as obtained by stack_trace_save() and friends. Therefore, move filter_irq_stacks() to kernel/stacktrace.c, so that new users of filter_irq_stacks() do not have to start depending on STACKDEPOT only for filter_irq_stacks(). Link: https://lkml.kernel.org/r/20210923104803.2620285-1-elver@google.com Signed-off-by: Marco Elver Acked-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Jann Horn Cc: Aleksandr Nogikh Cc: Taras Madan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/stackdepot.h | 2 -- include/linux/stacktrace.h | 1 + kernel/stacktrace.c | 30 ++++++++++++++++++++++++++++++ lib/stackdepot.c | 24 ------------------------ 4 files changed, 31 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 6bb4bc1a5f54..22919a94ca19 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -19,8 +19,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); -unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); - #ifdef CONFIG_STACKDEPOT int stack_depot_init(void); #else diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 9edecb494e9e..bef158815e83 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, unsigned int size, unsigned int skipnr); unsigned int stack_trace_save_user(unsigned long *store, unsigned int size); +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries); /* Internal interfaces. Do not use in generic code */ #ifdef CONFIG_ARCH_STACKWALK diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9f8117c7cfdd..9c625257023d 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -13,6 +13,7 @@ #include #include #include +#include /** * stack_trace_print - Print the entries in the stack trace @@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) #endif /* CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* !CONFIG_ARCH_STACKWALK */ + +static inline bool in_irqentry_text(unsigned long ptr) +{ + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +/** + * filter_irq_stacks - Find first IRQ stack entry in trace + * @entries: Pointer to stack trace array + * @nr_entries: Number of entries in the storage array + * + * Return: Number of trace entries until IRQ stack starts. + */ +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries) +{ + unsigned int i; + + for (i = 0; i < nr_entries; i++) { + if (in_irqentry_text(entries[i])) { + /* Include the irqentry function into the stack. */ + return i + 1; + } + } + return nr_entries; +} +EXPORT_SYMBOL_GPL(filter_irq_stacks); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 0a2e417f83cb..e90f0f19e77f 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -20,7 +20,6 @@ */ #include -#include #include #include #include @@ -341,26 +340,3 @@ fast_exit: return retval; } EXPORT_SYMBOL_GPL(stack_depot_save); - -static inline int in_irqentry_text(unsigned long ptr) -{ - return (ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end) || - (ptr >= (unsigned long)&__softirqentry_text_start && - ptr < (unsigned long)&__softirqentry_text_end); -} - -unsigned int filter_irq_stacks(unsigned long *entries, - unsigned int nr_entries) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (in_irqentry_text(entries[i])) { - /* Include the irqentry function into the stack. */ - return i + 1; - } - } - return nr_entries; -} -EXPORT_SYMBOL_GPL(filter_irq_stacks); -- cgit v1.2.3 From 82552ddcb3aa870773cd84f658e3aab7725cf86c Mon Sep 17 00:00:00 2001 From: Eddie James Date: Tue, 12 Apr 2022 16:53:31 -0500 Subject: soc: aspeed: xdma: Add trace events Trace the flow of the driver to aid debugging after an error. OpenBMC-Staging-Count: 1 Signed-off-by: Eddie James Acked-by: Andrew Jeffery Link: https://lore.kernel.org/r/20220412215331.42491-1-eajames@linux.ibm.com Signed-off-by: Joel Stanley --- drivers/soc/aspeed/aspeed-xdma.c | 18 ++++- include/trace/events/xdma.h | 139 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 include/trace/events/xdma.h (limited to 'include') diff --git a/drivers/soc/aspeed/aspeed-xdma.c b/drivers/soc/aspeed/aspeed-xdma.c index 48cfe30c90ad..579937ee3745 100644 --- a/drivers/soc/aspeed/aspeed-xdma.c +++ b/drivers/soc/aspeed/aspeed-xdma.c @@ -253,6 +253,9 @@ struct aspeed_xdma_client { u32 size; }; +#define CREATE_TRACE_POINTS +#include + static u32 aspeed_xdma_readl(struct aspeed_xdma *ctx, u8 reg) { u32 v = readl(ctx->base + reg); @@ -448,6 +451,7 @@ static int aspeed_xdma_start(struct aspeed_xdma *ctx, unsigned int num_cmds, ctx->upstream = upstream; for (i = 0; i < num_cmds; ++i) { + trace_xdma_start(ctx, &cmds[i]); /* * Use memcpy_toio here to get some barriers before starting * the operation. The command(s) need to be in physical memory @@ -490,6 +494,8 @@ static irqreturn_t aspeed_xdma_irq(int irq, void *arg) spin_lock(&ctx->engine_lock); status = aspeed_xdma_readl(ctx, ctx->chip->regs.status); + trace_xdma_irq(status); + if (status & ctx->chip->status_bits.ds_dirty) { aspeed_xdma_done(ctx, true); } else { @@ -514,6 +520,8 @@ static void aspeed_xdma_reset(struct aspeed_xdma *ctx) { unsigned long flags; + trace_xdma_reset(ctx); + reset_control_assert(ctx->reset); usleep_range(XDMA_ENGINE_SETUP_TIME_MIN_US, XDMA_ENGINE_SETUP_TIME_MAX_US); @@ -544,7 +552,7 @@ static irqreturn_t aspeed_xdma_pcie_irq(int irq, void *arg) { struct aspeed_xdma *ctx = arg; - dev_dbg(ctx->dev, "PCI-E reset requested.\n"); + trace_xdma_perst(ctx); spin_lock(&ctx->engine_lock); if (ctx->in_reset) { @@ -682,6 +690,7 @@ static void aspeed_xdma_vma_close(struct vm_area_struct *vma) gen_pool_free(client->ctx->pool, (unsigned long)client->virt, client->size); + trace_xdma_unmap(client); client->virt = NULL; client->phys = 0; @@ -706,6 +715,7 @@ static int aspeed_xdma_mmap(struct file *file, struct vm_area_struct *vma) client->virt = gen_pool_dma_alloc(ctx->pool, client->size, &client->phys); if (!client->virt) { + trace_xdma_mmap_error(client, 0UL); client->phys = 0; client->size = 0; return -ENOMEM; @@ -725,12 +735,14 @@ static int aspeed_xdma_mmap(struct file *file, struct vm_area_struct *vma) gen_pool_free(ctx->pool, (unsigned long)client->virt, client->size); + trace_xdma_mmap_error(client, vma->vm_start); client->virt = NULL; client->phys = 0; client->size = 0; return rc; } + trace_xdma_mmap(client); dev_dbg(ctx->dev, "mmap: v[%08lx] to p[%08x], s[%08x]\n", vma->vm_start, (u32)client->phys, client->size); @@ -776,9 +788,11 @@ static int aspeed_xdma_release(struct inode *inode, struct file *file) if (reset) aspeed_xdma_reset(ctx); - if (client->virt) + if (client->virt) { gen_pool_free(ctx->pool, (unsigned long)client->virt, client->size); + trace_xdma_unmap(client); + } kfree(client); kobject_put(&ctx->kobj); diff --git a/include/trace/events/xdma.h b/include/trace/events/xdma.h new file mode 100644 index 000000000000..bf515ad3d8e5 --- /dev/null +++ b/include/trace/events/xdma.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xdma + +#if !defined(_TRACE_XDMA_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XDMA_H + +#include + +TRACE_EVENT(xdma_start, + TP_PROTO(const struct aspeed_xdma *ctx, const struct aspeed_xdma_cmd *cmd), + TP_ARGS(ctx, cmd), + TP_STRUCT__entry( + __field(bool, dir_upstream) + __field(unsigned int, index) + __field(__u64, host) + __field(__u64, pitch) + __field(__u64, cmd) + ), + TP_fast_assign( + __entry->dir_upstream = ctx->upstream; + __entry->index = ctx->cmd_idx; + __entry->host = cmd->host_addr; + __entry->pitch = cmd->pitch; + __entry->cmd = cmd->cmd; + ), + TP_printk("%s cmd:%u [%08llx %016llx %016llx]", + __entry->dir_upstream ? "upstream" : "downstream", + __entry->index, + __entry->host, + __entry->pitch, + __entry->cmd + ) +); + +TRACE_EVENT(xdma_irq, + TP_PROTO(u32 sts), + TP_ARGS(sts), + TP_STRUCT__entry( + __field(__u32, status) + ), + TP_fast_assign( + __entry->status = sts; + ), + TP_printk("sts:%08x", + __entry->status + ) +); + +TRACE_EVENT(xdma_reset, + TP_PROTO(const struct aspeed_xdma *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(bool, dir_upstream) + __field(bool, in_progress) + ), + TP_fast_assign( + __entry->dir_upstream = ctx->upstream; + __entry->in_progress = + ctx->current_client ? ctx->current_client->in_progress : false; + ), + TP_printk("%sin progress%s", + __entry->in_progress ? "" : "not ", + __entry->in_progress ? (__entry->dir_upstream ? " upstream" : " downstream") : "" + ) +); + +TRACE_EVENT(xdma_perst, + TP_PROTO(const struct aspeed_xdma *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(bool, in_reset) + ), + TP_fast_assign( + __entry->in_reset = ctx->in_reset; + ), + TP_printk("%s", + __entry->in_reset ? "in reset" : "" + ) +); + +TRACE_EVENT(xdma_unmap, + TP_PROTO(const struct aspeed_xdma_client *client), + TP_ARGS(client), + TP_STRUCT__entry( + __field(__u32, phys) + __field(__u32, size) + ), + TP_fast_assign( + __entry->phys = client->phys; + __entry->size = client->size; + ), + TP_printk("p:%08x s:%08x", + __entry->phys, + __entry->size + ) +); + +TRACE_EVENT(xdma_mmap_error, + TP_PROTO(const struct aspeed_xdma_client *client, unsigned long vm_start), + TP_ARGS(client, vm_start), + TP_STRUCT__entry( + __field(__u32, phys) + __field(__u32, size) + __field(unsigned long, vm_start) + ), + TP_fast_assign( + __entry->phys = client->phys; + __entry->size = client->size; + __entry->vm_start = vm_start; + ), + TP_printk("p:%08x s:%08x v:%08lx", + __entry->phys, + __entry->size, + __entry->vm_start + ) +); + +TRACE_EVENT(xdma_mmap, + TP_PROTO(const struct aspeed_xdma_client *client), + TP_ARGS(client), + TP_STRUCT__entry( + __field(__u32, phys) + __field(__u32, size) + ), + TP_fast_assign( + __entry->phys = client->phys; + __entry->size = client->size; + ), + TP_printk("p:%08x s:%08x", + __entry->phys, + __entry->size + ) +); + +#endif /* _TRACE_XDMA_H */ + +#include -- cgit v1.2.3 From 7883df73fd594757f03e21da7598f4347da0441d Mon Sep 17 00:00:00 2001 From: Deepak Kumar Singh Date: Tue, 31 Aug 2021 20:00:27 +0530 Subject: soc: qcom: aoss: Expose send for generic usecase commit 8c75d585b931ac874fbe4ee5a8f1811d20c2817f upstream. Not all upcoming usecases will have an interface to allow the aoss driver to hook onto. Expose the send api and create a get function to enable drivers to send their own messages to aoss. Signed-off-by: Chris Lew Signed-off-by: Deepak Kumar Singh Reviewed-by: Stephen Boyd Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/1630420228-31075-2-git-send-email-deesin@codeaurora.org Signed-off-by: Alex Elder Signed-off-by: Greg Kroah-Hartman --- drivers/soc/qcom/qcom_aoss.c | 54 +++++++++++++++++++++++++++++++++++++- include/linux/soc/qcom/qcom_aoss.h | 38 +++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 include/linux/soc/qcom/qcom_aoss.h (limited to 'include') diff --git a/drivers/soc/qcom/qcom_aoss.c b/drivers/soc/qcom/qcom_aoss.c index a0659cf27845..d40d8e20ceb4 100644 --- a/drivers/soc/qcom/qcom_aoss.c +++ b/drivers/soc/qcom/qcom_aoss.c @@ -8,10 +8,12 @@ #include #include #include +#include #include #include #include #include +#include #define QMP_DESC_MAGIC 0x0 #define QMP_DESC_VERSION 0x4 @@ -223,11 +225,14 @@ static bool qmp_message_empty(struct qmp *qmp) * * Return: 0 on success, negative errno on failure */ -static int qmp_send(struct qmp *qmp, const void *data, size_t len) +int qmp_send(struct qmp *qmp, const void *data, size_t len) { long time_left; int ret; + if (WARN_ON(IS_ERR_OR_NULL(qmp) || !data)) + return -EINVAL; + if (WARN_ON(len + sizeof(u32) > qmp->size)) return -EINVAL; @@ -261,6 +266,7 @@ static int qmp_send(struct qmp *qmp, const void *data, size_t len) return ret; } +EXPORT_SYMBOL(qmp_send); static int qmp_qdss_clk_prepare(struct clk_hw *hw) { @@ -519,6 +525,51 @@ static void qmp_cooling_devices_remove(struct qmp *qmp) thermal_cooling_device_unregister(qmp->cooling_devs[i].cdev); } +/** + * qmp_get() - get a qmp handle from a device + * @dev: client device pointer + * + * Return: handle to qmp device on success, ERR_PTR() on failure + */ +struct qmp *qmp_get(struct device *dev) +{ + struct platform_device *pdev; + struct device_node *np; + struct qmp *qmp; + + if (!dev || !dev->of_node) + return ERR_PTR(-EINVAL); + + np = of_parse_phandle(dev->of_node, "qcom,qmp", 0); + if (!np) + return ERR_PTR(-ENODEV); + + pdev = of_find_device_by_node(np); + of_node_put(np); + if (!pdev) + return ERR_PTR(-EINVAL); + + qmp = platform_get_drvdata(pdev); + + return qmp ? qmp : ERR_PTR(-EPROBE_DEFER); +} +EXPORT_SYMBOL(qmp_get); + +/** + * qmp_put() - release a qmp handle + * @qmp: qmp handle obtained from qmp_get() + */ +void qmp_put(struct qmp *qmp) +{ + /* + * Match get_device() inside of_find_device_by_node() in + * qmp_get() + */ + if (!IS_ERR_OR_NULL(qmp)) + put_device(qmp->dev); +} +EXPORT_SYMBOL(qmp_put); + static int qmp_probe(struct platform_device *pdev) { struct resource *res; @@ -615,6 +666,7 @@ static struct platform_driver qmp_driver = { .driver = { .name = "qcom_aoss_qmp", .of_match_table = qmp_dt_match, + .suppress_bind_attrs = true, }, .probe = qmp_probe, .remove = qmp_remove, diff --git a/include/linux/soc/qcom/qcom_aoss.h b/include/linux/soc/qcom/qcom_aoss.h new file mode 100644 index 000000000000..3c2a82e606f8 --- /dev/null +++ b/include/linux/soc/qcom/qcom_aoss.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#ifndef __QCOM_AOSS_H__ +#define __QCOM_AOSS_H__ + +#include +#include + +struct qmp; + +#if IS_ENABLED(CONFIG_QCOM_AOSS_QMP) + +int qmp_send(struct qmp *qmp, const void *data, size_t len); +struct qmp *qmp_get(struct device *dev); +void qmp_put(struct qmp *qmp); + +#else + +static inline int qmp_send(struct qmp *qmp, const void *data, size_t len) +{ + return -ENODEV; +} + +static inline struct qmp *qmp_get(struct device *dev) +{ + return ERR_PTR(-ENODEV); +} + +static inline void qmp_put(struct qmp *qmp) +{ +} + +#endif + +#endif -- cgit v1.2.3 From 1479bdea76dd83222af1b0b232bd9335db69dce0 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 12 Apr 2022 11:31:40 +0200 Subject: ALSA: core: Add snd_card_free_on_error() helper commit fee2b871d8d6389c9b4bdf9346a99ccc1c98c9b8 upstream. This is a small helper function to handle the error path more easily when an error happens during the probe for the device with the device-managed card. Since devres releases in the reverser order of the creations, usually snd_card_free() gets called at the last in the probe error path unless it already reached snd_card_register() calls. Due to this nature, when a driver expects the resource releases in card->private_free, this might be called too lately. As a workaround, one should call the probe like: static int __some_probe(...) { // do real probe.... } static int some_probe(...) { return snd_card_free_on_error(dev, __some_probe(dev, ...)); } so that the snd_card_free() is called explicitly at the beginning of the error path from the probe. This function will be used in the upcoming fixes to address the regressions by devres usages. Fixes: e8ad415b7a55 ("ALSA: core: Add managed card creation") Cc: Link: https://lore.kernel.org/r/20220412093141.8008-2-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/sound/core.h | 1 + sound/core/init.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/sound/core.h b/include/sound/core.h index b7e9b58d3c78..6d4cc49584c6 100644 --- a/include/sound/core.h +++ b/include/sound/core.h @@ -284,6 +284,7 @@ int snd_card_disconnect(struct snd_card *card); void snd_card_disconnect_sync(struct snd_card *card); int snd_card_free(struct snd_card *card); int snd_card_free_when_closed(struct snd_card *card); +int snd_card_free_on_error(struct device *dev, int ret); void snd_card_set_id(struct snd_card *card, const char *id); int snd_card_register(struct snd_card *card); int snd_card_info_init(void); diff --git a/sound/core/init.c b/sound/core/init.c index ac335f5906c6..362588e3a275 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -209,6 +209,12 @@ static void __snd_card_release(struct device *dev, void *data) * snd_card_register(), the very first devres action to call snd_card_free() * is added automatically. In that way, the resource disconnection is assured * at first, then released in the expected order. + * + * If an error happens at the probe before snd_card_register() is called and + * there have been other devres resources, you'd need to free the card manually + * via snd_card_free() call in the error; otherwise it may lead to UAF due to + * devres call orders. You can use snd_card_free_on_error() helper for + * handling it more easily. */ int snd_devm_card_new(struct device *parent, int idx, const char *xid, struct module *module, size_t extra_size, @@ -235,6 +241,28 @@ int snd_devm_card_new(struct device *parent, int idx, const char *xid, } EXPORT_SYMBOL_GPL(snd_devm_card_new); +/** + * snd_card_free_on_error - a small helper for handling devm probe errors + * @dev: the managed device object + * @ret: the return code from the probe callback + * + * This function handles the explicit snd_card_free() call at the error from + * the probe callback. It's just a small helper for simplifying the error + * handling with the managed devices. + */ +int snd_card_free_on_error(struct device *dev, int ret) +{ + struct snd_card *card; + + if (!ret) + return 0; + card = devres_find(dev, __snd_card_release, NULL, NULL); + if (card) + snd_card_free(card); + return ret; +} +EXPORT_SYMBOL_GPL(snd_card_free_on_error); + static int snd_card_init(struct snd_card *card, struct device *parent, int idx, const char *xid, struct module *module, size_t extra_size) -- cgit v1.2.3 From 726ae7300fcc25fefa46d188cc07eb16dc908f9e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 6 Apr 2022 13:51:32 -0400 Subject: SUNRPC: Fix the svc_deferred_event trace class [ Upstream commit 4d5004451ab2218eab94a30e1841462c9316ba19 ] Fix a NULL deref crash that occurs when an svc_rqst is deferred while the sunrpc tracing subsystem is enabled. svc_revisit() sets dr->xprt to NULL, so it can't be relied upon in the tracepoint to provide the remote's address. Unfortunately we can't revert the "svc_deferred_class" hunk in commit ece200ddd54b ("sunrpc: Save remote presentation address in svc_xprt for trace events") because there is now a specific check of event format specifiers for unsafe dereferences. The warning that check emits is: event svc_defer_recv has unsafe dereference of argument 1 A "%pISpc" format specifier with a "struct sockaddr *" is indeed flagged by this check. Instead, take the brute-force approach used by the svcrdma_qp_error tracepoint. Convert the dr::addr field into a presentation address in the TP_fast_assign() arm of the trace event, and store that as a string. This fix can be backported to -stable kernels. In the meantime, commit c6ced22997ad ("tracing: Update print fmt check to handle new __get_sockaddr() macro") is now in v5.18, so this wonky fix can be replaced with __sockaddr() and friends properly during the v5.19 merge window. Fixes: ece200ddd54b ("sunrpc: Save remote presentation address in svc_xprt for trace events") Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- include/trace/events/sunrpc.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 7c48613c1830..6bcb8c7a3175 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1924,17 +1924,18 @@ DECLARE_EVENT_CLASS(svc_deferred_event, TP_STRUCT__entry( __field(const void *, dr) __field(u32, xid) - __string(addr, dr->xprt->xpt_remotebuf) + __array(__u8, addr, INET6_ADDRSTRLEN + 10) ), TP_fast_assign( __entry->dr = dr; __entry->xid = be32_to_cpu(*(__be32 *)(dr->args + (dr->xprt_hlen>>2))); - __assign_str(addr, dr->xprt->xpt_remotebuf); + snprintf(__entry->addr, sizeof(__entry->addr) - 1, + "%pISpc", (struct sockaddr *)&dr->addr); ), - TP_printk("addr=%s dr=%p xid=0x%08x", __get_str(addr), __entry->dr, + TP_printk("addr=%s dr=%p xid=0x%08x", __entry->addr, __entry->dr, __entry->xid) ); -- cgit v1.2.3 From de8a332c86a784431e1991be053704f2662239c1 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 6 Apr 2022 14:22:41 +0300 Subject: net/sched: flower: fix parsing of ethertype following VLAN header [ Upstream commit 2105f700b53c24aa48b65c15652acc386044d26a ] A tc flower filter matching TCA_FLOWER_KEY_VLAN_ETH_TYPE is expected to match the L2 ethertype following the first VLAN header, as confirmed by linked discussion with the maintainer. However, such rule also matches packets that have additional second VLAN header, even though filter has both eth_type and vlan_ethtype set to "ipv4". Looking at the code this seems to be mostly an artifact of the way flower uses flow dissector. First, even though looking at the uAPI eth_type and vlan_ethtype appear like a distinct fields, in flower they are all mapped to the same key->basic.n_proto. Second, flow dissector skips following VLAN header as no keys for FLOW_DISSECTOR_KEY_CVLAN are set and eventually assigns the value of n_proto to last parsed header. With these, such filters ignore any headers present between first VLAN header and first "non magic" header (ipv4 in this case) that doesn't result FLOW_DISSECT_RET_PROTO_AGAIN. Fix the issue by extending flow dissector VLAN key structure with new 'vlan_eth_type' field that matches first ethertype following previously parsed VLAN header. Modify flower classifier to set the new flow_dissector_key_vlan->vlan_eth_type with value obtained from TCA_FLOWER_KEY_VLAN_ETH_TYPE/TCA_FLOWER_KEY_CVLAN_ETH_TYPE uAPIs. Link: https://lore.kernel.org/all/Yjhgi48BpTGh6dig@nanopsycho/ Fixes: 9399ae9a6cb2 ("net_sched: flower: Add vlan support") Fixes: d64efd0926ba ("net/sched: flower: Add supprt for matching on QinQ vlan headers") Signed-off-by: Vlad Buslov Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/flow_dissector.h | 2 ++ net/core/flow_dissector.c | 1 + net/sched/cls_flower.c | 18 +++++++++++++----- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index ffd386ea0dbb..c8d1c5e187e4 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -59,6 +59,8 @@ struct flow_dissector_key_vlan { __be16 vlan_tci; }; __be16 vlan_tpid; + __be16 vlan_eth_type; + u16 padding; }; struct flow_dissector_mpls_lse { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index edffdaa875f1..bc50bd331d5b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1181,6 +1181,7 @@ proto_again: VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } key_vlan->vlan_tpid = saved_vlan_tpid; + key_vlan->vlan_eth_type = proto; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 709348262410..32b03a13f9b2 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1004,6 +1004,7 @@ static int fl_set_key_mpls(struct nlattr **tb, static void fl_set_key_vlan(struct nlattr **tb, __be16 ethertype, int vlan_id_key, int vlan_prio_key, + int vlan_next_eth_type_key, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) { @@ -1022,6 +1023,11 @@ static void fl_set_key_vlan(struct nlattr **tb, } key_val->vlan_tpid = ethertype; key_mask->vlan_tpid = cpu_to_be16(~0); + if (tb[vlan_next_eth_type_key]) { + key_val->vlan_eth_type = + nla_get_be16(tb[vlan_next_eth_type_key]); + key_mask->vlan_eth_type = cpu_to_be16(~0); + } } static void fl_set_key_flag(u32 flower_key, u32 flower_mask, @@ -1518,8 +1524,9 @@ static int fl_set_key(struct net *net, struct nlattr **tb, if (eth_type_vlan(ethertype)) { fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_VLAN_ID, - TCA_FLOWER_KEY_VLAN_PRIO, &key->vlan, - &mask->vlan); + TCA_FLOWER_KEY_VLAN_PRIO, + TCA_FLOWER_KEY_VLAN_ETH_TYPE, + &key->vlan, &mask->vlan); if (tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]) { ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_VLAN_ETH_TYPE]); @@ -1527,6 +1534,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_vlan(tb, ethertype, TCA_FLOWER_KEY_CVLAN_ID, TCA_FLOWER_KEY_CVLAN_PRIO, + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, &key->cvlan, &mask->cvlan); fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, @@ -2882,13 +2890,13 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, goto nla_put_failure; if (mask->basic.n_proto) { - if (mask->cvlan.vlan_tpid) { + if (mask->cvlan.vlan_eth_type) { if (nla_put_be16(skb, TCA_FLOWER_KEY_CVLAN_ETH_TYPE, key->basic.n_proto)) goto nla_put_failure; - } else if (mask->vlan.vlan_tpid) { + } else if (mask->vlan.vlan_eth_type) { if (nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE, - key->basic.n_proto)) + key->vlan.vlan_eth_type)) goto nla_put_failure; } } -- cgit v1.2.3 From 578616ac3d87ed5cf32c21ac83263ae2011afbb4 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 7 Apr 2022 19:13:09 -0500 Subject: scsi: iscsi: Fix conn cleanup and stop race during iscsid restart [ Upstream commit 7c6e99c18167ed89729bf167ccb4a7e3ab3115ba ] If iscsid is doing a stop_conn at the same time the kernel is starting error recovery we can hit a race that allows the cleanup work to run on a valid connection. In the race, iscsi_if_stop_conn sees the cleanup bit set, but it calls flush_work on the clean_work before iscsi_conn_error_event has queued it. The flush then returns before the queueing and so the cleanup_work can run later and disconnect/stop a conn while it's in a connected state. The patch: Commit 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space") added the late stop_conn call bug originally, and the patch: Commit 23d6fefbb3f6 ("scsi: iscsi: Fix in-kernel conn failure handling") attempted to fix it but only fixed the normal EH case and left the above race for the iscsid restart case. For the normal EH case we don't hit the race because we only signal userspace to start recovery after we have done the queueing, so the flush will always catch the queued work or see it completed. For iscsid restart cases like boot, we can hit the race because iscsid will call down to the kernel before the kernel has signaled any error, so both code paths can be running at the same time. This adds a lock around the setting of the cleanup bit and queueing so they happen together. Link: https://lore.kernel.org/r/20220408001314.5014-6-michael.christie@oracle.com Fixes: 0ab710458da1 ("scsi: iscsi: Perform connection failure entirely in kernel space") Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/scsi_transport_iscsi.c | 17 +++++++++++++++++ include/scsi/scsi_transport_iscsi.h | 2 ++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 4fa2fd7f4c72..ed289e1242c9 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2260,9 +2260,12 @@ static void iscsi_if_disconnect_bound_ep(struct iscsi_cls_conn *conn, bool is_active) { /* Check if this was a conn error and the kernel took ownership */ + spin_lock_irq(&conn->lock); if (!test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + spin_unlock_irq(&conn->lock); iscsi_ep_disconnect(conn, is_active); } else { + spin_unlock_irq(&conn->lock); ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n"); mutex_unlock(&conn->ep_mutex); @@ -2309,9 +2312,12 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, /* * Figure out if it was the kernel or userspace initiating this. */ + spin_lock_irq(&conn->lock); if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + spin_unlock_irq(&conn->lock); iscsi_stop_conn(conn, flag); } else { + spin_unlock_irq(&conn->lock); ISCSI_DBG_TRANS_CONN(conn, "flush kernel conn cleanup.\n"); flush_work(&conn->cleanup_work); @@ -2320,7 +2326,9 @@ static int iscsi_if_stop_conn(struct iscsi_transport *transport, * Only clear for recovery to avoid extra cleanup runs during * termination. */ + spin_lock_irq(&conn->lock); clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags); + spin_unlock_irq(&conn->lock); } ISCSI_DBG_TRANS_CONN(conn, "iscsi if conn stop done.\n"); return 0; @@ -2341,7 +2349,9 @@ static void iscsi_cleanup_conn_work_fn(struct work_struct *work) */ if (conn->state != ISCSI_CONN_BOUND && conn->state != ISCSI_CONN_UP) { ISCSI_DBG_TRANS_CONN(conn, "Got error while conn is already failed. Ignoring.\n"); + spin_lock_irq(&conn->lock); clear_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags); + spin_unlock_irq(&conn->lock); mutex_unlock(&conn->ep_mutex); return; } @@ -2407,6 +2417,7 @@ iscsi_create_conn(struct iscsi_cls_session *session, int dd_size, uint32_t cid) conn->dd_data = &conn[1]; mutex_init(&conn->ep_mutex); + spin_lock_init(&conn->lock); INIT_LIST_HEAD(&conn->conn_list); INIT_WORK(&conn->cleanup_work, iscsi_cleanup_conn_work_fn); conn->transport = transport; @@ -2598,9 +2609,12 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) struct iscsi_uevent *ev; struct iscsi_internal *priv; int len = nlmsg_total_size(sizeof(*ev)); + unsigned long flags; + spin_lock_irqsave(&conn->lock, flags); if (!test_and_set_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) queue_work(iscsi_conn_cleanup_workq, &conn->cleanup_work); + spin_unlock_irqrestore(&conn->lock, flags); priv = iscsi_if_transport_lookup(conn->transport); if (!priv) @@ -3743,11 +3757,14 @@ static int iscsi_if_transport_conn(struct iscsi_transport *transport, return -EINVAL; mutex_lock(&conn->ep_mutex); + spin_lock_irq(&conn->lock); if (test_bit(ISCSI_CLS_CONN_BIT_CLEANUP, &conn->flags)) { + spin_unlock_irq(&conn->lock); mutex_unlock(&conn->ep_mutex); ev->r.retcode = -ENOTCONN; return 0; } + spin_unlock_irq(&conn->lock); switch (nlh->nlmsg_type) { case ISCSI_UEVENT_BIND_CONN: diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index c5d7810fd792..037c77fb5dc5 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -211,6 +211,8 @@ struct iscsi_cls_conn { struct mutex ep_mutex; struct iscsi_endpoint *ep; + /* Used when accessing flags and queueing work. */ + spinlock_t lock; unsigned long flags; struct work_struct cleanup_work; -- cgit v1.2.3 From b8c0f6d1b04cdf07eda31cb643eea1a1410b551f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 13 Apr 2022 10:10:36 -0300 Subject: vfio/pci: Fix vf_token mechanism when device-specific VF drivers are used [ Upstream commit 1ef3342a934e235aca72b4bcc0d6854d80a65077 ] get_pf_vdev() tries to check if a PF is a VFIO PF by looking at the driver: if (pci_dev_driver(physfn) != pci_dev_driver(vdev->pdev)) { However now that we have multiple VF and PF drivers this is no longer reliable. This means that security tests realted to vf_token can be skipped by mixing and matching different VFIO PCI drivers. Instead of trying to use the driver core to find the PF devices maintain a linked list of all PF vfio_pci_core_device's that we have called pci_enable_sriov() on. When registering a VF just search the list to see if the PF is present and record the match permanently in the struct. PCI core locking prevents a PF from passing pci_disable_sriov() while VF drivers are attached so the VFIO owned PF becomes a static property of the VF. In common cases where vfio does not own the PF the global list remains empty and the VF's pointer is statically NULL. This also fixes a lockdep splat from recursive locking of the vfio_group::device_lock between vfio_device_get_from_name() and vfio_device_get_from_dev(). If the VF and PF share the same group this would deadlock. Fixes: ff53edf6d6ab ("vfio/pci: Split the pci_driver code out of vfio_pci_core.c") Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v3-876570980634+f2e8-vfio_vf_token_jgg@nvidia.com Signed-off-by: Alex Williamson Signed-off-by: Sasha Levin --- drivers/vfio/pci/vfio_pci_core.c | 124 +++++++++++++++++++++++---------------- include/linux/vfio_pci_core.h | 2 + 2 files changed, 76 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 15d158bdcde0..f3916e6b16b9 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -36,6 +36,10 @@ static bool nointxmask; static bool disable_vga; static bool disable_idle_d3; +/* List of PF's that vfio_pci_core_sriov_configure() has been called on */ +static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex); +static LIST_HEAD(vfio_pci_sriov_pfs); + static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA @@ -434,47 +438,17 @@ out: } EXPORT_SYMBOL_GPL(vfio_pci_core_disable); -static struct vfio_pci_core_device *get_pf_vdev(struct vfio_pci_core_device *vdev) -{ - struct pci_dev *physfn = pci_physfn(vdev->pdev); - struct vfio_device *pf_dev; - - if (!vdev->pdev->is_virtfn) - return NULL; - - pf_dev = vfio_device_get_from_dev(&physfn->dev); - if (!pf_dev) - return NULL; - - if (pci_dev_driver(physfn) != pci_dev_driver(vdev->pdev)) { - vfio_device_put(pf_dev); - return NULL; - } - - return container_of(pf_dev, struct vfio_pci_core_device, vdev); -} - -static void vfio_pci_vf_token_user_add(struct vfio_pci_core_device *vdev, int val) -{ - struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); - - if (!pf_vdev) - return; - - mutex_lock(&pf_vdev->vf_token->lock); - pf_vdev->vf_token->users += val; - WARN_ON(pf_vdev->vf_token->users < 0); - mutex_unlock(&pf_vdev->vf_token->lock); - - vfio_device_put(&pf_vdev->vdev); -} - void vfio_pci_core_close_device(struct vfio_device *core_vdev) { struct vfio_pci_core_device *vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); - vfio_pci_vf_token_user_add(vdev, -1); + if (vdev->sriov_pf_core_dev) { + mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); + WARN_ON(!vdev->sriov_pf_core_dev->vf_token->users); + vdev->sriov_pf_core_dev->vf_token->users--; + mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); + } vfio_spapr_pci_eeh_release(vdev->pdev); vfio_pci_core_disable(vdev); @@ -495,7 +469,12 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev) { vfio_pci_probe_mmaps(vdev); vfio_spapr_pci_eeh_open(vdev->pdev); - vfio_pci_vf_token_user_add(vdev, 1); + + if (vdev->sriov_pf_core_dev) { + mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock); + vdev->sriov_pf_core_dev->vf_token->users++; + mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock); + } } EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable); @@ -1603,11 +1582,8 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, * * If the VF token is provided but unused, an error is generated. */ - if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token) - return 0; /* No VF token provided or required */ - if (vdev->pdev->is_virtfn) { - struct vfio_pci_core_device *pf_vdev = get_pf_vdev(vdev); + struct vfio_pci_core_device *pf_vdev = vdev->sriov_pf_core_dev; bool match; if (!pf_vdev) { @@ -1620,7 +1596,6 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, } if (!vf_token) { - vfio_device_put(&pf_vdev->vdev); pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); return -EACCES; @@ -1630,8 +1605,6 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); mutex_unlock(&pf_vdev->vf_token->lock); - vfio_device_put(&pf_vdev->vdev); - if (!match) { pci_info_ratelimited(vdev->pdev, "Incorrect VF token provided for device\n"); @@ -1752,8 +1725,30 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb, static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; + struct vfio_pci_core_device *cur; + struct pci_dev *physfn; int ret; + if (pdev->is_virtfn) { + /* + * If this VF was created by our vfio_pci_core_sriov_configure() + * then we can find the PF vfio_pci_core_device now, and due to + * the locking in pci_disable_sriov() it cannot change until + * this VF device driver is removed. + */ + physfn = pci_physfn(vdev->pdev); + mutex_lock(&vfio_pci_sriov_pfs_mutex); + list_for_each_entry(cur, &vfio_pci_sriov_pfs, sriov_pfs_item) { + if (cur->pdev == physfn) { + vdev->sriov_pf_core_dev = cur; + break; + } + } + mutex_unlock(&vfio_pci_sriov_pfs_mutex); + return 0; + } + + /* Not a SRIOV PF */ if (!pdev->is_physfn) return 0; @@ -1825,6 +1820,7 @@ void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, INIT_LIST_HEAD(&vdev->ioeventfds_list); mutex_init(&vdev->vma_lock); INIT_LIST_HEAD(&vdev->vma_list); + INIT_LIST_HEAD(&vdev->sriov_pfs_item); init_rwsem(&vdev->memory_lock); } EXPORT_SYMBOL_GPL(vfio_pci_core_init_device); @@ -1923,7 +1919,7 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; - pci_disable_sriov(pdev); + vfio_pci_core_sriov_configure(pdev, 0); vfio_unregister_group_dev(&vdev->vdev); @@ -1963,21 +1959,49 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn) { + struct vfio_pci_core_device *vdev; struct vfio_device *device; int ret = 0; + device_lock_assert(&pdev->dev); + device = vfio_device_get_from_dev(&pdev->dev); if (!device) return -ENODEV; - if (nr_virtfn == 0) - pci_disable_sriov(pdev); - else + vdev = container_of(device, struct vfio_pci_core_device, vdev); + + if (nr_virtfn) { + mutex_lock(&vfio_pci_sriov_pfs_mutex); + /* + * The thread that adds the vdev to the list is the only thread + * that gets to call pci_enable_sriov() and we will only allow + * it to be called once without going through + * pci_disable_sriov() + */ + if (!list_empty(&vdev->sriov_pfs_item)) { + ret = -EINVAL; + goto out_unlock; + } + list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs); + mutex_unlock(&vfio_pci_sriov_pfs_mutex); ret = pci_enable_sriov(pdev, nr_virtfn); + if (ret) + goto out_del; + ret = nr_virtfn; + goto out_put; + } - vfio_device_put(device); + pci_disable_sriov(pdev); - return ret < 0 ? ret : nr_virtfn; +out_del: + mutex_lock(&vfio_pci_sriov_pfs_mutex); + list_del_init(&vdev->sriov_pfs_item); +out_unlock: + mutex_unlock(&vfio_pci_sriov_pfs_mutex); +out_put: + vfio_device_put(device); + return ret; } EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index ae6f4838ab75..6e5db4edc335 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -133,6 +133,8 @@ struct vfio_pci_core_device { struct mutex ioeventfds_lock; struct list_head ioeventfds_list; struct vfio_pci_vf_token *vf_token; + struct list_head sriov_pfs_item; + struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; struct mutex vma_lock; struct list_head vma_list; -- cgit v1.2.3 From 61dd8bef80c24f8a848d2e4b57e0ded77bf2a1ce Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Wed, 30 Mar 2022 12:25:43 +0100 Subject: tlb: hugetlb: Add more sizes to tlb_remove_huge_tlb_entry [ Upstream commit 697a1d44af8ba0477ee729e632f4ade37999249a ] tlb_remove_huge_tlb_entry only considers PMD_SIZE and PUD_SIZE when updating the mmu_gather structure. Unfortunately on arm64 there are two additional huge page sizes that need to be covered: CONT_PTE_SIZE and CONT_PMD_SIZE. Where an end-user attempts to employ contiguous huge pages, a VM_BUG_ON can be experienced due to the fact that the tlb structure hasn't been correctly updated by the relevant tlb_flush_p.._range() call from tlb_remove_huge_tlb_entry. This patch adds inequality logic to the generic implementation of tlb_remove_huge_tlb_entry s.t. CONT_PTE_SIZE and CONT_PMD_SIZE are effectively covered on arm64. Also, as well as ptes, pmds and puds; p4ds are now considered too. Reported-by: David Hildenbrand Suggested-by: Peter Zijlstra (Intel) Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Will Deacon Link: https://lore.kernel.org/linux-mm/811c5c8e-b3a2-85d2-049c-717f17c3a03a@redhat.com/ Signed-off-by: Steve Capper Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220330112543.863-1-steve.capper@arm.com Signed-off-by: Will Deacon Signed-off-by: Sasha Levin --- include/asm-generic/tlb.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 2c68a545ffa7..71942a1c642d 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -565,10 +565,14 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ - if (_sz == PMD_SIZE) \ - tlb_flush_pmd_range(tlb, address, _sz); \ - else if (_sz == PUD_SIZE) \ + if (_sz >= P4D_SIZE) \ + tlb_flush_p4d_range(tlb, address, _sz); \ + else if (_sz >= PUD_SIZE) \ tlb_flush_pud_range(tlb, address, _sz); \ + else if (_sz >= PMD_SIZE) \ + tlb_flush_pmd_range(tlb, address, _sz); \ + else \ + tlb_flush_pte_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) -- cgit v1.2.3 From f5e13d700a4d40ccde3d36e383f9247dcb3c1d2d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 1 Apr 2022 17:08:21 -0400 Subject: SUNRPC: Fix NFSD's request deferral on RDMA transports commit 773f91b2cf3f52df0d7508fdbf60f37567cdaee4 upstream. Trond Myklebust reports an NFSD crash in svc_rdma_sendto(). Further investigation shows that the crash occurred while NFSD was handling a deferred request. This patch addresses two inter-related issues that prevent request deferral from working correctly for RPC/RDMA requests: 1. Prevent the crash by ensuring that the original svc_rqst::rq_xprt_ctxt value is available when the request is revisited. Otherwise svc_rdma_sendto() does not have a Receive context available with which to construct its reply. 2. Possibly since before commit 71641d99ce03 ("svcrdma: Properly compute .len and .buflen for received RPC Calls"), svc_rdma_recvfrom() did not include the transport header in the returned xdr_buf. There should have been no need for svc_defer() and friends to save and restore that header, as of that commit. This issue is addressed in a backport-friendly way by simply having svc_rdma_recvfrom() set rq_xprt_hlen to zero unconditionally, just as svc_tcp_recvfrom() does. This enables svc_deferred_recv() to correctly reconstruct an RPC message received via RPC/RDMA. Reported-by: Trond Myklebust Link: https://lore.kernel.org/linux-nfs/82662b7190f26fb304eb0ab1bb04279072439d4e.camel@hammerspace.com/ Signed-off-by: Chuck Lever Cc: Signed-off-by: Greg Kroah-Hartman --- include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc_xprt.c | 3 +++ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6263410c948a..01f09adccc63 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -384,6 +384,7 @@ struct svc_deferred_req { size_t addrlen; struct sockaddr_storage daddr; /* where reply must come from */ size_t daddrlen; + void *xprt_ctxt; struct cache_deferred_req handle; size_t xprt_hlen; int argslen; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d4b663401be1..935bba065636 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1213,6 +1213,8 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) dr->daddr = rqstp->rq_daddr; dr->argslen = rqstp->rq_arg.len >> 2; dr->xprt_hlen = rqstp->rq_xprt_hlen; + dr->xprt_ctxt = rqstp->rq_xprt_ctxt; + rqstp->rq_xprt_ctxt = NULL; /* back up head to the start of the buffer and copy */ skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; @@ -1251,6 +1253,7 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_xprt_hlen = dr->xprt_hlen; rqstp->rq_daddr = dr->daddr; rqstp->rq_respages = rqstp->rq_pages; + rqstp->rq_xprt_ctxt = dr->xprt_ctxt; svc_xprt_received(rqstp->rq_xprt); return (dr->argslen<<2) - dr->xprt_hlen; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 6be23ce7a93d..387a5da09daf 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -826,7 +826,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_err; if (ret == 0) goto out_drop; - rqstp->rq_xprt_hlen = ret; + rqstp->rq_xprt_hlen = 0; if (svc_rdma_is_reverse_direction_reply(xprt, ctxt)) goto out_backchannel; -- cgit v1.2.3 From 9af0fd5c4453a44c692be0cbb3724859b75d739b Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 15 Apr 2022 19:14:15 +0300 Subject: ax25: add refcount in ax25_dev to avoid UAF bugs commit d01ffb9eee4af165d83b08dd73ebdf9fe94a519b upstream. If we dereference ax25_dev after we call kfree(ax25_dev) in ax25_dev_device_down(), it will lead to concurrency UAF bugs. There are eight syscall functions suffer from UAF bugs, include ax25_bind(), ax25_release(), ax25_connect(), ax25_ioctl(), ax25_getname(), ax25_sendmsg(), ax25_getsockopt() and ax25_info_show(). One of the concurrency UAF can be shown as below: (USE) | (FREE) | ax25_device_event | ax25_dev_device_down ax25_bind | ... ... | kfree(ax25_dev) ax25_fillin_cb() | ... ax25_fillin_cb_from_dev() | ... | The root cause of UAF bugs is that kfree(ax25_dev) in ax25_dev_device_down() is not protected by any locks. When ax25_dev, which there are still pointers point to, is released, the concurrency UAF bug will happen. This patch introduces refcount into ax25_dev in order to guarantee that there are no pointers point to it when ax25_dev is released. Signed-off-by: Duoming Zhou Signed-off-by: David S. Miller [OP: backport to 5.15: adjusted context] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- include/net/ax25.h | 10 ++++++++++ net/ax25/af_ax25.c | 2 ++ net/ax25/ax25_dev.c | 12 ++++++++++-- net/ax25/ax25_route.c | 3 +++ 4 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ax25.h b/include/net/ax25.h index 8b7eb46ad72d..d81bfb674906 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -236,6 +236,7 @@ typedef struct ax25_dev { #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif + refcount_t refcount; } ax25_dev; typedef struct ax25_cb { @@ -290,6 +291,15 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } +#define ax25_dev_hold(__ax25_dev) \ + refcount_inc(&((__ax25_dev)->refcount)) + +static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +{ + if (refcount_dec_and_test(&ax25_dev->refcount)) { + kfree(ax25_dev); + } +} static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 735f29512163..954196ef7788 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -98,6 +98,7 @@ again: spin_unlock_bh(&ax25_list_lock); lock_sock(sk); s->ax25_dev = NULL; + ax25_dev_put(ax25_dev); release_sock(sk); ax25_disconnect(s, ENETUNREACH); spin_lock_bh(&ax25_list_lock); @@ -446,6 +447,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) } out_put: + ax25_dev_put(ax25_dev); ax25_cb_put(ax25); return ret; diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 4ac2e0847652..2c845ff1d036 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -37,6 +37,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; + ax25_dev_hold(ax25_dev); } spin_unlock_bh(&ax25_dev_lock); @@ -56,6 +57,7 @@ void ax25_dev_device_up(struct net_device *dev) return; } + refcount_set(&ax25_dev->refcount, 1); dev->ax25_ptr = ax25_dev; ax25_dev->dev = dev; dev_hold(dev); @@ -83,6 +85,7 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; + ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); @@ -112,20 +115,22 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put(dev); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put(dev); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; } @@ -133,6 +138,7 @@ void ax25_dev_device_down(struct net_device *dev) } spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; + ax25_dev_put(ax25_dev); } int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) @@ -149,6 +155,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) if (ax25_dev->forward != NULL) return -EINVAL; ax25_dev->forward = fwd_dev->dev; + ax25_dev_put(fwd_dev); break; case SIOCAX25DELFWD: @@ -161,6 +168,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) return -EINVAL; } + ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index d0b2e094bd55..1e32693833e5 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -116,6 +116,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; + ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); @@ -172,6 +173,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return 0; @@ -214,6 +216,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return err; } -- cgit v1.2.3 From bc706d89199b0d8ee5e2229e18fdb9c0720f6ba8 Mon Sep 17 00:00:00 2001 From: Duoming Zhou Date: Fri, 15 Apr 2022 19:14:16 +0300 Subject: ax25: fix reference count leaks of ax25_dev commit 87563a043cef044fed5db7967a75741cc16ad2b1 upstream. The previous commit d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") introduces refcount into ax25_dev, but there are reference leak paths in ax25_ctl_ioctl(), ax25_fwd_ioctl(), ax25_rt_add(), ax25_rt_del() and ax25_rt_opt(). This patch uses ax25_dev_put() and adjusts the position of ax25_addr_ax25dev() to fix reference cout leaks of ax25_dev. Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Signed-off-by: Duoming Zhou Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20220203150811.42256-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski [OP: backport to 5.15: adjust context] Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- include/net/ax25.h | 8 +++++--- net/ax25/af_ax25.c | 12 ++++++++---- net/ax25/ax25_dev.c | 24 +++++++++++++++++------- net/ax25/ax25_route.c | 16 +++++++++++----- 4 files changed, 41 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/ax25.h b/include/net/ax25.h index d81bfb674906..aadff553e4b7 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -291,10 +291,12 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } } -#define ax25_dev_hold(__ax25_dev) \ - refcount_inc(&((__ax25_dev)->refcount)) +static inline void ax25_dev_hold(ax25_dev *ax25_dev) +{ + refcount_inc(&ax25_dev->refcount); +} -static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +static inline void ax25_dev_put(ax25_dev *ax25_dev) { if (refcount_dec_and_test(&ax25_dev->refcount)) { kfree(ax25_dev); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 954196ef7788..f8c39ccd03bb 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -366,21 +366,25 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) return -EFAULT; - if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL) - return -ENODEV; - if (ax25_ctl.digi_count > AX25_MAX_DIGIS) return -EINVAL; if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr); + if (!ax25_dev) + return -ENODEV; + digi.ndigi = ax25_ctl.digi_count; for (k = 0; k < digi.ndigi; k++) digi.calls[k] = ax25_ctl.digi_addr[k]; - if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL) + ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev); + if (!ax25) { + ax25_dev_put(ax25_dev); return -ENOTCONN; + } switch (ax25_ctl.cmd) { case AX25_KILL: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 2c845ff1d036..d2e0cc67d91a 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -85,8 +85,8 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; - ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_hold(ax25_dev); ax25_register_dev_sysctl(ax25_dev); } @@ -115,8 +115,8 @@ void ax25_dev_device_down(struct net_device *dev) if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put(dev); ax25_dev_put(ax25_dev); @@ -126,8 +126,8 @@ void ax25_dev_device_down(struct net_device *dev) while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put(dev); ax25_dev_put(ax25_dev); @@ -150,25 +150,35 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) switch (cmd) { case SIOCAX25ADDFWD: - if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL) + fwd_dev = ax25_addr_ax25dev(&fwd->port_to); + if (!fwd_dev) { + ax25_dev_put(ax25_dev); return -EINVAL; - if (ax25_dev->forward != NULL) + } + if (ax25_dev->forward) { + ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = fwd_dev->dev; ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); break; case SIOCAX25DELFWD: - if (ax25_dev->forward == NULL) + if (!ax25_dev->forward) { + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = NULL; + ax25_dev_put(ax25_dev); break; default: + ax25_dev_put(ax25_dev); return -EINVAL; } - ax25_dev_put(ax25_dev); return 0; } diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 1e32693833e5..9751207f7757 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -75,11 +75,13 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_dev *ax25_dev; int i; - if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) - return -EINVAL; if (route->digi_count > AX25_MAX_DIGIS) return -EINVAL; + ax25_dev = ax25_addr_ax25dev(&route->port_addr); + if (!ax25_dev) + return -EINVAL; + write_lock_bh(&ax25_route_lock); ax25_rt = ax25_route_list; @@ -91,6 +93,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -101,6 +104,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) } } write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } ax25_rt = ax25_rt->next; @@ -108,6 +112,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } @@ -116,11 +121,11 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; - ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); kfree(ax25_rt); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -133,6 +138,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->next = ax25_route_list; ax25_route_list = ax25_rt; write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -173,8 +179,8 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } @@ -216,8 +222,8 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) } out: - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return err; } -- cgit v1.2.3 From 7877e7a5a52e1c9716326b94dfe6c7e6cd7bce5a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:25 +0200 Subject: block: remove __sync_blockdev [ Upstream commit 70164eb6ccb76ab679b016b4b60123bf4ec6c162 ] Instead offer a new sync_blockdev_nowait helper for the !wait case. This new helper is exported as it will grow modular callers in a bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-3-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/bdev.c | 11 ++++++----- fs/internal.h | 5 ----- fs/sync.c | 7 ++++--- include/linux/blkdev.h | 5 +++++ 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/block/bdev.c b/block/bdev.c index 485a258b0ab3..33cac289302e 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -184,14 +184,13 @@ int sb_min_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_min_blocksize); -int __sync_blockdev(struct block_device *bdev, int wait) +int sync_blockdev_nowait(struct block_device *bdev) { if (!bdev) return 0; - if (!wait) - return filemap_flush(bdev->bd_inode->i_mapping); - return filemap_write_and_wait(bdev->bd_inode->i_mapping); + return filemap_flush(bdev->bd_inode->i_mapping); } +EXPORT_SYMBOL_GPL(sync_blockdev_nowait); /* * Write out and wait upon all the dirty data associated with a block @@ -199,7 +198,9 @@ int __sync_blockdev(struct block_device *bdev, int wait) */ int sync_blockdev(struct block_device *bdev) { - return __sync_blockdev(bdev, 1); + if (!bdev) + return 0; + return filemap_write_and_wait(bdev->bd_inode->i_mapping); } EXPORT_SYMBOL(sync_blockdev); diff --git a/fs/internal.h b/fs/internal.h index 3cd065c8a66b..b5caa16f4645 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,7 +23,6 @@ struct pipe_inode_info; #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); -extern int __sync_blockdev(struct block_device *bdev, int wait); void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); #else @@ -31,10 +30,6 @@ static inline void bdev_cache_init(void) { } -static inline int __sync_blockdev(struct block_device *bdev, int wait) -{ - return 0; -} static inline void iterate_bdevs(void (*f)(struct block_device *, void *), void *arg) { diff --git a/fs/sync.c b/fs/sync.c index 0d6cdc507cb9..a621089eb07e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -3,6 +3,7 @@ * High-level sync()-related operations */ +#include #include #include #include @@ -45,7 +46,7 @@ int sync_filesystem(struct super_block *sb) /* * Do the filesystem syncing work. For simple filesystems * writeback_inodes_sb(sb) just dirties buffers with inodes so we have - * to submit I/O for these buffers via __sync_blockdev(). This also + * to submit I/O for these buffers via sync_blockdev(). This also * speeds up the wait == 1 case since in that case write_inode() * methods call sync_dirty_buffer() and thus effectively write one block * at a time. @@ -53,14 +54,14 @@ int sync_filesystem(struct super_block *sb) writeback_inodes_sb(sb, WB_REASON_SYNC); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 0); - ret = __sync_blockdev(sb->s_bdev, 0); + ret = sync_blockdev_nowait(sb->s_bdev); if (ret < 0) return ret; sync_inodes_sb(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 1); - return __sync_blockdev(sb->s_bdev, 1); + return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 413c0148c0ce..6bbd393e6bcc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1999,6 +1999,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); +int sync_blockdev_nowait(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -2007,6 +2008,10 @@ static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline int sync_blockdev_nowait(struct block_device *bdev) +{ + return 0; +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From 6eb927ee189f39746bcb02123d270ef04457eab6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Oct 2021 08:25:30 +0200 Subject: block: simplify the block device syncing code [ Upstream commit 1e03a36bdff4709c1bbf0f57f60ae3f776d51adf ] Get rid of the indirections and just provide a sync_bdevs helper for the generic sync code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211019062530.2174626-8-hch@lst.de Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/bdev.c | 17 ++++++++++++++--- fs/internal.h | 6 ------ fs/sync.c | 23 ++++------------------- include/linux/blkdev.h | 4 ++++ 4 files changed, 22 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/block/bdev.c b/block/bdev.c index 33cac289302e..18abafb135e0 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1017,7 +1017,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) } EXPORT_SYMBOL(__invalidate_device); -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +void sync_bdevs(bool wait) { struct inode *inode, *old_inode = NULL; @@ -1048,8 +1048,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) - func(bdev, arg); + if (!bdev->bd_openers) { + ; /* skip */ + } else if (wait) { + /* + * We keep the error status of individual mapping so + * that applications can catch the writeback error using + * fsync(2). See filemap_fdatawait_keep_errors() for + * details. + */ + filemap_fdatawait_keep_errors(inode->i_mapping); + } else { + filemap_fdatawrite(inode->i_mapping); + } mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); diff --git a/fs/internal.h b/fs/internal.h index b5caa16f4645..cdd83d4899bb 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,17 +23,11 @@ struct pipe_inode_info; #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); -void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } - -static inline void iterate_bdevs(void (*f)(struct block_device *, void *), - void *arg) -{ -} static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; diff --git a/fs/sync.c b/fs/sync.c index a621089eb07e..3ce8e2137f31 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -78,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg) sb->s_op->sync_fs(sb, *(int *)arg); } -static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) -{ - filemap_fdatawrite(bdev->bd_inode->i_mapping); -} - -static void fdatawait_one_bdev(struct block_device *bdev, void *arg) -{ - /* - * We keep the error status of individual mapping so that - * applications can catch the writeback error using fsync(2). - * See filemap_fdatawait_keep_errors() for details. - */ - filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); -} - /* * Sync everything. We start by waking flusher threads so that most of * writeback runs on all devices in parallel. Then we sync all inodes reliably @@ -111,8 +96,8 @@ void ksys_sync(void) iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); - iterate_bdevs(fdatawrite_one_bdev, NULL); - iterate_bdevs(fdatawait_one_bdev, NULL); + sync_bdevs(false); + sync_bdevs(true); if (unlikely(laptop_mode)) laptop_sync_completion(); } @@ -133,10 +118,10 @@ static void do_sync_work(struct work_struct *work) */ iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6bbd393e6bcc..aebe67ed7a73 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -2000,6 +2000,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_nowait(struct block_device *bdev); +void sync_bdevs(bool wait); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -2012,6 +2013,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) { return 0; } +static inline void sync_bdevs(bool wait) +{ +} #endif int fsync_bdev(struct block_device *bdev); -- cgit v1.2.3 From 3177d047e58a56b2df9e9d59283c3121f1888fe8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 12 Feb 2022 09:14:49 -0800 Subject: etherdevice: Adjust ether_addr* prototypes to silence -Wstringop-overead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2618a0dae09ef37728dab89ff60418cbe25ae6bd upstream. With GCC 12, -Wstringop-overread was warning about an implicit cast from char[6] to char[8]. However, the extra 2 bytes are always thrown away, alignment doesn't matter, and the risk of hitting the edge of unallocated memory has been accepted, so this prototype can just be converted to a regular char *. Silences: net/core/dev.c: In function ‘bpf_prog_run_generic_xdp’: net/core/dev.c:4618:21: warning: ‘ether_addr_equal_64bits’ reading 8 bytes from a region of size 6 [-Wstringop-overread] 4618 | orig_host = ether_addr_equal_64bits(eth->h_dest, > skb->dev->dev_addr); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/dev.c:4618:21: note: referencing argument 1 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} net/core/dev.c:4618:21: note: referencing argument 2 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} In file included from net/core/dev.c:91: include/linux/etherdevice.h:375:20: note: in a call to function ‘ether_addr_equal_64bits’ 375 | static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], | ^~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Marc Kleine-Budde Tested-by: Marc Kleine-Budde Link: https://lore.kernel.org/netdev/20220212090811.uuzk6d76agw2vv73@pengutronix.de Cc: Jakub Kicinski Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller Cc: Khem Raj Signed-off-by: Greg Kroah-Hartman --- include/linux/etherdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index c58d50451485..7f28fa702bb7 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -127,7 +127,7 @@ static inline bool is_multicast_ether_addr(const u8 *addr) #endif } -static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN @@ -364,8 +364,7 @@ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ -static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], - const u8 addr2[6+2]) +static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); -- cgit v1.2.3 From a52e73bef25486cf2aa786ca5c83bcc940625103 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 14 Apr 2022 19:13:40 -0700 Subject: mm, kfence: support kmem_dump_obj() for KFENCE objects commit 2dfe63e61cc31ee59ce951672b0850b5229cd5b0 upstream. Calling kmem_obj_info() via kmem_dump_obj() on KFENCE objects has been producing garbage data due to the object not actually being maintained by SLAB or SLUB. Fix this by implementing __kfence_obj_info() that copies relevant information to struct kmem_obj_info when the object was allocated by KFENCE; this is called by a common kmem_obj_info(), which also calls the slab/slub/slob specific variant now called __kmem_obj_info(). For completeness, kmem_dump_obj() now displays if the object was allocated by KFENCE. Link: https://lore.kernel.org/all/20220323090520.GG16885@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/20220406131558.3558585-1-elver@google.com Fixes: b89fb5ef0ce6 ("mm, kfence: insert KFENCE hooks for SLUB") Fixes: d3fb45f370d9 ("mm, kfence: insert KFENCE hooks for SLAB") Signed-off-by: Marco Elver Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reported-by: kernel test robot Acked-by: Vlastimil Babka [slab] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/kfence.h | 24 ++++++++++++++++++++++++ mm/kfence/core.c | 21 --------------------- mm/kfence/kfence.h | 21 +++++++++++++++++++++ mm/kfence/report.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ mm/slab.c | 2 +- mm/slab.h | 2 +- mm/slab_common.c | 9 +++++++++ mm/slob.c | 2 +- mm/slub.c | 2 +- 9 files changed, 105 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 4b5e3679a72c..3c75209a545e 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -202,6 +202,22 @@ static __always_inline __must_check bool kfence_free(void *addr) */ bool __must_check kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs); +#ifdef CONFIG_PRINTK +struct kmem_obj_info; +/** + * __kfence_obj_info() - fill kmem_obj_info struct + * @kpp: kmem_obj_info to be filled + * @object: the object + * + * Return: + * * false - not a KFENCE object + * * true - a KFENCE object, filled @kpp + * + * Copies information to @kpp for KFENCE objects. + */ +bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +#endif + #else /* CONFIG_KFENCE */ static inline bool is_kfence_address(const void *addr) { return false; } @@ -219,6 +235,14 @@ static inline bool __must_check kfence_handle_page_fault(unsigned long addr, boo return false; } +#ifdef CONFIG_PRINTK +struct kmem_obj_info; +static inline bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + return false; +} +#endif + #endif #endif /* _LINUX_KFENCE_H */ diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 51ea9193cecb..86260e8f2830 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -221,27 +221,6 @@ static bool kfence_unprotect(unsigned long addr) return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); } -static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) -{ - long index; - - /* The checks do not affect performance; only called from slow-paths. */ - - if (!is_kfence_address((void *)addr)) - return NULL; - - /* - * May be an invalid index if called with an address at the edge of - * __kfence_pool, in which case we would report an "invalid access" - * error. - */ - index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; - if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) - return NULL; - - return &kfence_metadata[index]; -} - static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) { unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 2a2d5de9d379..92bf6eff6060 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -93,6 +93,27 @@ struct kfence_metadata { extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; +static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) +{ + long index; + + /* The checks do not affect performance; only called from slow-paths. */ + + if (!is_kfence_address((void *)addr)) + return NULL; + + /* + * May be an invalid index if called with an address at the edge of + * __kfence_pool, in which case we would report an "invalid access" + * error. + */ + index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; + if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) + return NULL; + + return &kfence_metadata[index]; +} + /* KFENCE error types for report generation. */ enum kfence_error_type { KFENCE_ERROR_OOB, /* Detected a out-of-bounds access. */ diff --git a/mm/kfence/report.c b/mm/kfence/report.c index f93a7b2a338b..37e140e7f201 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -273,3 +273,50 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); } + +#ifdef CONFIG_PRINTK +static void kfence_to_kp_stack(const struct kfence_track *track, void **kp_stack) +{ + int i, j; + + i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL); + for (j = 0; i < track->num_stack_entries && j < KS_ADDRS_COUNT; ++i, ++j) + kp_stack[j] = (void *)track->stack_entries[i]; + if (j < KS_ADDRS_COUNT) + kp_stack[j] = NULL; +} + +bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + struct kfence_metadata *meta = addr_to_metadata((unsigned long)object); + unsigned long flags; + + if (!meta) + return false; + + /* + * If state is UNUSED at least show the pointer requested; the rest + * would be garbage data. + */ + kpp->kp_ptr = object; + + /* Requesting info an a never-used object is almost certainly a bug. */ + if (WARN_ON(meta->state == KFENCE_OBJECT_UNUSED)) + return true; + + raw_spin_lock_irqsave(&meta->lock, flags); + + kpp->kp_page = page; + kpp->kp_slab_cache = meta->cache; + kpp->kp_objp = (void *)meta->addr; + kfence_to_kp_stack(&meta->alloc_track, kpp->kp_stack); + if (meta->state == KFENCE_OBJECT_FREED) + kfence_to_kp_stack(&meta->free_track, kpp->kp_free_stack); + /* get_stack_skipnr() ensures the first entry is outside allocator. */ + kpp->kp_ret = kpp->kp_stack[0]; + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + return true; +} +#endif diff --git a/mm/slab.c b/mm/slab.c index 03d3074d0bb0..1bd283e98c58 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3658,7 +3658,7 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { struct kmem_cache *cachep; unsigned int objnr; diff --git a/mm/slab.h b/mm/slab.h index 56ad7eea3ddf..1ae1bdd485c1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -643,7 +643,7 @@ struct kmem_obj_info { void *kp_stack[KS_ADDRS_COUNT]; void *kp_free_stack[KS_ADDRS_COUNT]; }; -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); #endif #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index ec2bb0beed75..022319e7deaf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -568,6 +568,13 @@ bool kmem_valid_obj(void *object) } EXPORT_SYMBOL_GPL(kmem_valid_obj); +static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + if (__kfence_obj_info(kpp, object, page)) + return; + __kmem_obj_info(kpp, object, page); +} + /** * kmem_dump_obj - Print available slab provenance information * @object: slab object for which to find provenance information. @@ -603,6 +610,8 @@ void kmem_dump_obj(void *object) pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); else pr_cont(" slab%s", cp); + if (is_kfence_address(object)) + pr_cont(" (kfence)"); if (kp.kp_objp) pr_cont(" start %px", kp.kp_objp); if (kp.kp_data_offset) diff --git a/mm/slob.c b/mm/slob.c index 74d3f6e60666..f3fc15df971a 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -462,7 +462,7 @@ out: } #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { kpp->kp_ptr = object; kpp->kp_page = page; diff --git a/mm/slub.c b/mm/slub.c index ca6ba6bdf27b..b75eebc0350e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4299,7 +4299,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) } #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) { void *base; int __maybe_unused i; -- cgit v1.2.3 From a583f2f3c8788bffd7fd7baeb76bd6d80543d7ea Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 13 Apr 2022 10:10:50 +0200 Subject: esp: limit skb_page_frag_refill use to a single page [ Upstream commit 5bd8baab087dff657e05387aee802e70304cc813 ] Commit ebe48d368e97 ("esp: Fix possible buffer overflow in ESP transformation") tried to fix skb_page_frag_refill usage in ESP by capping allocsize to 32k, but that doesn't completely solve the issue, as skb_page_frag_refill may return a single page. If that happens, we will write out of bounds, despite the check introduced in the previous patch. This patch forces COW in cases where we would end up calling skb_page_frag_refill with a size larger than a page (first in esp_output_head with tailen, then in esp_output_tail with skb->data_len). Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- include/net/esp.h | 2 -- net/ipv4/esp4.c | 5 ++--- net/ipv6/esp6.c | 5 ++--- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/esp.h b/include/net/esp.h index 90cd02ff77ef..9c5637d41d95 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,8 +4,6 @@ #include -#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) - struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 70e6c87fbe3d..d747166bb291 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -446,7 +446,6 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { @@ -456,8 +455,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 5023f59a5b96..6219d97cac7a 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -483,7 +483,6 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; - unsigned int allocsz; if (x->encap) { int err = esp6_output_encap(x, skb, esp); @@ -492,8 +491,8 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info return err; } - allocsz = ALIGN(skb->data_len + tailen, L1_CACHE_BYTES); - if (allocsz > ESP_SKB_FRAG_MAXSIZE) + if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE || + ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE) goto cow; if (!skb_cloned(skb)) { -- cgit v1.2.3 From 652a5405396dd37ea4af66a711c97e66f8192ae0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Apr 2022 11:13:33 -0700 Subject: ipv6: make ip6_rt_gc_expire an atomic_t [ Upstream commit 9cb7c013420f98fa6fd12fc6a5dc055170c108db ] Reads and Writes to ip6_rt_gc_expire always have been racy, as syzbot reported lately [1] There is a possible risk of under-flow, leading to unexpected high value passed to fib6_run_gc(), although I have not observed this in the field. Hosts hitting ip6_dst_gc() very hard are under pretty bad state anyway. [1] BUG: KCSAN: data-race in ip6_dst_gc / ip6_dst_gc read-write to 0xffff888102110744 of 4 bytes by task 13165 on cpu 1: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 read-write to 0xffff888102110744 of 4 bytes by task 11607 on cpu 0: ip6_dst_gc+0x1f3/0x220 net/ipv6/route.c:3311 dst_alloc+0x9b/0x160 net/core/dst.c:86 ip6_dst_alloc net/ipv6/route.c:344 [inline] icmp6_dst_alloc+0xb2/0x360 net/ipv6/route.c:3261 mld_sendpack+0x2b9/0x580 net/ipv6/mcast.c:1807 mld_send_cr net/ipv6/mcast.c:2119 [inline] mld_ifc_work+0x576/0x800 net/ipv6/mcast.c:2651 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 value changed: 0x00000bb3 -> 0x00000ba9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 11607 Comm: kworker/0:21 Not tainted 5.18.0-rc1-syzkaller-00037-g42e7a03d3bad-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: mld mld_ifc_work Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220413181333.649424-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/net/netns/ipv6.h | 4 ++-- net/ipv6/route.c | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 6bd7e5a85ce7..ff82983b7ab4 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -75,8 +75,8 @@ struct netns_ipv6 { struct list_head fib6_walkers; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; - unsigned int ip6_rt_gc_expire; - unsigned long ip6_rt_last_gc; + atomic_t ip6_rt_gc_expire; + unsigned long ip6_rt_last_gc; unsigned char flowlabel_has_excl; #ifdef CONFIG_IPV6_MULTIPLE_TABLES bool fib6_has_custom_rules; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6b269595efaa..0ca7c780d97a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3303,6 +3303,7 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + unsigned int val; int entries; entries = dst_entries_get_fast(ops); @@ -3313,13 +3314,13 @@ static int ip6_dst_gc(struct dst_ops *ops) entries <= rt_max_size) goto out; - net->ipv6.ip6_rt_gc_expire++; - fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); + fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) - net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; + atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: - net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; + val = atomic_read(&net->ipv6.ip6_rt_gc_expire); + atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); return entries > rt_max_size; } @@ -6528,7 +6529,7 @@ static int __net_init ip6_route_net_init(struct net *net) net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; - net->ipv6.ip6_rt_gc_expire = 30*HZ; + atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: -- cgit v1.2.3 From 740411ee2f94632127338decbdd4e9994778573b Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 7 Apr 2022 19:13:07 -0500 Subject: scsi: iscsi: Release endpoint ID when its freed [ Upstream commit 3c6ae371b8a1ffba1fc415989fd581ebf841ed0a ] We can't release the endpoint ID until all references to the endpoint have been dropped or it could be allocated while in use. This has us use an idr instead of looping over all conns to find a free ID and then free the ID when all references have been dropped instead of when the device is only deleted. Link: https://lore.kernel.org/r/20220408001314.5014-4-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Reviewed-by: Wu Bo Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/scsi_transport_iscsi.c | 71 ++++++++++++++++++------------------- include/scsi/scsi_transport_iscsi.h | 2 +- 2 files changed, 36 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index c7b1b2e8bb02..bcdfcb25349a 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -86,6 +86,9 @@ struct iscsi_internal { struct transport_container session_cont; }; +static DEFINE_IDR(iscsi_ep_idr); +static DEFINE_MUTEX(iscsi_ep_idr_mutex); + static atomic_t iscsi_session_nr; /* sysfs session id for next new session */ static struct workqueue_struct *iscsi_eh_timer_workq; @@ -169,6 +172,11 @@ struct device_attribute dev_attr_##_prefix##_##_name = \ static void iscsi_endpoint_release(struct device *dev) { struct iscsi_endpoint *ep = iscsi_dev_to_endpoint(dev); + + mutex_lock(&iscsi_ep_idr_mutex); + idr_remove(&iscsi_ep_idr, ep->id); + mutex_unlock(&iscsi_ep_idr_mutex); + kfree(ep); } @@ -181,7 +189,7 @@ static ssize_t show_ep_handle(struct device *dev, struct device_attribute *attr, char *buf) { struct iscsi_endpoint *ep = iscsi_dev_to_endpoint(dev); - return sysfs_emit(buf, "%llu\n", (unsigned long long) ep->id); + return sysfs_emit(buf, "%d\n", ep->id); } static ISCSI_ATTR(ep, handle, S_IRUGO, show_ep_handle, NULL); @@ -194,48 +202,32 @@ static struct attribute_group iscsi_endpoint_group = { .attrs = iscsi_endpoint_attrs, }; -#define ISCSI_MAX_EPID -1 - -static int iscsi_match_epid(struct device *dev, const void *data) -{ - struct iscsi_endpoint *ep = iscsi_dev_to_endpoint(dev); - const uint64_t *epid = data; - - return *epid == ep->id; -} - struct iscsi_endpoint * iscsi_create_endpoint(int dd_size) { - struct device *dev; struct iscsi_endpoint *ep; - uint64_t id; - int err; - - for (id = 1; id < ISCSI_MAX_EPID; id++) { - dev = class_find_device(&iscsi_endpoint_class, NULL, &id, - iscsi_match_epid); - if (!dev) - break; - else - put_device(dev); - } - if (id == ISCSI_MAX_EPID) { - printk(KERN_ERR "Too many connections. Max supported %u\n", - ISCSI_MAX_EPID - 1); - return NULL; - } + int err, id; ep = kzalloc(sizeof(*ep) + dd_size, GFP_KERNEL); if (!ep) return NULL; + mutex_lock(&iscsi_ep_idr_mutex); + id = idr_alloc(&iscsi_ep_idr, ep, 0, -1, GFP_NOIO); + if (id < 0) { + mutex_unlock(&iscsi_ep_idr_mutex); + printk(KERN_ERR "Could not allocate endpoint ID. Error %d.\n", + id); + goto free_ep; + } + mutex_unlock(&iscsi_ep_idr_mutex); + ep->id = id; ep->dev.class = &iscsi_endpoint_class; - dev_set_name(&ep->dev, "ep-%llu", (unsigned long long) id); + dev_set_name(&ep->dev, "ep-%d", id); err = device_register(&ep->dev); if (err) - goto free_ep; + goto free_id; err = sysfs_create_group(&ep->dev.kobj, &iscsi_endpoint_group); if (err) @@ -249,6 +241,10 @@ unregister_dev: device_unregister(&ep->dev); return NULL; +free_id: + mutex_lock(&iscsi_ep_idr_mutex); + idr_remove(&iscsi_ep_idr, id); + mutex_unlock(&iscsi_ep_idr_mutex); free_ep: kfree(ep); return NULL; @@ -276,14 +272,17 @@ EXPORT_SYMBOL_GPL(iscsi_put_endpoint); */ struct iscsi_endpoint *iscsi_lookup_endpoint(u64 handle) { - struct device *dev; + struct iscsi_endpoint *ep; - dev = class_find_device(&iscsi_endpoint_class, NULL, &handle, - iscsi_match_epid); - if (!dev) - return NULL; + mutex_lock(&iscsi_ep_idr_mutex); + ep = idr_find(&iscsi_ep_idr, handle); + if (!ep) + goto unlock; - return iscsi_dev_to_endpoint(dev); + get_device(&ep->dev); +unlock: + mutex_unlock(&iscsi_ep_idr_mutex); + return ep; } EXPORT_SYMBOL_GPL(iscsi_lookup_endpoint); diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 037c77fb5dc5..3ecf9702287b 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -296,7 +296,7 @@ extern void iscsi_host_for_each_session(struct Scsi_Host *shost, struct iscsi_endpoint { void *dd_data; /* LLD private data */ struct device dev; - uint64_t id; + int id; struct iscsi_cls_conn *conn; }; -- cgit v1.2.3 From e4efe868aa14c8557841ecf30b13667f96a12111 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 7 Apr 2022 19:13:11 -0500 Subject: scsi: iscsi: Merge suspend fields [ Upstream commit 5bd856256f8c03e329f8ff36d8c8efcb111fe6df ] Move the tx and rx suspend fields into one flags field. Link: https://lore.kernel.org/r/20220408001314.5014-8-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/bnx2i/bnx2i_hwi.c | 2 +- drivers/scsi/bnx2i/bnx2i_iscsi.c | 2 +- drivers/scsi/cxgbi/libcxgbi.c | 6 +++--- drivers/scsi/libiscsi.c | 20 ++++++++++---------- drivers/scsi/libiscsi_tcp.c | 2 +- include/scsi/libiscsi.h | 9 +++++---- 6 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c index 5521469ce678..e16327a4b4c9 100644 --- a/drivers/scsi/bnx2i/bnx2i_hwi.c +++ b/drivers/scsi/bnx2i/bnx2i_hwi.c @@ -1977,7 +1977,7 @@ static int bnx2i_process_new_cqes(struct bnx2i_conn *bnx2i_conn) if (nopin->cq_req_sn != qp->cqe_exp_seq_sn) break; - if (unlikely(test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx))) { + if (unlikely(test_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags))) { if (nopin->op_code == ISCSI_OP_NOOP_IN && nopin->itt == (u16) RESERVED_ITT) { printk(KERN_ALERT "bnx2i: Unsolicited " diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c index 1b5f3e143f07..2e5241d12dc3 100644 --- a/drivers/scsi/bnx2i/bnx2i_iscsi.c +++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c @@ -1721,7 +1721,7 @@ static int bnx2i_tear_down_conn(struct bnx2i_hba *hba, struct iscsi_conn *conn = ep->conn->cls_conn->dd_data; /* Must suspend all rx queue activity for this ep */ - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx); + set_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags); } /* CONN_DISCONNECT timeout may or may not be an issue depending * on what transcribed in TCP layer, different targets behave diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c index 8c7d4dda4cf2..4365d52c6430 100644 --- a/drivers/scsi/cxgbi/libcxgbi.c +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -1634,11 +1634,11 @@ void cxgbi_conn_pdu_ready(struct cxgbi_sock *csk) log_debug(1 << CXGBI_DBG_PDU_RX, "csk 0x%p, conn 0x%p.\n", csk, conn); - if (unlikely(!conn || conn->suspend_rx)) { + if (unlikely(!conn || test_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags))) { log_debug(1 << CXGBI_DBG_PDU_RX, - "csk 0x%p, conn 0x%p, id %d, suspend_rx %lu!\n", + "csk 0x%p, conn 0x%p, id %d, conn flags 0x%lx!\n", csk, conn, conn ? conn->id : 0xFF, - conn ? conn->suspend_rx : 0xFF); + conn ? conn->flags : 0xFF); return; } diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index cbc263ec9d66..a4f26431b033 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1392,8 +1392,8 @@ static bool iscsi_set_conn_failed(struct iscsi_conn *conn) if (conn->stop_stage == 0) session->state = ISCSI_STATE_FAILED; - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx); + set_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); + set_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags); return true; } @@ -1454,7 +1454,7 @@ static int iscsi_xmit_task(struct iscsi_conn *conn, struct iscsi_task *task, * Do this after dropping the extra ref because if this was a requeue * it's removed from that list and cleanup_queued_task would miss it. */ - if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) { + if (test_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags)) { /* * Save the task and ref in case we weren't cleaning up this * task and get woken up again. @@ -1532,7 +1532,7 @@ static int iscsi_data_xmit(struct iscsi_conn *conn) int rc = 0; spin_lock_bh(&conn->session->frwd_lock); - if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) { + if (test_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags)) { ISCSI_DBG_SESSION(conn->session, "Tx suspended!\n"); spin_unlock_bh(&conn->session->frwd_lock); return -ENODATA; @@ -1746,7 +1746,7 @@ int iscsi_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *sc) goto fault; } - if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) { + if (test_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags)) { reason = FAILURE_SESSION_IN_RECOVERY; sc->result = DID_REQUEUE << 16; goto fault; @@ -1935,7 +1935,7 @@ static void fail_scsi_tasks(struct iscsi_conn *conn, u64 lun, int error) void iscsi_suspend_queue(struct iscsi_conn *conn) { spin_lock_bh(&conn->session->frwd_lock); - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + set_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); spin_unlock_bh(&conn->session->frwd_lock); } EXPORT_SYMBOL_GPL(iscsi_suspend_queue); @@ -1953,7 +1953,7 @@ void iscsi_suspend_tx(struct iscsi_conn *conn) struct Scsi_Host *shost = conn->session->host; struct iscsi_host *ihost = shost_priv(shost); - set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + set_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); if (ihost->workq) flush_workqueue(ihost->workq); } @@ -1961,7 +1961,7 @@ EXPORT_SYMBOL_GPL(iscsi_suspend_tx); static void iscsi_start_tx(struct iscsi_conn *conn) { - clear_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + clear_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); iscsi_conn_queue_work(conn); } @@ -3324,8 +3324,8 @@ int iscsi_conn_bind(struct iscsi_cls_session *cls_session, /* * Unblock xmitworker(), Login Phase will pass through. */ - clear_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx); - clear_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx); + clear_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags); + clear_bit(ISCSI_CONN_FLAG_SUSPEND_TX, &conn->flags); return 0; } EXPORT_SYMBOL_GPL(iscsi_conn_bind); diff --git a/drivers/scsi/libiscsi_tcp.c b/drivers/scsi/libiscsi_tcp.c index 2e9ffe3d1a55..883005757ddb 100644 --- a/drivers/scsi/libiscsi_tcp.c +++ b/drivers/scsi/libiscsi_tcp.c @@ -927,7 +927,7 @@ int iscsi_tcp_recv_skb(struct iscsi_conn *conn, struct sk_buff *skb, */ conn->last_recv = jiffies; - if (unlikely(conn->suspend_rx)) { + if (unlikely(test_bit(ISCSI_CONN_FLAG_SUSPEND_RX, &conn->flags))) { ISCSI_DBG_TCP(conn, "Rx suspended!\n"); *status = ISCSI_TCP_SUSPENDED; return 0; diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index 4ee233e5a6ff..bdb0ae11682d 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -52,8 +52,10 @@ enum { #define ISID_SIZE 6 -/* Connection suspend "bit" */ -#define ISCSI_SUSPEND_BIT 1 +/* Connection flags */ +#define ISCSI_CONN_FLAG_SUSPEND_TX BIT(0) +#define ISCSI_CONN_FLAG_SUSPEND_RX BIT(1) + #define ISCSI_ITT_MASK 0x1fff #define ISCSI_TOTAL_CMDS_MAX 4096 @@ -199,8 +201,7 @@ struct iscsi_conn { struct list_head cmdqueue; /* data-path cmd queue */ struct list_head requeue; /* tasks needing another run */ struct work_struct xmitwork; /* per-conn. xmit workqueue */ - unsigned long suspend_tx; /* suspend Tx */ - unsigned long suspend_rx; /* suspend Rx */ + unsigned long flags; /* ISCSI_CONN_FLAGs */ /* negotiated params */ unsigned max_recv_dlength; /* initiator_max_recv_dsl*/ -- cgit v1.2.3 From c7f4f3016fea68faba6c8846cc6b35717b253e7a Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 7 Apr 2022 19:13:12 -0500 Subject: scsi: iscsi: Fix NOP handling during conn recovery [ Upstream commit 44ac97109e42f87b1a34954704b81b6c8eca80c4 ] If a offload driver doesn't use the xmit workqueue, then when we are doing ep_disconnect libiscsi can still inject PDUs to the driver. This adds a check for if the connection is bound before trying to inject PDUs. Link: https://lore.kernel.org/r/20220408001314.5014-9-michael.christie@oracle.com Tested-by: Manish Rangankar Reviewed-by: Lee Duncan Reviewed-by: Chris Leech Signed-off-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/libiscsi.c | 7 ++++++- include/scsi/libiscsi.h | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index a4f26431b033..0f2c7098f9d6 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -678,7 +678,8 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr, struct iscsi_task *task; itt_t itt; - if (session->state == ISCSI_STATE_TERMINATE) + if (session->state == ISCSI_STATE_TERMINATE || + !test_bit(ISCSI_CONN_FLAG_BOUND, &conn->flags)) return NULL; if (opcode == ISCSI_OP_LOGIN || opcode == ISCSI_OP_TEXT) { @@ -2214,6 +2215,8 @@ void iscsi_conn_unbind(struct iscsi_cls_conn *cls_conn, bool is_active) iscsi_suspend_tx(conn); spin_lock_bh(&session->frwd_lock); + clear_bit(ISCSI_CONN_FLAG_BOUND, &conn->flags); + if (!is_active) { /* * if logout timed out before userspace could even send a PDU @@ -3312,6 +3315,8 @@ int iscsi_conn_bind(struct iscsi_cls_session *cls_session, spin_lock_bh(&session->frwd_lock); if (is_leading) session->leadconn = conn; + + set_bit(ISCSI_CONN_FLAG_BOUND, &conn->flags); spin_unlock_bh(&session->frwd_lock); /* diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index bdb0ae11682d..d1e282f0d6f1 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -55,7 +55,7 @@ enum { /* Connection flags */ #define ISCSI_CONN_FLAG_SUSPEND_TX BIT(0) #define ISCSI_CONN_FLAG_SUSPEND_RX BIT(1) - +#define ISCSI_CONN_FLAG_BOUND BIT(2) #define ISCSI_ITT_MASK 0x1fff #define ISCSI_TOTAL_CMDS_MAX 4096 -- cgit v1.2.3 From 07bdd207774c7d4ed0a5e8486a2ee6157271cad1 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 21 Apr 2022 16:35:40 -0700 Subject: memcg: sync flush only if periodic flush is delayed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9b3016154c913b2e7ec5ae5c9a42eb9e732d86aa upstream. Daniel Dao has reported [1] a regression on workloads that may trigger a lot of refaults (anon and file). The underlying issue is that flushing rstat is expensive. Although rstat flush are batched with (nr_cpus * MEMCG_BATCH) stat updates, it seems like there are workloads which genuinely do stat updates larger than batch value within short amount of time. Since the rstat flush can happen in the performance critical codepaths like page faults, such workload can suffer greatly. This patch fixes this regression by making the rstat flushing conditional in the performance critical codepaths. More specifically, the kernel relies on the async periodic rstat flusher to flush the stats and only if the periodic flusher is delayed by more than twice the amount of its normal time window then the kernel allows rstat flushing from the performance critical codepaths. Now the question: what are the side-effects of this change? The worst that can happen is the refault codepath will see 4sec old lruvec stats and may cause false (or missed) activations of the refaulted page which may under-or-overestimate the workingset size. Though that is not very concerning as the kernel can already miss or do false activations. There are two more codepaths whose flushing behavior is not changed by this patch and we may need to come to them in future. One is the writeback stats used by dirty throttling and second is the deactivation heuristic in the reclaim. For now keeping an eye on them and if there is report of regression due to these codepaths, we will reevaluate then. Link: https://lore.kernel.org/all/CA+wXwBSyO87ZX5PVwdHm-=dBjZYECGmfnydUicUyrQqndgX2MQ@mail.gmail.com [1] Link: https://lkml.kernel.org/r/20220304184040.1304781-1-shakeelb@google.com Fixes: 1f828223b799 ("memcg: flush lruvec stats in the refault") Signed-off-by: Shakeel Butt Reported-by: Daniel Dao Tested-by: Ivan Babrou Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Koutný Cc: Frank Hofmann Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/memcontrol.h | 5 +++++ mm/memcontrol.c | 12 +++++++++++- mm/workingset.c | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d9b8df5ef212..d35439db047c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1002,6 +1002,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, } void mem_cgroup_flush_stats(void); +void mem_cgroup_flush_stats_delayed(void); void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); @@ -1422,6 +1423,10 @@ static inline void mem_cgroup_flush_stats(void) { } +static inline void mem_cgroup_flush_stats_delayed(void) +{ +} + static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8cdeb33d2cf9..971546bb99e0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -650,6 +650,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +static u64 flush_next_time; + +#define FLUSH_TIME (2UL*HZ) static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { @@ -671,6 +674,7 @@ static void __mem_cgroup_flush_stats(void) if (!spin_trylock_irqsave(&stats_flush_lock, flag)) return; + flush_next_time = jiffies_64 + 2*FLUSH_TIME; cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); spin_unlock_irqrestore(&stats_flush_lock, flag); @@ -682,10 +686,16 @@ void mem_cgroup_flush_stats(void) __mem_cgroup_flush_stats(); } +void mem_cgroup_flush_stats_delayed(void) +{ + if (time_after64(jiffies_64, flush_next_time)) + mem_cgroup_flush_stats(); +} + static void flush_memcg_stats_dwork(struct work_struct *w) { __mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } /** diff --git a/mm/workingset.c b/mm/workingset.c index d5b81e4f4cbe..880d882f3325 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -352,7 +352,7 @@ void workingset_refault(struct page *page, void *shadow) inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file); - mem_cgroup_flush_stats(); + mem_cgroup_flush_stats_delayed(); /* * Compare the distance to the existing workingset size. We * don't activate pages that couldn't stay resident even if -- cgit v1.2.3 From 9dcb65cdf3128113ce9a48b05a72b6f8ef2bc257 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 21 Apr 2022 16:35:46 -0700 Subject: mm, hugetlb: allow for "high" userspace addresses commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream. This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") for hugetlb. This patch adds support for "high" userspace addresses that are optionally supported on the system and have to be requested via a hint mechanism ("high" addr parameter to mmap). Architectures such as powerpc and x86 achieve this by making changes to their architectural versions of hugetlb_get_unmapped_area() function. However, arm64 uses the generic version of that function. So take into account arch_get_mmap_base() and arch_get_mmap_end() in hugetlb_get_unmapped_area(). To allow that, move those two macros out of mm/mmap.c into include/linux/sched/mm.h If these macros are not defined in architectural code then they default to (TASK_SIZE) and (base) so should not introduce any behavioural changes to architectures that do not define them. For the time being, only ARM64 is affected by this change. Catalin (ARM64) said "We should have fixed hugetlb_get_unmapped_area() as well when we added support for 52-bit VA. The reason for commit f6795053dac8 was to prevent normal mmap() from returning addresses above 48-bit by default as some user-space had hard assumptions about this. It's a slight ABI change if you do this for hugetlb_get_unmapped_area() but I doubt anyone would notice. It's more likely that the current behaviour would cause issues, so I'd rather have them consistent. Basically when arm64 gained support for 52-bit addresses we did not want user-space calling mmap() to suddenly get such high addresses, otherwise we could have inadvertently broken some programs (similar behaviour to x86 here). Hence we added commit f6795053dac8. But we missed hugetlbfs which could still get such high mmap() addresses. So in theory that's a potential regression that should have bee addressed at the same time as commit f6795053dac8 (and before arm64 enabled 52-bit addresses)" Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses") Signed-off-by: Christophe Leroy Reviewed-by: Catalin Marinas Cc: Steve Capper Cc: Will Deacon Cc: [5.0.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hugetlbfs/inode.c | 9 +++++---- include/linux/sched/mm.h | 8 ++++++++ mm/mmap.c | 8 -------- 3 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 54c4e0b0dda4..bb0651a4a128 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = current->mm->mmap_base; + info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); @@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; + info.high_limit = arch_get_mmap_end(addr); addr = vm_unmapped_area(&info); } @@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + const unsigned long mmap_end = arch_get_mmap_end(addr); if (len & ~huge_page_mask(h)) return -EINVAL; @@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mmap_end - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 5561486fddef..95fb7aaaec8d 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -106,6 +106,14 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU +#ifndef arch_get_mmap_end +#define arch_get_mmap_end(addr) (TASK_SIZE) +#endif + +#ifndef arch_get_mmap_base +#define arch_get_mmap_base(addr, base) (base) +#endif + extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long diff --git a/mm/mmap.c b/mm/mmap.c index 049b8e5c18f0..6bb553ed5c55 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2113,14 +2113,6 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) return addr; } -#ifndef arch_get_mmap_end -#define arch_get_mmap_end(addr) (TASK_SIZE) -#endif - -#ifndef arch_get_mmap_base -#define arch_get_mmap_base(addr, base) (base) -#endif - /* Get an address range which is currently unmapped. * For shmat() with addr=0. * -- cgit v1.2.3 From 41ba681c63731872beab7bd86d8246e2ea121381 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Thu, 21 Apr 2022 16:36:01 -0700 Subject: oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup commit e4a38402c36e42df28eb1a5394be87e6571fb48a upstream. The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which can be targeted by the oom reaper. This mapping is used to store the futex robust list head; the kernel does not keep a copy of the robust list and instead references a userspace address to maintain the robustness during a process death. A race can occur between exit_mm and the oom reaper that allows the oom reaper to free the memory of the futex robust list before the exit path has handled the futex death: CPU1 CPU2 -------------------------------------------------------------------- page_fault do_exit "signal" wake_oom_reaper oom_reaper oom_reap_task_mm (invalidates mm) exit_mm exit_mm_release futex_exit_release futex_cleanup exit_robust_list get_user (EFAULT- can't access memory) If the get_user EFAULT's, the kernel will be unable to recover the waiters on the robust_list, leaving userspace mutexes hung indefinitely. Delay the OOM reaper, allowing more time for the exit path to perform the futex cleanup. Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer Based on a patch by Michal Hocko. Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1] Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: Joel Savitz Signed-off-by: Nico Pache Co-developed-by: Joel Savitz Suggested-by: Thomas Gleixner Acked-by: Thomas Gleixner Acked-by: Michal Hocko Cc: Rafael Aquini Cc: Waiman Long Cc: Herton R. Krzesinski Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Daniel Bristot de Oliveira Cc: David Rientjes Cc: Andrea Arcangeli Cc: Davidlohr Bueso Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Joel Savitz Cc: Darren Hart Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 1 + mm/oom_kill.c | 54 ++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9016bbacedf3..ad7ff332a0ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1436,6 +1436,7 @@ struct task_struct { int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; + struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bfa9e348c3a3..262f752d3d51 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -635,7 +635,7 @@ done: */ set_bit(MMF_OOM_SKIP, &mm->flags); - /* Drop a reference taken by wake_oom_reaper */ + /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); } @@ -645,12 +645,12 @@ static int oom_reaper(void *unused) struct task_struct *tsk = NULL; wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); - spin_lock(&oom_reaper_lock); + spin_lock_irq(&oom_reaper_lock); if (oom_reaper_list != NULL) { tsk = oom_reaper_list; oom_reaper_list = tsk->oom_reaper_list; } - spin_unlock(&oom_reaper_lock); + spin_unlock_irq(&oom_reaper_lock); if (tsk) oom_reap_task(tsk); @@ -659,22 +659,48 @@ static int oom_reaper(void *unused) return 0; } -static void wake_oom_reaper(struct task_struct *tsk) +static void wake_oom_reaper(struct timer_list *timer) { - /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) - return; + struct task_struct *tsk = container_of(timer, struct task_struct, + oom_reaper_timer); + struct mm_struct *mm = tsk->signal->oom_mm; + unsigned long flags; - get_task_struct(tsk); + /* The victim managed to terminate on its own - see exit_mmap */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + put_task_struct(tsk); + return; + } - spin_lock(&oom_reaper_lock); + spin_lock_irqsave(&oom_reaper_lock, flags); tsk->oom_reaper_list = oom_reaper_list; oom_reaper_list = tsk; - spin_unlock(&oom_reaper_lock); + spin_unlock_irqrestore(&oom_reaper_lock, flags); trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait); } +/* + * Give the OOM victim time to exit naturally before invoking the oom_reaping. + * The timers timeout is arbitrary... the longer it is, the longer the worst + * case scenario for the OOM can take. If it is too small, the oom_reaper can + * get in the way and release resources needed by the process exit path. + * e.g. The futex robust list can sit in Anon|Private memory that gets reaped + * before the exit path is able to wake the futex waiters. + */ +#define OOM_REAPER_DELAY (2*HZ) +static void queue_oom_reaper(struct task_struct *tsk) +{ + /* mm is already queued? */ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + return; + + get_task_struct(tsk); + timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); + tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; + add_timer(&tsk->oom_reaper_timer); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -682,7 +708,7 @@ static int __init oom_init(void) } subsys_initcall(oom_init) #else -static inline void wake_oom_reaper(struct task_struct *tsk) +static inline void queue_oom_reaper(struct task_struct *tsk) { } #endif /* CONFIG_MMU */ @@ -933,7 +959,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) rcu_read_unlock(); if (can_oom_reap) - wake_oom_reaper(victim); + queue_oom_reaper(victim); mmdrop(mm); put_task_struct(victim); @@ -969,7 +995,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) task_lock(victim); if (task_will_free_mem(victim)) { mark_oom_victim(victim); - wake_oom_reaper(victim); + queue_oom_reaper(victim); task_unlock(victim); put_task_struct(victim); return; @@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *oc) */ if (task_will_free_mem(current)) { mark_oom_victim(current); - wake_oom_reaper(current); + queue_oom_reaper(current); return true; } -- cgit v1.2.3 From bcba40bd36d705aba9c5fd4622e35118c2a46ed2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:22 +0100 Subject: netfilter: conntrack: convert to refcount_t api commit 719774377622bc4025d2a74f551b5dc2158c6c30 upstream. Convert nf_conn reference counting from atomic_t to refcount_t based api. refcount_t api provides more runtime sanity checks and will warn on certain constructs, e.g. refcount_inc() on a zero reference count, which usually indicates use-after-free. For this reason template allocation is changed to init the refcount to 1, the subsequenct add operations are removed. Likewise, init_conntrack() is changed to set the initial refcount to 1 instead refcount_inc(). This is safe because the new entry is not (yet) visible to other cpus. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- include/linux/netfilter/nf_conntrack_common.h | 8 ++++---- net/netfilter/nf_conntrack_core.c | 26 +++++++++++++------------- net/netfilter/nf_conntrack_expect.c | 4 ++-- net/netfilter/nf_conntrack_netlink.c | 6 +++--- net/netfilter/nf_conntrack_standalone.c | 4 ++-- net/netfilter/nf_flow_table_core.c | 2 +- net/netfilter/nf_synproxy_core.c | 1 - net/netfilter/nft_ct.c | 4 +--- net/netfilter/xt_CT.c | 3 +-- net/openvswitch/conntrack.c | 1 - net/sched/act_ct.c | 1 - 11 files changed, 27 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 700ea077ce2d..a03f7a80b9ab 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -2,7 +2,7 @@ #ifndef _NF_CONNTRACK_COMMON_H #define _NF_CONNTRACK_COMMON_H -#include +#include #include struct ip_conntrack_stat { @@ -25,19 +25,19 @@ struct ip_conntrack_stat { #define NFCT_PTRMASK ~(NFCT_INFOMASK) struct nf_conntrack { - atomic_t use; + refcount_t use; }; void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) { - if (nfct && atomic_dec_and_test(&nfct->use)) + if (nfct && refcount_dec_and_test(&nfct->use)) nf_conntrack_destroy(nfct); } static inline void nf_conntrack_get(struct nf_conntrack *nfct) { if (nfct) - atomic_inc(&nfct->use); + refcount_inc(&nfct->use); } #endif /* _NF_CONNTRACK_COMMON_H */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3a98a1316307..6d7840b8457b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -598,7 +598,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); nf_ct_zone_add(tmpl, zone); - atomic_set(&tmpl->ct_general.use, 0); + refcount_set(&tmpl->ct_general.use, 1); return tmpl; } @@ -631,7 +631,7 @@ destroy_conntrack(struct nf_conntrack *nfct) struct nf_conn *ct = (struct nf_conn *)nfct; pr_debug("destroy_conntrack(%p)\n", ct); - WARN_ON(atomic_read(&nfct->use) != 0); + WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { nf_ct_tmpl_free(ct); @@ -755,7 +755,7 @@ nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) /* caller must hold rcu readlock and none of the nf_conntrack_locks */ static void nf_ct_gc_expired(struct nf_conn *ct) { - if (!atomic_inc_not_zero(&ct->ct_general.use)) + if (!refcount_inc_not_zero(&ct->ct_general.use)) return; if (nf_ct_should_gc(ct)) @@ -823,7 +823,7 @@ __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, * in, try to obtain a reference and re-check tuple */ ct = nf_ct_tuplehash_to_ctrack(h); - if (likely(atomic_inc_not_zero(&ct->ct_general.use))) { + if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { if (likely(nf_ct_key_equal(h, tuple, zone, net))) goto found; @@ -920,7 +920,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) smp_wmb(); /* The caller holds a reference to this object */ - atomic_set(&ct->ct_general.use, 2); + refcount_set(&ct->ct_general.use, 2); __nf_conntrack_hash_insert(ct, hash, reply_hash); nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert); @@ -971,7 +971,7 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct) { struct nf_conn_tstamp *tstamp; - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); ct->status |= IPS_CONFIRMED; /* set conntrack timestamp, if enabled. */ @@ -1364,7 +1364,7 @@ static unsigned int early_drop_list(struct net *net, nf_ct_is_dying(tmp)) continue; - if (!atomic_inc_not_zero(&tmp->ct_general.use)) + if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; /* kill only if still in same netns -- might have moved due to @@ -1513,7 +1513,7 @@ static void gc_worker(struct work_struct *work) continue; /* need to take reference to avoid possible races */ - if (!atomic_inc_not_zero(&tmp->ct_general.use)) + if (!refcount_inc_not_zero(&tmp->ct_general.use)) continue; if (gc_worker_skip_ct(tmp)) { @@ -1622,7 +1622,7 @@ __nf_conntrack_alloc(struct net *net, /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ - atomic_set(&ct->ct_general.use, 0); + refcount_set(&ct->ct_general.use, 0); return ct; out: atomic_dec(&cnet->count); @@ -1647,7 +1647,7 @@ void nf_conntrack_free(struct nf_conn *ct) /* A freed object has refcnt == 0, that's * the golden rule for SLAB_TYPESAFE_BY_RCU */ - WARN_ON(atomic_read(&ct->ct_general.use) != 0); + WARN_ON(refcount_read(&ct->ct_general.use) != 0); nf_ct_ext_destroy(ct); kmem_cache_free(nf_conntrack_cachep, ct); @@ -1739,8 +1739,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (!exp) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); - /* Now it is inserted into the unconfirmed list, bump refcount */ - nf_conntrack_get(&ct->ct_general); + /* Now it is inserted into the unconfirmed list, set refcount to 1. */ + refcount_set(&ct->ct_general.use, 1); nf_ct_add_to_unconfirmed_list(ct); local_bh_enable(); @@ -2352,7 +2352,7 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), return NULL; found: - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); spin_unlock(lockp); local_bh_enable(); return ct; diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index f562eeef4234..6d056ebba57c 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -203,12 +203,12 @@ nf_ct_find_expectation(struct net *net, * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * - * The atomic_inc_not_zero() check tells: If that fails, we + * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. */ if (unlikely(nf_ct_is_dying(exp->master) || - !atomic_inc_not_zero(&exp->master->ct_general.use))) + !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 1c02be04aaf5..ef0a78aa9ba9 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -508,7 +508,7 @@ nla_put_failure: static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { - if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)))) + if (nla_put_be32(skb, CTA_USE, htonl(refcount_read(&ct->ct_general.use)))) goto nla_put_failure; return 0; @@ -1200,7 +1200,7 @@ restart: ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_is_expired(ct)) { if (i < ARRAY_SIZE(nf_ct_evict) && - atomic_inc_not_zero(&ct->ct_general.use)) + refcount_inc_not_zero(&ct->ct_general.use)) nf_ct_evict[i++] = ct; continue; } @@ -1748,7 +1748,7 @@ restart: NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, dying ? true : false, 0); if (res < 0) { - if (!atomic_inc_not_zero(&ct->ct_general.use)) + if (!refcount_inc_not_zero(&ct->ct_general.use)) continue; cb->args[0] = cpu; cb->args[1] = (unsigned long)ct; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 80f675d884b2..3e1afd10a9b6 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -303,7 +303,7 @@ static int ct_seq_show(struct seq_file *s, void *v) int ret = 0; WARN_ON(!ct); - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + if (unlikely(!refcount_inc_not_zero(&ct->ct_general.use))) return 0; if (nf_ct_should_gc(ct)) { @@ -370,7 +370,7 @@ static int ct_seq_show(struct seq_file *s, void *v) ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); - seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); + seq_printf(s, "use=%u\n", refcount_read(&ct->ct_general.use)); if (seq_has_overflowed(s)) goto release; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index ed37bb9b4e58..b90eca7a2f22 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -48,7 +48,7 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct) struct flow_offload *flow; if (unlikely(nf_ct_is_dying(ct) || - !atomic_inc_not_zero(&ct->ct_general.use))) + !refcount_inc_not_zero(&ct->ct_general.use))) return NULL; flow = kzalloc(sizeof(*flow), GFP_ATOMIC); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 3d6d49420db8..2dfc5dae0656 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -349,7 +349,6 @@ static int __net_init synproxy_net_init(struct net *net) goto err2; __set_bit(IPS_CONFIRMED_BIT, &ct->status); - nf_conntrack_get(&ct->ct_general); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 54ecb9fbf2de..ee69c692056f 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -259,7 +259,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); - if (likely(atomic_read(&ct->ct_general.use) == 1)) { + if (likely(refcount_read(&ct->ct_general.use) == 1)) { nf_ct_zone_add(ct, &zone); } else { /* previous skb got queued to userspace */ @@ -270,7 +270,6 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, } } - atomic_inc(&ct->ct_general.use); nf_ct_set(skb, ct, IP_CT_NEW); } #endif @@ -375,7 +374,6 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; } - atomic_set(&tmp->ct_general.use, 1); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 0a913ce07425..267757b0392a 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -24,7 +24,7 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) return XT_CONTINUE; if (ct) { - atomic_inc(&ct->ct_general.use); + refcount_inc(&ct->ct_general.use); nf_ct_set(skb, ct, IP_CT_NEW); } else { nf_ct_set(skb, ct, IP_CT_UNTRACKED); @@ -201,7 +201,6 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); - nf_conntrack_get(&ct->ct_general); out: info->ct = ct; return 0; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index f2b64cab9af7..815916056e0d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1722,7 +1722,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, goto err_free_ct; __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 553bf41671a6..f4fd584fba08 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1232,7 +1232,6 @@ static int tcf_ct_fill_params(struct net *net, return -ENOMEM; } __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); - nf_conntrack_get(&tmpl->ct_general); p->tmpl = tmpl; return 0; -- cgit v1.2.3 From 67e4860eeed86a1eec0a86467723f95cbd785076 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jan 2022 05:03:25 +0100 Subject: netfilter: conntrack: avoid useless indirection during conntrack destruction commit 6ae7989c9af0d98ab64196f4f4c6f6499454bd23 upstream. nf_ct_put() results in a usesless indirection: nf_ct_put -> nf_conntrack_put -> nf_conntrack_destroy -> rcu readlock + indirect call of ct_hooks->destroy(). There are two _put helpers: nf_ct_put and nf_conntrack_put. The latter is what should be used in code that MUST NOT cause a linker dependency on the conntrack module (e.g. calls from core network stack). Everyone else should call nf_ct_put() instead. A followup patch will convert a few nf_conntrack_put() calls to nf_ct_put(), in particular from modules that already have a conntrack dependency such as act_ct or even nf_conntrack itself. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- include/linux/netfilter/nf_conntrack_common.h | 2 ++ include/net/netfilter/nf_conntrack.h | 8 ++++++-- net/netfilter/nf_conntrack_core.c | 12 ++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index a03f7a80b9ab..2770db2fa080 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -29,6 +29,8 @@ struct nf_conntrack { }; void nf_conntrack_destroy(struct nf_conntrack *nfct); + +/* like nf_ct_put, but without module dependency on nf_conntrack */ static inline void nf_conntrack_put(struct nf_conntrack *nfct) { if (nfct && refcount_dec_and_test(&nfct->use)) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index d24b0a34c8f0..34c266502a50 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -76,6 +76,8 @@ struct nf_conn { * Hint, SKB address this struct and refcnt via skb->_nfct and * helpers nf_conntrack_get() and nf_conntrack_put(). * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt, + * except that the latter uses internal indirection and does not + * result in a conntrack module dependency. * beware nf_ct_get() is different and don't inc refcnt. */ struct nf_conntrack ct_general; @@ -169,11 +171,13 @@ nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) return (struct nf_conn *)(nfct & NFCT_PTRMASK); } +void nf_ct_destroy(struct nf_conntrack *nfct); + /* decrement reference count on a conntrack */ static inline void nf_ct_put(struct nf_conn *ct) { - WARN_ON(!ct); - nf_conntrack_put(&ct->ct_general); + if (ct && refcount_dec_and_test(&ct->ct_general.use)) + nf_ct_destroy(&ct->ct_general); } /* Protocol module loading */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6d7840b8457b..31399c53dfb1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -571,7 +571,7 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) -/* Released via destroy_conntrack() */ +/* Released via nf_ct_destroy() */ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, gfp_t flags) @@ -625,12 +625,11 @@ static void destroy_gre_conntrack(struct nf_conn *ct) #endif } -static void -destroy_conntrack(struct nf_conntrack *nfct) +void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; - pr_debug("destroy_conntrack(%p)\n", ct); + pr_debug("%s(%p)\n", __func__, ct); WARN_ON(refcount_read(&nfct->use) != 0); if (unlikely(nf_ct_is_template(ct))) { @@ -656,9 +655,10 @@ destroy_conntrack(struct nf_conntrack *nfct) if (ct->master) nf_ct_put(ct->master); - pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); + pr_debug("%s: returning ct=%p to slab\n", __func__, ct); nf_conntrack_free(ct); } +EXPORT_SYMBOL(nf_ct_destroy); static void nf_ct_delete_from_lists(struct nf_conn *ct) { @@ -2825,7 +2825,7 @@ err_cachep: static struct nf_ct_hook nf_conntrack_hook = { .update = nf_conntrack_update, - .destroy = destroy_conntrack, + .destroy = nf_ct_destroy, .get_tuple_skb = nf_conntrack_get_tuple_skb, }; -- cgit v1.2.3