From 5cadfbd5a11e5495cac217534c5f788168b1afd7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 Mar 2023 16:14:49 +0200 Subject: fuse: add feature flag for expire-only Add an init flag idicating whether the FUSE_EXPIRE_ONLY flag of FUSE_NOTIFY_INVAL_ENTRY is effective. This is needed for backports of this feature, otherwise the server could just check the protocol version. Fixes: 4f8d37020e1f ("fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY") Cc: # v6.2 Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 1b9d0dfae72d..b3fcab13fcd3 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -206,6 +206,7 @@ * - add extension header * - add FUSE_EXT_GROUPS * - add FUSE_CREATE_SUPP_GROUP + * - add FUSE_HAS_EXPIRE_ONLY */ #ifndef _LINUX_FUSE_H @@ -369,6 +370,7 @@ struct fuse_file_lock { * FUSE_HAS_INODE_DAX: use per inode DAX * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) + * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -406,6 +408,7 @@ struct fuse_file_lock { #define FUSE_SECURITY_CTX (1ULL << 32) #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) +#define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) /** * CUSE INIT request/reply flags -- cgit v1.2.3 From a6fe043880820981f6e4918240f967ea79bb063e Mon Sep 17 00:00:00 2001 From: Kameron Carr Date: Fri, 23 Jun 2023 15:09:49 -0700 Subject: Drivers: hv: Change hv_free_hyperv_page() to take void * argument Currently hv_free_hyperv_page() takes an unsigned long argument, which is inconsistent with the void * return value from the corresponding hv_alloc_hyperv_page() function and variants. This creates unnecessary extra casting. Change the hv_free_hyperv_page() argument type to void *. Also remove redundant casts from invocations of hv_alloc_hyperv_page() and variants. Signed-off-by: Kameron Carr Reviewed-by: Nuno Das Neves Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1687558189-19734-1-git-send-email-kameroncarr@linux.microsoft.com Signed-off-by: Wei Liu --- drivers/hv/connection.c | 13 ++++++------- drivers/hv/hv_common.c | 10 +++++----- include/asm-generic/mshyperv.h | 2 +- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index 5978e9dbc286..ebf15f31d97e 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -209,8 +209,7 @@ int vmbus_connect(void) * Setup the vmbus event connection for channel interrupt * abstraction stuff */ - vmbus_connection.int_page = - (void *)hv_alloc_hyperv_zeroed_page(); + vmbus_connection.int_page = hv_alloc_hyperv_zeroed_page(); if (vmbus_connection.int_page == NULL) { ret = -ENOMEM; goto cleanup; @@ -225,8 +224,8 @@ int vmbus_connect(void) * Setup the monitor notification facility. The 1st page for * parent->child and the 2nd page for child->parent */ - vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_page(); - vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_page(); + vmbus_connection.monitor_pages[0] = hv_alloc_hyperv_page(); + vmbus_connection.monitor_pages[1] = hv_alloc_hyperv_page(); if ((vmbus_connection.monitor_pages[0] == NULL) || (vmbus_connection.monitor_pages[1] == NULL)) { ret = -ENOMEM; @@ -333,15 +332,15 @@ void vmbus_disconnect(void) destroy_workqueue(vmbus_connection.work_queue); if (vmbus_connection.int_page) { - hv_free_hyperv_page((unsigned long)vmbus_connection.int_page); + hv_free_hyperv_page(vmbus_connection.int_page); vmbus_connection.int_page = NULL; } set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[0], 1); set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[1], 1); - hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[0]); - hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[1]); + hv_free_hyperv_page(vmbus_connection.monitor_pages[0]); + hv_free_hyperv_page(vmbus_connection.monitor_pages[1]); vmbus_connection.monitor_pages[0] = NULL; vmbus_connection.monitor_pages[1] = NULL; } diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 542a1d53b303..6a2258fef1fe 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -115,12 +115,12 @@ void *hv_alloc_hyperv_zeroed_page(void) } EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page); -void hv_free_hyperv_page(unsigned long addr) +void hv_free_hyperv_page(void *addr) { if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - free_page(addr); + free_page((unsigned long)addr); else - kfree((void *)addr); + kfree(addr); } EXPORT_SYMBOL_GPL(hv_free_hyperv_page); @@ -253,7 +253,7 @@ static void hv_kmsg_dump_unregister(void) atomic_notifier_chain_unregister(&panic_notifier_list, &hyperv_panic_report_block); - hv_free_hyperv_page((unsigned long)hv_panic_page); + hv_free_hyperv_page(hv_panic_page); hv_panic_page = NULL; } @@ -270,7 +270,7 @@ static void hv_kmsg_dump_register(void) ret = kmsg_dump_register(&hv_kmsg_dumper); if (ret) { pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); - hv_free_hyperv_page((unsigned long)hv_panic_page); + hv_free_hyperv_page(hv_panic_page); hv_panic_page = NULL; } } diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 402a8c1c202d..a8f4b653ef4e 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -190,7 +190,7 @@ int hv_common_cpu_die(unsigned int cpu); void *hv_alloc_hyperv_page(void); void *hv_alloc_hyperv_zeroed_page(void); -void hv_free_hyperv_page(unsigned long addr); +void hv_free_hyperv_page(void *addr); /** * hv_cpu_number_to_vp_number() - Map CPU to VP. -- cgit v1.2.3 From d1478aea649e739a0a0e4890cd8b049ae5d08c13 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 29 Jun 2023 18:01:32 +0200 Subject: memory: tegra: Add dummy implementation on Tegra194 With the introduction of commit 9365bf006f53 ("PCI: tegra194: Add interconnect support in Tegra234"), the PCI driver on Tegra194 and later requires an interconnect provider. However, a provider is currently only exposed on Tegra234 and this causes PCI on Tegra194 to defer probe indefinitely. Fix this by adding a dummy implementation on Tegra194. This allows nodes to be provided to interconnect consumers, but doesn't do any bandwidth accounting or frequency scaling. Fixes: 9365bf006f53 ("PCI: tegra194: Add interconnect support in Tegra234") Reported-by: Jon Hunter Signed-off-by: Thierry Reding Reviewed-by: Sumit Gupta Tested-by: Sumit Gupta Link: https://lore.kernel.org/r/20230629160132.768940-1-thierry.reding@gmail.com Signed-off-by: Krzysztof Kozlowski --- drivers/memory/tegra/mc.c | 37 +++++++++++++++++++++++++++++++++++++ drivers/memory/tegra/tegra194.c | 1 + drivers/memory/tegra/tegra234.c | 23 +---------------------- include/soc/tegra/mc.h | 3 +++ 4 files changed, 42 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/memory/tegra/mc.c b/drivers/memory/tegra/mc.c index 4a750da1c12a..deb6e65b59af 100644 --- a/drivers/memory/tegra/mc.c +++ b/drivers/memory/tegra/mc.c @@ -755,6 +755,43 @@ const char *const tegra_mc_error_names[8] = { [6] = "SMMU translation error", }; +struct icc_node *tegra_mc_icc_xlate(struct of_phandle_args *spec, void *data) +{ + struct tegra_mc *mc = icc_provider_to_tegra_mc(data); + struct icc_node *node; + + list_for_each_entry(node, &mc->provider.nodes, node_list) { + if (node->id == spec->args[0]) + return node; + } + + /* + * If a client driver calls devm_of_icc_get() before the MC driver + * is probed, then return EPROBE_DEFER to the client driver. + */ + return ERR_PTR(-EPROBE_DEFER); +} + +static int tegra_mc_icc_get(struct icc_node *node, u32 *average, u32 *peak) +{ + *average = 0; + *peak = 0; + + return 0; +} + +static int tegra_mc_icc_set(struct icc_node *src, struct icc_node *dst) +{ + return 0; +} + +const struct tegra_mc_icc_ops tegra_mc_icc_ops = { + .xlate = tegra_mc_icc_xlate, + .aggregate = icc_std_aggregate, + .get_bw = tegra_mc_icc_get, + .set = tegra_mc_icc_set, +}; + /* * Memory Controller (MC) has few Memory Clients that are issuing memory * bandwidth allocation requests to the MC interconnect provider. The MC diff --git a/drivers/memory/tegra/tegra194.c b/drivers/memory/tegra/tegra194.c index b2416ee3ac26..26035ac3a1eb 100644 --- a/drivers/memory/tegra/tegra194.c +++ b/drivers/memory/tegra/tegra194.c @@ -1355,6 +1355,7 @@ const struct tegra_mc_soc tegra194_mc_soc = { MC_INT_SECURITY_VIOLATION | MC_INT_DECERR_EMEM, .has_addr_hi_reg = true, .ops = &tegra186_mc_ops, + .icc_ops = &tegra_mc_icc_ops, .ch_intmask = 0x00000f00, .global_intstatus_channel_shift = 8, }; diff --git a/drivers/memory/tegra/tegra234.c b/drivers/memory/tegra/tegra234.c index 8e873a7bc34f..4469430aa5fb 100644 --- a/drivers/memory/tegra/tegra234.c +++ b/drivers/memory/tegra/tegra234.c @@ -889,27 +889,6 @@ static int tegra234_mc_icc_aggregate(struct icc_node *node, u32 tag, u32 avg_bw, return 0; } -static struct icc_node* -tegra234_mc_of_icc_xlate(struct of_phandle_args *spec, void *data) -{ - struct tegra_mc *mc = icc_provider_to_tegra_mc(data); - unsigned int cl_id = spec->args[0]; - struct icc_node *node; - - list_for_each_entry(node, &mc->provider.nodes, node_list) { - if (node->id != cl_id) - continue; - - return node; - } - - /* - * If a client driver calls devm_of_icc_get() before the MC driver - * is probed, then return EPROBE_DEFER to the client driver. - */ - return ERR_PTR(-EPROBE_DEFER); -} - static int tegra234_mc_icc_get_init_bw(struct icc_node *node, u32 *avg, u32 *peak) { *avg = 0; @@ -919,7 +898,7 @@ static int tegra234_mc_icc_get_init_bw(struct icc_node *node, u32 *avg, u32 *pea } static const struct tegra_mc_icc_ops tegra234_mc_icc_ops = { - .xlate = tegra234_mc_of_icc_xlate, + .xlate = tegra_mc_icc_xlate, .aggregate = tegra234_mc_icc_aggregate, .get_bw = tegra234_mc_icc_get_init_bw, .set = tegra234_mc_icc_set, diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h index fc3001483e62..a5ef84944a06 100644 --- a/include/soc/tegra/mc.h +++ b/include/soc/tegra/mc.h @@ -175,6 +175,9 @@ struct tegra_mc_icc_ops { int (*get_bw)(struct icc_node *node, u32 *avg, u32 *peak); }; +struct icc_node *tegra_mc_icc_xlate(struct of_phandle_args *spec, void *data); +extern const struct tegra_mc_icc_ops tegra_mc_icc_ops; + struct tegra_mc_ops { /* * @probe: Callback to set up SoC-specific bits of the memory controller. This is called -- cgit v1.2.3 From be22255360f80d3af789daad00025171a65424a5 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 6 Jun 2023 21:59:24 +0800 Subject: jbd2: remove t_checkpoint_io_list Since t_checkpoint_io_list was stop using in jbd2_log_do_checkpoint() now, it's time to remove the whole t_checkpoint_io_list logic. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230606135928.434610-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 42 ++---------------------------------------- fs/jbd2/commit.c | 3 +-- include/linux/jbd2.h | 6 ------ 3 files changed, 3 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 25e3c20eb19f..55d6efdbea64 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -27,7 +27,7 @@ * * Called with j_list_lock held. */ -static inline void __buffer_unlink_first(struct journal_head *jh) +static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction = jh->b_cp_transaction; @@ -40,23 +40,6 @@ static inline void __buffer_unlink_first(struct journal_head *jh) } } -/* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. - */ -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - /* * Check a checkpoint buffer could be release or not. * @@ -503,15 +486,6 @@ again: break; if (need_resched() || spin_needbreak(&journal->j_list_lock)) break; - if (released) - continue; - - nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, - nr_to_scan, &released); - if (*nr_to_scan == 0) - break; - if (need_resched() || spin_needbreak(&journal->j_list_lock)) - break; } while (transaction != last_transaction); if (transaction != last_transaction) { @@ -566,17 +540,6 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) */ if (need_resched()) return; - if (ret) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret = journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, destroy); - if (need_resched()) - return; /* * Stop scanning if we couldn't free the transaction. This * avoids pointless scanning of transactions which still @@ -661,7 +624,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) jbd2_journal_put_journal_head(jh); /* Is this transaction empty? */ - if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list) + if (transaction->t_checkpoint_list) return 0; /* @@ -753,7 +716,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_forget == NULL); J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(atomic_read(&transaction->t_updates) == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b33155dd7001..1073259902a6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1141,8 +1141,7 @@ restart_loop: spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; /* Check if the transaction can be dropped now that we are finished */ - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); jbd2_journal_free_transaction(commit_transaction); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d860499e15e4..bd660aac8e07 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -613,12 +613,6 @@ struct transaction_s */ struct journal_head *t_checkpoint_list; - /* - * Doubly-linked circular list of all buffers submitted for IO while - * checkpointing. [j_list_lock] - */ - struct journal_head *t_checkpoint_io_list; - /* * Doubly-linked circular list of metadata buffers being * shadowed by log IO. The IO buffers on the iobuf list and -- cgit v1.2.3 From b98dba273a0e47dbfade89c9af73c5b012a4eabb Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 6 Jun 2023 21:59:25 +0800 Subject: jbd2: remove journal_clean_one_cp_list() journal_clean_one_cp_list() and journal_shrink_one_cp_list() are almost the same, so merge them into journal_shrink_one_cp_list(), remove the nr_to_scan parameter, always scan and try to free the whole checkpoint list. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230606135928.434610-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 75 ++++++++++----------------------------------- include/trace/events/jbd2.h | 12 +++----- 2 files changed, 21 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 55d6efdbea64..b94f847960c2 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -347,50 +347,10 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ -/* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and - * release them. If 'destroy' is set, clean all buffers unconditionally. - * - * Called with j_list_lock held. - * Returns 1 if we freed the transaction, 0 otherwise. - */ -static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - - if (!destroy && __cp_buffer_busy(jh)) - return 0; - - if (__jbd2_journal_remove_checkpoint(jh)) - return 1; - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. Bail out if preemption - * requested: - */ - if (need_resched()) - return 0; - } while (jh != last_jh); - - return 0; -} - /* * journal_shrink_one_cp_list * - * Find 'nr_to_scan' written-back checkpoint buffers in the given list + * Find all the written-back checkpoint buffers in the given list * and try to release them. If the whole transaction is released, set * the 'released' parameter. Return the number of released checkpointed * buffers. @@ -398,15 +358,15 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) * Called with j_list_lock held. */ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - unsigned long *nr_to_scan, - bool *released) + bool destroy, bool *released) { struct journal_head *last_jh; struct journal_head *next_jh = jh; unsigned long nr_freed = 0; int ret; - if (!jh || *nr_to_scan == 0) + *released = false; + if (!jh) return 0; last_jh = jh->b_cpprev; @@ -414,8 +374,7 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - (*nr_to_scan)--; - if (__cp_buffer_busy(jh)) + if (!destroy && __cp_buffer_busy(jh)) continue; nr_freed++; @@ -427,7 +386,7 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, if (need_resched()) break; - } while (jh != last_jh && *nr_to_scan); + } while (jh != last_jh); return nr_freed; } @@ -445,11 +404,11 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan) { transaction_t *transaction, *last_transaction, *next_transaction; - bool released; + bool __maybe_unused released; tid_t first_tid = 0, last_tid = 0, next_tid = 0; tid_t tid = 0; unsigned long nr_freed = 0; - unsigned long nr_scanned = *nr_to_scan; + unsigned long freed; again: spin_lock(&journal->j_list_lock); @@ -478,10 +437,11 @@ again: transaction = next_transaction; next_transaction = transaction->t_cpnext; tid = transaction->t_tid; - released = false; - nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, - nr_to_scan, &released); + freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, + false, &released); + nr_freed += freed; + (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) break; if (need_resched() || spin_needbreak(&journal->j_list_lock)) @@ -502,9 +462,8 @@ again: if (*nr_to_scan && next_tid) goto again; out: - nr_scanned -= *nr_to_scan; trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, - nr_freed, nr_scanned, next_tid); + nr_freed, next_tid); return nr_freed; } @@ -520,7 +479,7 @@ out: void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret; + bool released; transaction = journal->j_checkpoint_transactions; if (!transaction) @@ -531,8 +490,8 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret = journal_clean_one_cp_list(transaction->t_checkpoint_list, - destroy); + journal_shrink_one_cp_list(transaction->t_checkpoint_list, + destroy, &released); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if @@ -545,7 +504,7 @@ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) * avoids pointless scanning of transactions which still * weren't checkpointed. */ - if (!ret) + if (!released) return; } while (transaction != last_transaction); } diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 8f5ee380d309..5646ae15a957 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -462,11 +462,9 @@ TRACE_EVENT(jbd2_shrink_scan_exit, TRACE_EVENT(jbd2_shrink_checkpoint_list, TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, - unsigned long nr_freed, unsigned long nr_scanned, - tid_t next_tid), + unsigned long nr_freed, tid_t next_tid), - TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, - nr_scanned, next_tid), + TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, next_tid), TP_STRUCT__entry( __field(dev_t, dev) @@ -474,7 +472,6 @@ TRACE_EVENT(jbd2_shrink_checkpoint_list, __field(tid_t, tid) __field(tid_t, last_tid) __field(unsigned long, nr_freed) - __field(unsigned long, nr_scanned) __field(tid_t, next_tid) ), @@ -484,15 +481,14 @@ TRACE_EVENT(jbd2_shrink_checkpoint_list, __entry->tid = tid; __entry->last_tid = last_tid; __entry->nr_freed = nr_freed; - __entry->nr_scanned = nr_scanned; __entry->next_tid = next_tid; ), TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " - "scanned %lu next transaction %u", + "next transaction %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->first_tid, __entry->tid, __entry->last_tid, - __entry->nr_freed, __entry->nr_scanned, __entry->next_tid) + __entry->nr_freed, __entry->next_tid) ); #endif /* _TRACE_JBD2_H */ -- cgit v1.2.3 From 46f881b5b1758dc4a35fba4a643c10717d0cf427 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 6 Jun 2023 21:59:27 +0800 Subject: jbd2: fix a race when checking checkpoint buffer busy Before removing checkpoint buffer from the t_checkpoint_list, we have to check both BH_Dirty and BH_Lock bits together to distinguish buffers have not been or were being written back. But __cp_buffer_busy() checks them separately, it first check lock state and then check dirty, the window between these two checks could be raced by writing back procedure, which locks buffer and clears buffer dirty before I/O completes. So it cannot guarantee checkpointing buffers been written back to disk if some error happens later. Finally, it may clean checkpoint transactions and lead to inconsistent filesystem. jbd2_journal_forget() and __journal_try_to_free_buffer() also have the same problem (journal_unmap_buffer() escape from this issue since it's running under the buffer lock), so fix them through introducing a new helper to try holding the buffer lock and remove really clean buffer. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217490 Cc: stable@vger.kernel.org Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230606135928.434610-6-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 38 +++++++++++++++++++++++++++++++++++--- fs/jbd2/transaction.c | 17 +++++------------ include/linux/jbd2.h | 1 + 3 files changed, 41 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 42b34cab64fb..9ec91017a7f3 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -376,11 +376,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - if (!destroy && __cp_buffer_busy(jh)) - continue; + if (destroy) { + ret = __jbd2_journal_remove_checkpoint(jh); + } else { + ret = jbd2_journal_try_remove_checkpoint(jh); + if (ret < 0) + continue; + } nr_freed++; - ret = __jbd2_journal_remove_checkpoint(jh); if (ret) { *released = true; break; @@ -616,6 +620,34 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) return 1; } +/* + * Check the checkpoint buffer and try to remove it from the checkpoint + * list if it's clean. Returns -EBUSY if it is not clean, returns 1 if + * it frees the transaction, 0 otherwise. + * + * This function is called with j_list_lock held. + */ +int jbd2_journal_try_remove_checkpoint(struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + if (!trylock_buffer(bh)) + return -EBUSY; + if (buffer_dirty(bh)) { + unlock_buffer(bh); + return -EBUSY; + } + unlock_buffer(bh); + + /* + * Buffer is clean and the IO has finished (we held the buffer + * lock) so the checkpoint is done. We can safely remove the + * buffer from this transaction. + */ + JBUFFER_TRACE(jh, "remove from checkpoint list"); + return __jbd2_journal_remove_checkpoint(jh); +} + /* * journal_insert_checkpoint: put a committed buffer onto a checkpoint * list so that we know when it is safe to clean the transaction out of diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 18611241f451..6ef5022949c4 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1784,8 +1784,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) * Otherwise, if the buffer has been written to disk, * it is safe to remove the checkpoint and drop it. */ - if (!buffer_dirty(bh)) { - __jbd2_journal_remove_checkpoint(jh); + if (jbd2_journal_try_remove_checkpoint(jh) >= 0) { spin_unlock(&journal->j_list_lock); goto drop; } @@ -2112,20 +2111,14 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) jh = bh2jh(bh); - if (buffer_locked(bh) || buffer_dirty(bh)) - goto out; - if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) - goto out; + return; spin_lock(&journal->j_list_lock); - if (jh->b_cp_transaction != NULL) { - /* written-back checkpointed metadata buffer */ - JBUFFER_TRACE(jh, "remove from checkpoint list"); - __jbd2_journal_remove_checkpoint(jh); - } + /* Remove written-back checkpointed metadata buffer */ + if (jh->b_cp_transaction != NULL) + jbd2_journal_try_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); -out: return; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index bd660aac8e07..44c298aa58d4 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1443,6 +1443,7 @@ extern void jbd2_journal_commit_transaction(journal_t *); void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); +int jbd2_journal_try_remove_checkpoint(struct journal_head *jh); void jbd2_journal_destroy_checkpoint(journal_t *journal); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); -- cgit v1.2.3 From b321c31c9b7b309dcde5e8854b741c8e6a9a05f0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 13 Jul 2023 08:06:57 +0100 Subject: KVM: arm64: vgic-v4: Make the doorbell request robust w.r.t preemption Xiang reports that VMs occasionally fail to boot on GICv4.1 systems when running a preemptible kernel, as it is possible that a vCPU is blocked without requesting a doorbell interrupt. The issue is that any preemption that occurs between vgic_v4_put() and schedule() on the block path will mark the vPE as nonresident and *not* request a doorbell irq. This occurs because when the vcpu thread is resumed on its way to block, vcpu_load() will make the vPE resident again. Once the vcpu actually blocks, we don't request a doorbell anymore, and the vcpu won't be woken up on interrupt delivery. Fix it by tracking that we're entering WFI, and key the doorbell request on that flag. This allows us not to make the vPE resident when going through a preempt/schedule cycle, meaning we don't lose any state. Cc: stable@vger.kernel.org Fixes: 8e01d9a396e6 ("KVM: arm64: vgic-v4: Move the GICv4 residency flow to be driven by vcpu_load/put") Reported-by: Xiang Chen Suggested-by: Zenghui Yu Tested-by: Xiang Chen Co-developed-by: Oliver Upton Signed-off-by: Marc Zyngier Acked-by: Zenghui Yu Link: https://lore.kernel.org/r/20230713070657.3873244-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/arm.c | 6 ++++-- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- arch/arm64/kvm/vgic/vgic-v4.c | 7 +++++-- include/kvm/arm_vgic.h | 2 +- 5 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8b6096753740..d3dd05bbfe23 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -727,6 +727,8 @@ struct kvm_vcpu_arch { #define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5)) /* PMUSERENR for the guest EL0 is on physical CPU */ #define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(6)) +/* WFI instruction trapped */ +#define IN_WFI __vcpu_single_flag(sflags, BIT(7)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a402ea5511f3..72dc53a75d1c 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -718,13 +718,15 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) */ preempt_disable(); kvm_vgic_vmcr_sync(vcpu); - vgic_v4_put(vcpu, true); + vcpu_set_flag(vcpu, IN_WFI); + vgic_v4_put(vcpu); preempt_enable(); kvm_vcpu_halt(vcpu); vcpu_clear_flag(vcpu, IN_WFIT); preempt_disable(); + vcpu_clear_flag(vcpu, IN_WFI); vgic_v4_load(vcpu); preempt_enable(); } @@ -792,7 +794,7 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) { /* The distributor enable bits were changed */ preempt_disable(); - vgic_v4_put(vcpu, false); + vgic_v4_put(vcpu); vgic_v4_load(vcpu); preempt_enable(); } diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index c3b8e132d599..3dfc8b84e03e 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -749,7 +749,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - WARN_ON(vgic_v4_put(vcpu, false)); + WARN_ON(vgic_v4_put(vcpu)); vgic_v3_vmcr_sync(vcpu); diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c index c1c28fe680ba..339a55194b2c 100644 --- a/arch/arm64/kvm/vgic/vgic-v4.c +++ b/arch/arm64/kvm/vgic/vgic-v4.c @@ -336,14 +336,14 @@ void vgic_v4_teardown(struct kvm *kvm) its_vm->vpes = NULL; } -int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db) +int vgic_v4_put(struct kvm_vcpu *vcpu) { struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident) return 0; - return its_make_vpe_non_resident(vpe, need_db); + return its_make_vpe_non_resident(vpe, !!vcpu_get_flag(vcpu, IN_WFI)); } int vgic_v4_load(struct kvm_vcpu *vcpu) @@ -354,6 +354,9 @@ int vgic_v4_load(struct kvm_vcpu *vcpu) if (!vgic_supports_direct_msis(vcpu->kvm) || vpe->resident) return 0; + if (vcpu_get_flag(vcpu, IN_WFI)) + return 0; + /* * Before making the VPE resident, make sure the redistributor * corresponding to our current CPU expects us here. See the diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 402b545959af..5b27f94d4fad 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -431,7 +431,7 @@ int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq, int vgic_v4_load(struct kvm_vcpu *vcpu); void vgic_v4_commit(struct kvm_vcpu *vcpu); -int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db); +int vgic_v4_put(struct kvm_vcpu *vcpu); /* CPU HP callbacks */ void kvm_vgic_cpu_up(void); -- cgit v1.2.3 From a66557c790208610bdc21c0c788e7b9524b1a356 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:19 -0700 Subject: net: bonding: remove kernel-doc comment marker Change an errant kernel-doc comment marker (/**) to a regular comment to prevent a kernel-doc warning. bonding.h:282: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Returns NULL if the net_device does not belong to any of the bond's slaves Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Randy Dunlap Cc: Jay Vosburgh Cc: Andy Gospodarek Link: https://lore.kernel.org/r/20230714045127.18752-2-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/bonding.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index b57bec6e737e..30ac427cf0c6 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -277,7 +277,7 @@ struct bond_vlan_tag { unsigned short vlan_id; }; -/** +/* * Returns NULL if the net_device does not belong to any of the bond's slaves * * Caller must hold bond lock for read -- cgit v1.2.3 From a63e40444e1bdb2ccdcd7c2e4afa51fdbc6c8589 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:20 -0700 Subject: net: cfg802154: fix kernel-doc notation warnings Add an enum heading to the kernel-doc comments to prevent kernel-doc warnings. cfg802154.h:174: warning: Cannot understand * @WPAN_PHY_FLAG_TRANSMIT_POWER: Indicates that transceiver will support on line 174 - I thought it was a doc line cfg802154.h:192: warning: Enum value 'WPAN_PHY_FLAG_TXPOWER' not described in enum 'wpan_phy_flags' cfg802154.h:192: warning: Excess enum value 'WPAN_PHY_FLAG_TRANSMIT_POWER' description in 'wpan_phy_flags' Fixes: edea8f7c75ec ("cfg802154: introduce wpan phy flags") Signed-off-by: Randy Dunlap Cc: Alexander Aring Cc: Stefan Schmidt Cc: Marcel Holtmann Acked-by: Miquel Raynal Link: https://lore.kernel.org/r/20230714045127.18752-3-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/cfg802154.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index e00057984489..f79ce133e51a 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -170,7 +170,8 @@ wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b) } /** - * @WPAN_PHY_FLAG_TRANSMIT_POWER: Indicates that transceiver will support + * enum wpan_phy_flags - WPAN PHY state flags + * @WPAN_PHY_FLAG_TXPOWER: Indicates that transceiver will support * transmit power setting. * @WPAN_PHY_FLAG_CCA_ED_LEVEL: Indicates that transceiver will support cca ed * level setting. -- cgit v1.2.3 From cfe57122bba5dc16ad64b162d97e42923fc7587e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:21 -0700 Subject: codel: fix kernel-doc notation warnings Use '@' before the struct member names in kernel-doc notation to prevent kernel-doc warnings. codel.h:158: warning: Function parameter or member 'ecn_mark' not described in 'codel_stats' codel.h:158: warning: Function parameter or member 'ce_mark' not described in 'codel_stats' Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM") Signed-off-by: Randy Dunlap Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: Dave Taht Link: https://lore.kernel.org/r/20230714045127.18752-4-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/codel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/codel.h b/include/net/codel.h index 5fed2f16cb8d..aa80f744826c 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -145,8 +145,8 @@ struct codel_vars { * @maxpacket: largest packet we've seen so far * @drop_count: temp count of dropped packets in dequeue() * @drop_len: bytes of dropped packets in dequeue() - * ecn_mark: number of packets we ECN marked instead of dropping - * ce_mark: number of packets CE marked because sojourn time was above ce_threshold + * @ecn_mark: number of packets we ECN marked instead of dropping + * @ce_mark: number of packets CE marked because sojourn time was above ce_threshold */ struct codel_stats { u32 maxpacket; -- cgit v1.2.3 From 839f55c5ebdfc2c0825bc7e711a7dedaca11f84e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:22 -0700 Subject: devlink: fix kernel-doc notation warnings Spell function or struct member names correctly. Use ':' instead of '-' for struct member entries. Mark one field as private in kernel-doc. Add a few entries that were missing. Fix a typo. These changes prevent kernel-doc warnings: devlink.h:252: warning: Function parameter or member 'field_id' not described in 'devlink_dpipe_match' devlink.h:267: warning: Function parameter or member 'field_id' not described in 'devlink_dpipe_action' devlink.h:310: warning: Function parameter or member 'match_values_count' not described in 'devlink_dpipe_entry' devlink.h:355: warning: Function parameter or member 'list' not described in 'devlink_dpipe_table' devlink.h:374: warning: Function parameter or member 'actions_dump' not described in 'devlink_dpipe_table_ops' devlink.h:374: warning: Function parameter or member 'matches_dump' not described in 'devlink_dpipe_table_ops' devlink.h:374: warning: Function parameter or member 'entries_dump' not described in 'devlink_dpipe_table_ops' devlink.h:374: warning: Function parameter or member 'counters_set_update' not described in 'devlink_dpipe_table_ops' devlink.h:374: warning: Function parameter or member 'size_get' not described in 'devlink_dpipe_table_ops' devlink.h:384: warning: Function parameter or member 'headers' not described in 'devlink_dpipe_headers' devlink.h:384: warning: Function parameter or member 'headers_count' not described in 'devlink_dpipe_headers' devlink.h:398: warning: Function parameter or member 'unit' not described in 'devlink_resource_size_params' devlink.h:487: warning: Function parameter or member 'id' not described in 'devlink_param' devlink.h:645: warning: Function parameter or member 'overwrite_mask' not described in 'devlink_flash_update_params' Fixes: 1555d204e743 ("devlink: Support for pipeline debug (dpipe)") Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction") Fixes: eabaef1896bc ("devlink: Add devlink_param register and unregister") Fixes: 5d5b4128c4ca ("devlink: introduce flash update overwrite mask") Signed-off-by: Randy Dunlap Cc: Jiri Pirko Cc: Moshe Shemesh Cc: Jacob Keller Link: https://lore.kernel.org/r/20230714045127.18752-5-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 9a3c51aa6e81..0cdb4b16e5b5 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -221,7 +221,7 @@ struct devlink_dpipe_field { /** * struct devlink_dpipe_header - dpipe header object * @name: header name - * @id: index, global/local detrmined by global bit + * @id: index, global/local determined by global bit * @fields: fields * @fields_count: number of fields * @global: indicates if header is shared like most protocol header @@ -241,7 +241,7 @@ struct devlink_dpipe_header { * @header_index: header index (packets can have several headers of same * type like in case of tunnels) * @header: header - * @fieled_id: field index + * @field_id: field index */ struct devlink_dpipe_match { enum devlink_dpipe_match_type type; @@ -256,7 +256,7 @@ struct devlink_dpipe_match { * @header_index: header index (packets can have several headers of same * type like in case of tunnels) * @header: header - * @fieled_id: field index + * @field_id: field index */ struct devlink_dpipe_action { enum devlink_dpipe_action_type type; @@ -292,7 +292,7 @@ struct devlink_dpipe_value { * struct devlink_dpipe_entry - table entry object * @index: index of the entry in the table * @match_values: match values - * @matche_values_count: count of matches tuples + * @match_values_count: count of matches tuples * @action_values: actions values * @action_values_count: count of actions values * @counter: value of counter @@ -342,7 +342,9 @@ struct devlink_dpipe_table_ops; */ struct devlink_dpipe_table { void *priv; + /* private: */ struct list_head list; + /* public: */ const char *name; bool counters_enabled; bool counter_control_extern; @@ -355,13 +357,13 @@ struct devlink_dpipe_table { /** * struct devlink_dpipe_table_ops - dpipe_table ops - * @actions_dump - dumps all tables actions - * @matches_dump - dumps all tables matches - * @entries_dump - dumps all active entries in the table - * @counters_set_update - when changing the counter status hardware sync + * @actions_dump: dumps all tables actions + * @matches_dump: dumps all tables matches + * @entries_dump: dumps all active entries in the table + * @counters_set_update: when changing the counter status hardware sync * maybe needed to allocate/free counter related * resources - * @size_get - get size + * @size_get: get size */ struct devlink_dpipe_table_ops { int (*actions_dump)(void *priv, struct sk_buff *skb); @@ -374,8 +376,8 @@ struct devlink_dpipe_table_ops { /** * struct devlink_dpipe_headers - dpipe headers - * @headers - header array can be shared (global bit) or driver specific - * @headers_count - count of headers + * @headers: header array can be shared (global bit) or driver specific + * @headers_count: count of headers */ struct devlink_dpipe_headers { struct devlink_dpipe_header **headers; @@ -387,7 +389,7 @@ struct devlink_dpipe_headers { * @size_min: minimum size which can be set * @size_max: maximum size which can be set * @size_granularity: size granularity - * @size_unit: resource's basic unit + * @unit: resource's basic unit */ struct devlink_resource_size_params { u64 size_min; @@ -457,6 +459,7 @@ struct devlink_flash_notify { /** * struct devlink_param - devlink configuration parameter data + * @id: devlink parameter id number * @name: name of the parameter * @generic: indicates if the parameter is generic or driver specific * @type: parameter type @@ -632,6 +635,7 @@ enum devlink_param_generic_id { * struct devlink_flash_update_params - Flash Update parameters * @fw: pointer to the firmware data to update from * @component: the flash component to update + * @overwrite_mask: which types of flash update are supported (may be %0) * * With the exception of fw, drivers must opt-in to parameters by * setting the appropriate bit in the supported_flash_update_params field in -- cgit v1.2.3 From d20909a0689f585fb6443da2d8797f7ad549f931 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:23 -0700 Subject: inet: frags: eliminate kernel-doc warning Modify the anonymous enum kernel-doc content so that it doesn't cause a kernel-doc warning. inet_frag.h:33: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst Fixes: 1ab1934ed80a ("inet: frags: enum the flag definitions and add descriptions") Signed-off-by: Randy Dunlap Cc: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230714045127.18752-6-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/inet_frag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 325ad893f624..153960663ce4 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -29,7 +29,7 @@ struct fqdir { }; /** - * fragment queue flags + * enum: fragment queue flags * * @INET_FRAG_FIRST_IN: first fragment has arrived * @INET_FRAG_LAST_IN: final fragment has arrived -- cgit v1.2.3 From 201a08830d8c4698f97ce65329ba92e36b7f402a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:24 -0700 Subject: net: llc: fix kernel-doc notation warnings Use the corrent function parameter name or format to prevent kernel-doc warnings. Add 2 function parameter descriptions to prevent kernel-doc warnings. llc_pdu.h:278: warning: Function parameter or member 'da' not described in 'llc_pdu_decode_da' llc_pdu.h:278: warning: Excess function parameter 'sa' description in 'llc_pdu_decode_da' llc_pdu.h:330: warning: Function parameter or member 'skb' not described in 'llc_pdu_init_as_test_cmd' llc_pdu.h:379: warning: Function parameter or member 'svcs_supported' not described in 'llc_pdu_init_as_xid_cmd' llc_pdu.h:379: warning: Function parameter or member 'rx_window' not described in 'llc_pdu_init_as_xid_cmd' Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20230714045127.18752-7-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/llc_pdu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h index 49aa79c7b278..7e73f8e5e497 100644 --- a/include/net/llc_pdu.h +++ b/include/net/llc_pdu.h @@ -269,7 +269,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) /** * llc_pdu_decode_da - extracts dest address of input frame * @skb: input skb that destination address must be extracted from it - * @sa: pointer to destination address (6 byte array). + * @da: pointer to destination address (6 byte array). * * This function extracts destination address(MAC) of input frame. */ @@ -321,7 +321,7 @@ static inline void llc_pdu_init_as_ui_cmd(struct sk_buff *skb) /** * llc_pdu_init_as_test_cmd - sets PDU as TEST - * @skb - Address of the skb to build + * @skb: Address of the skb to build * * Sets a PDU as TEST */ @@ -369,6 +369,8 @@ struct llc_xid_info { /** * llc_pdu_init_as_xid_cmd - sets bytes 3, 4 & 5 of LLC header as XID * @skb: input skb that header must be set into it. + * @svcs_supported: The class of the LLC (I or II) + * @rx_window: The size of the receive window of the LLC * * This function sets third,fourth,fifth and sixth bytes of LLC header as * a XID PDU. -- cgit v1.2.3 From d1533d726aa1efca3a7ae8f40c94ccb149d22e6a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:25 -0700 Subject: net: NSH: fix kernel-doc notation warning Use the struct member's name and the correct format to prevent a kernel-doc warning. nsh.h:200: warning: Function parameter or member 'context' not described in 'nsh_md1_ctx' Fixes: 1f0b7744c505 ("net: add NSH header structures and helpers") Signed-off-by: Randy Dunlap Cc: Jiri Benc Link: https://lore.kernel.org/r/20230714045127.18752-8-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/nsh.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/nsh.h b/include/net/nsh.h index 350b1ad11c7f..16a751093896 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -192,7 +192,7 @@ /** * struct nsh_md1_ctx - Keeps track of NSH context data - * @nshc<1-4>: NSH Contexts. + * @context: NSH Contexts. */ struct nsh_md1_ctx { __be32 context[4]; -- cgit v1.2.3 From d1cca974548d76e0fa8b6761d25f7b47376a3780 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:26 -0700 Subject: pie: fix kernel-doc notation warning Spell a struct member's name correctly to prevent a kernel-doc warning. pie.h:38: warning: Function parameter or member 'tupdate' not described in 'pie_params' Fixes: b42a3d7c7cff ("pie: improve comments and commenting style") Signed-off-by: Randy Dunlap Cc: Leslie Monis Cc: "Mohit P. Tahiliani" Cc: Gautam Ramakrishnan Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Link: https://lore.kernel.org/r/20230714045127.18752-9-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/pie.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/pie.h b/include/net/pie.h index 3fe2361e03b4..01cbc66825a4 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -17,7 +17,7 @@ /** * struct pie_params - contains pie parameters * @target: target delay in pschedtime - * @tudpate: interval at which drop probability is calculated + * @tupdate: interval at which drop probability is calculated * @limit: total number of packets that can be in the queue * @alpha: parameter to control drop probability * @beta: parameter to control drop probability -- cgit v1.2.3 From 04be3c95da8266a099c8446b6d1205ccf8a62e66 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:27 -0700 Subject: rsi: remove kernel-doc comment marker Change an errant kernel-doc comment marker (/**) to a regular comment to prevent a kernel-doc warning. rsi_91x.h:3: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Copyright (c) 2017 Redpine Signals Inc. Fixes: 4c10d56a76bb ("rsi: add header file rsi_91x") Signed-off-by: Randy Dunlap Cc: Prameela Rani Garnepudi Cc: Siva Rebbagondla Acked-by: Kalle Valo Link: https://lore.kernel.org/r/20230714045127.18752-10-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/rsi_91x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/rsi_91x.h b/include/net/rsi_91x.h index 040f07b47f1f..b2283762e446 100644 --- a/include/net/rsi_91x.h +++ b/include/net/rsi_91x.h @@ -1,4 +1,4 @@ -/** +/* * Copyright (c) 2017 Redpine Signals Inc. * * Permission to use, copy, modify, and/or distribute this software for any -- cgit v1.2.3 From dcb60f9c403e03133363563ac8ea5d8bba6c2be1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 12 Jul 2023 20:08:32 -0700 Subject: cpumask: eliminate kernel-doc warnings Update lib/cpumask.c and to fix all kernel-doc warnings: include/linux/cpumask.h:185: warning: Function parameter or member 'srcp1' not described in 'cpumask_first_and' include/linux/cpumask.h:185: warning: Function parameter or member 'srcp2' not described in 'cpumask_first_and' include/linux/cpumask.h:185: warning: Excess function parameter 'src1p' description in 'cpumask_first_and' include/linux/cpumask.h:185: warning: Excess function parameter 'src2p' description in 'cpumask_first_and' lib/cpumask.c:59: warning: Function parameter or member 'node' not described in 'alloc_cpumask_var_node' lib/cpumask.c:169: warning: Function parameter or member 'src1p' not described in 'cpumask_any_and_distribute' lib/cpumask.c:169: warning: Function parameter or member 'src2p' not described in 'cpumask_any_and_distribute' Fixes: 7b4967c53204 ("cpumask: Add alloc_cpumask_var_node()") Fixes: 839cad5fa54b ("cpumask: fix function description kernel-doc notation") Fixes: 93ba139ba819 ("cpumask: use find_first_and_bit()") Signed-off-by: Randy Dunlap Reviewed-by: Andy Shevchenko Signed-off-by: Yury Norov --- include/linux/cpumask.h | 8 ++++++-- lib/cpumask.c | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 0d2e2a38b92d..f10fb87d49db 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -175,8 +175,8 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 - * @src1p: the first input - * @src2p: the second input + * @srcp1: the first input + * @srcp2: the second input * * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ @@ -1197,6 +1197,10 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, /** * cpumap_print_list_to_buf - copies the cpumask into the buffer as * comma-separated list of cpus + * @buf: the buffer to copy into + * @mask: the cpumask to copy + * @off: in the string from which we are copying, we copy to @buf + * @count: the maximum number of bytes to print * * Everything is same with the above cpumap_print_bitmask_to_buf() * except the print format. diff --git a/lib/cpumask.c b/lib/cpumask.c index de356f16773a..a7fd02b5ae26 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -45,6 +45,7 @@ EXPORT_SYMBOL(cpumask_next_wrap); * alloc_cpumask_var_node - allocate a struct cpumask on a given node * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags + * @node: memory node from which to allocate or %NUMA_NO_NODE * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in ) @@ -157,7 +158,9 @@ EXPORT_SYMBOL(cpumask_local_spread); static DEFINE_PER_CPU(int, distribute_cpu_mask_prev); /** - * cpumask_any_and_distribute - Return an arbitrary cpu within srcp1 & srcp2. + * cpumask_any_and_distribute - Return an arbitrary cpu within src1p & src2p. + * @src1p: first &cpumask for intersection + * @src2p: second &cpumask for intersection * * Iterated calls using the same srcp1 and srcp2 will be distributed within * their intersection. -- cgit v1.2.3 From 97b1d320f48c21e40cc42b4ac033f2520f9ecc5c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 18 Jul 2023 10:41:50 -0700 Subject: llc: Check netns in llc_estab_match() and llc_listener_match(). We will remove this restriction in llc_rcv() in the following patch, which means that the protocol handler must be aware of netns. if (!net_eq(dev_net(dev), &init_net)) goto drop; llc_rcv() fetches llc_type_handlers[llc_pdu_type(skb) - 1] and calls it if not NULL. If the PDU type is LLC_DEST_CONN, llc_conn_handler() is called to pass skb to corresponding sockets. Then, we must look up a proper socket in the same netns with skb->dev. llc_conn_handler() calls __llc_lookup() to look up a established or litening socket by __llc_lookup_established() and llc_lookup_listener(). Both functions iterate on a list and call llc_estab_match() or llc_listener_match() to check if the socket is the correct destination. However, these functions do not check netns. Also, bind() and connect() call llc_establish_connection(), which finally calls __llc_lookup_established(), to check if there is a conflicting socket. Let's test netns in llc_estab_match() and llc_listener_match(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/llc_conn.h | 2 +- net/llc/af_llc.c | 2 +- net/llc/llc_conn.c | 49 ++++++++++++++++++++++++++++++------------------- net/llc/llc_if.c | 2 +- 4 files changed, 33 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index 2c1ea3414640..374411b3066c 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -111,7 +111,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit); void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit); int llc_conn_remove_acked_pdus(struct sock *conn, u8 nr, u16 *how_many_unacked); struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, - struct llc_addr *laddr); + struct llc_addr *laddr, const struct net *net); void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk); void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 57c35c960b2c..9b06c380866b 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -402,7 +402,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN); laddr.lsap = addr->sllc_sap; rc = -EADDRINUSE; /* mac + sap clash. */ - ask = llc_lookup_established(sap, &daddr, &laddr); + ask = llc_lookup_established(sap, &daddr, &laddr, &init_net); if (ask) { sock_put(ask); goto out_put; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 912aa9bd5e29..d037009ee10f 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -453,11 +453,13 @@ static int llc_exec_conn_trans_actions(struct sock *sk, static inline bool llc_estab_match(const struct llc_sap *sap, const struct llc_addr *daddr, const struct llc_addr *laddr, - const struct sock *sk) + const struct sock *sk, + const struct net *net) { struct llc_sock *llc = llc_sk(sk); - return llc->laddr.lsap == laddr->lsap && + return net_eq(sock_net(sk), net) && + llc->laddr.lsap == laddr->lsap && llc->daddr.lsap == daddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac) && ether_addr_equal(llc->daddr.mac, daddr->mac); @@ -468,6 +470,7 @@ static inline bool llc_estab_match(const struct llc_sap *sap, * @sap: SAP * @daddr: address of remote LLC (MAC + SAP) * @laddr: address of local LLC (MAC + SAP) + * @net: netns to look up a socket in * * Search connection list of the SAP and finds connection using the remote * mac, remote sap, local mac, and local sap. Returns pointer for @@ -476,7 +479,8 @@ static inline bool llc_estab_match(const struct llc_sap *sap, */ static struct sock *__llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; @@ -486,12 +490,12 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap, rcu_read_lock(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { - if (llc_estab_match(sap, daddr, laddr, rc)) { + if (llc_estab_match(sap, daddr, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || - !llc_estab_match(sap, daddr, laddr, rc))) { + !llc_estab_match(sap, daddr, laddr, rc, net))) { sock_put(rc); continue; } @@ -513,29 +517,33 @@ found: struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { struct sock *sk; local_bh_disable(); - sk = __llc_lookup_established(sap, daddr, laddr); + sk = __llc_lookup_established(sap, daddr, laddr, net); local_bh_enable(); return sk; } static inline bool llc_listener_match(const struct llc_sap *sap, const struct llc_addr *laddr, - const struct sock *sk) + const struct sock *sk, + const struct net *net) { struct llc_sock *llc = llc_sk(sk); - return sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN && + return net_eq(sock_net(sk), net) && + sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN && llc->laddr.lsap == laddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac); } static struct sock *__llc_lookup_listener(struct llc_sap *sap, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; @@ -545,12 +553,12 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap, rcu_read_lock(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { - if (llc_listener_match(sap, laddr, rc)) { + if (llc_listener_match(sap, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || - !llc_listener_match(sap, laddr, rc))) { + !llc_listener_match(sap, laddr, rc, net))) { sock_put(rc); continue; } @@ -574,6 +582,7 @@ found: * llc_lookup_listener - Finds listener for local MAC + SAP * @sap: SAP * @laddr: address of local LLC (MAC + SAP) + * @net: netns to look up a socket in * * Search connection list of the SAP and finds connection listening on * local mac, and local sap. Returns pointer for parent socket found, @@ -581,24 +590,26 @@ found: * Caller has to make sure local_bh is disabled. */ static struct sock *llc_lookup_listener(struct llc_sap *sap, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { + struct sock *rc = __llc_lookup_listener(sap, laddr, net); static struct llc_addr null_addr; - struct sock *rc = __llc_lookup_listener(sap, laddr); if (!rc) - rc = __llc_lookup_listener(sap, &null_addr); + rc = __llc_lookup_listener(sap, &null_addr, net); return rc; } static struct sock *__llc_lookup(struct llc_sap *sap, struct llc_addr *daddr, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { - struct sock *sk = __llc_lookup_established(sap, daddr, laddr); + struct sock *sk = __llc_lookup_established(sap, daddr, laddr, net); - return sk ? : llc_lookup_listener(sap, laddr); + return sk ? : llc_lookup_listener(sap, laddr, net); } /** @@ -776,7 +787,7 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) llc_pdu_decode_da(skb, daddr.mac); llc_pdu_decode_dsap(skb, &daddr.lsap); - sk = __llc_lookup(sap, &saddr, &daddr); + sk = __llc_lookup(sap, &saddr, &daddr, dev_net(skb->dev)); if (!sk) goto drop; diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index dde9bf08a593..58a5f419adc6 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -92,7 +92,7 @@ int llc_establish_connection(struct sock *sk, const u8 *lmac, u8 *dmac, u8 dsap) daddr.lsap = dsap; memcpy(daddr.mac, dmac, sizeof(daddr.mac)); memcpy(laddr.mac, lmac, sizeof(laddr.mac)); - existing = llc_lookup_established(llc->sap, &daddr, &laddr); + existing = llc_lookup_established(llc->sap, &daddr, &laddr, sock_net(sk)); if (existing) { if (existing->sk_state == TCP_ESTABLISHED) { sk = existing; -- cgit v1.2.3 From 195ef75e19287b4bc413da3e3e3722b030ac881e Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 19 Jun 2023 01:04:31 +0300 Subject: Bluetooth: use RCU for hci_conn_params and iterate safely in hci_sync hci_update_accept_list_sync iterates over hdev->pend_le_conns and hdev->pend_le_reports, and waits for controller events in the loop body, without holding hdev lock. Meanwhile, these lists and the items may be modified e.g. by le_scan_cleanup. This can invalidate the list cursor or any other item in the list, resulting to invalid behavior (eg use-after-free). Use RCU for the hci_conn_params action lists. Since the loop bodies in hci_sync block and we cannot use RCU or hdev->lock for the whole loop, copy list items first and then iterate on the copy. Only the flags field is written from elsewhere, so READ_ONCE/WRITE_ONCE should guarantee we read valid values. Free params everywhere with hci_conn_params_free so the cleanup is guaranteed to be done properly. This fixes the following, which can be triggered e.g. by BlueZ new mgmt-tester case "Add + Remove Device Nowait - Success", or by changing hci_le_set_cig_params to always return false, and running iso-tester: ================================================================== BUG: KASAN: slab-use-after-free in hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2536 net/bluetooth/hci_sync.c:2723 net/bluetooth/hci_sync.c:2841) Read of size 8 at addr ffff888001265018 by task kworker/u3:0/32 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 Workqueue: hci0 hci_cmd_sync_work Call Trace: dump_stack_lvl (./arch/x86/include/asm/irqflags.h:134 lib/dump_stack.c:107) print_report (mm/kasan/report.c:320 mm/kasan/report.c:430) ? __virt_addr_valid (./include/linux/mmzone.h:1915 ./include/linux/mmzone.h:2011 arch/x86/mm/physaddr.c:65) ? hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2536 net/bluetooth/hci_sync.c:2723 net/bluetooth/hci_sync.c:2841) kasan_report (mm/kasan/report.c:538) ? hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2536 net/bluetooth/hci_sync.c:2723 net/bluetooth/hci_sync.c:2841) hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2536 net/bluetooth/hci_sync.c:2723 net/bluetooth/hci_sync.c:2841) ? __pfx_hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2780) ? mutex_lock (kernel/locking/mutex.c:282) ? __pfx_mutex_lock (kernel/locking/mutex.c:282) ? __pfx_mutex_unlock (kernel/locking/mutex.c:538) ? __pfx_update_passive_scan_sync (net/bluetooth/hci_sync.c:2861) hci_cmd_sync_work (net/bluetooth/hci_sync.c:306) process_one_work (./arch/x86/include/asm/preempt.h:27 kernel/workqueue.c:2399) worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2538) ? __pfx_worker_thread (kernel/workqueue.c:2480) kthread (kernel/kthread.c:376) ? __pfx_kthread (kernel/kthread.c:331) ret_from_fork (arch/x86/entry/entry_64.S:314) Allocated by task 31: kasan_save_stack (mm/kasan/common.c:46) kasan_set_track (mm/kasan/common.c:52) __kasan_kmalloc (mm/kasan/common.c:374 mm/kasan/common.c:383) hci_conn_params_add (./include/linux/slab.h:580 ./include/linux/slab.h:720 net/bluetooth/hci_core.c:2277) hci_connect_le_scan (net/bluetooth/hci_conn.c:1419 net/bluetooth/hci_conn.c:1589) hci_connect_cis (net/bluetooth/hci_conn.c:2266) iso_connect_cis (net/bluetooth/iso.c:390) iso_sock_connect (net/bluetooth/iso.c:899) __sys_connect (net/socket.c:2003 net/socket.c:2020) __x64_sys_connect (net/socket.c:2027) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) Freed by task 15: kasan_save_stack (mm/kasan/common.c:46) kasan_set_track (mm/kasan/common.c:52) kasan_save_free_info (mm/kasan/generic.c:523) __kasan_slab_free (mm/kasan/common.c:238 mm/kasan/common.c:200 mm/kasan/common.c:244) __kmem_cache_free (mm/slub.c:1807 mm/slub.c:3787 mm/slub.c:3800) hci_conn_params_del (net/bluetooth/hci_core.c:2323) le_scan_cleanup (net/bluetooth/hci_conn.c:202) process_one_work (./arch/x86/include/asm/preempt.h:27 kernel/workqueue.c:2399) worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2538) kthread (kernel/kthread.c:376) ret_from_fork (arch/x86/entry/entry_64.S:314) ================================================================== Fixes: e8907f76544f ("Bluetooth: hci_sync: Make use of hci_cmd_sync_queue set 3") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 5 ++ net/bluetooth/hci_conn.c | 10 ++-- net/bluetooth/hci_core.c | 38 ++++++++++--- net/bluetooth/hci_event.c | 12 ++-- net/bluetooth/hci_sync.c | 117 +++++++++++++++++++++++++++++++++++---- net/bluetooth/mgmt.c | 26 ++++----- 6 files changed, 164 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9654567cfae3..870b6d3c5146 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -822,6 +822,7 @@ struct hci_conn_params { struct hci_conn *conn; bool explicit_connect; + /* Accessed without hdev->lock: */ hci_conn_flags_t flags; u8 privacy_mode; }; @@ -1573,7 +1574,11 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_clear_disabled(struct hci_dev *hdev); +void hci_conn_params_free(struct hci_conn_params *param); +void hci_pend_le_list_del_init(struct hci_conn_params *param); +void hci_pend_le_list_add(struct hci_conn_params *param, + struct list_head *list); struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 056f9516e46d..db598942cab4 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -118,7 +118,7 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) */ params->explicit_connect = false; - list_del_init(¶ms->action); + hci_pend_le_list_del_init(params); switch (params->auto_connect) { case HCI_AUTO_CONN_EXPLICIT: @@ -127,10 +127,10 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) return; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_add(params, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: - list_add(¶ms->action, &hdev->pend_le_reports); + hci_pend_le_list_add(params, &hdev->pend_le_reports); break; default: break; @@ -1426,8 +1426,8 @@ static int hci_explicit_conn_params_set(struct hci_dev *hdev, if (params->auto_connect == HCI_AUTO_CONN_DISABLED || params->auto_connect == HCI_AUTO_CONN_REPORT || params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { - list_del_init(¶ms->action); - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_del_init(params); + hci_pend_le_list_add(params, &hdev->pend_le_conns); } params->explicit_connect = true; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 48917c68358d..b421e196f60c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2249,21 +2249,45 @@ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, return NULL; } -/* This function requires the caller holds hdev->lock */ +/* This function requires the caller holds hdev->lock or rcu_read_lock */ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *param; - list_for_each_entry(param, list, action) { + rcu_read_lock(); + + list_for_each_entry_rcu(param, list, action) { if (bacmp(¶m->addr, addr) == 0 && - param->addr_type == addr_type) + param->addr_type == addr_type) { + rcu_read_unlock(); return param; + } } + rcu_read_unlock(); + return NULL; } +/* This function requires the caller holds hdev->lock */ +void hci_pend_le_list_del_init(struct hci_conn_params *param) +{ + if (list_empty(¶m->action)) + return; + + list_del_rcu(¶m->action); + synchronize_rcu(); + INIT_LIST_HEAD(¶m->action); +} + +/* This function requires the caller holds hdev->lock */ +void hci_pend_le_list_add(struct hci_conn_params *param, + struct list_head *list) +{ + list_add_rcu(¶m->action, list); +} + /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) @@ -2297,14 +2321,15 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, return params; } -static void hci_conn_params_free(struct hci_conn_params *params) +void hci_conn_params_free(struct hci_conn_params *params) { + hci_pend_le_list_del_init(params); + if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); } - list_del(¶ms->action); list_del(¶ms->list); kfree(params); } @@ -2342,8 +2367,7 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev) continue; } - list_del(¶ms->list); - kfree(params); + hci_conn_params_free(params); } BT_DBG("All LE disabled connection parameters were removed"); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 95816a938cea..fbc5cb8548f2 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1564,7 +1564,7 @@ static u8 hci_cc_le_set_privacy_mode(struct hci_dev *hdev, void *data, params = hci_conn_params_lookup(hdev, &cp->bdaddr, cp->bdaddr_type); if (params) - params->privacy_mode = cp->mode; + WRITE_ONCE(params->privacy_mode, cp->mode); hci_dev_unlock(hdev); @@ -2804,8 +2804,8 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - list_del_init(¶ms->action); - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_del_init(params); + hci_pend_le_list_add(params, &hdev->pend_le_conns); break; default: @@ -3423,8 +3423,8 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, void *data, case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - list_del_init(¶ms->action); - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_del_init(params); + hci_pend_le_list_add(params, &hdev->pend_le_conns); hci_update_passive_scan(hdev); break; @@ -5962,7 +5962,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, conn->dst_type); if (params) { - list_del_init(¶ms->action); + hci_pend_le_list_del_init(params); if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 8561616abbe5..4d1e32bb6a9c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2160,15 +2160,23 @@ static int hci_le_del_accept_list_sync(struct hci_dev *hdev, return 0; } +struct conn_params { + bdaddr_t addr; + u8 addr_type; + hci_conn_flags_t flags; + u8 privacy_mode; +}; + /* Adds connection to resolve list if needed. * Setting params to NULL programs local hdev->irk */ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, - struct hci_conn_params *params) + struct conn_params *params) { struct hci_cp_le_add_to_resolv_list cp; struct smp_irk *irk; struct bdaddr_list_with_irk *entry; + struct hci_conn_params *p; if (!use_ll_privacy(hdev)) return 0; @@ -2203,6 +2211,16 @@ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, /* Default privacy mode is always Network */ params->privacy_mode = HCI_NETWORK_PRIVACY; + rcu_read_lock(); + p = hci_pend_le_action_lookup(&hdev->pend_le_conns, + ¶ms->addr, params->addr_type); + if (!p) + p = hci_pend_le_action_lookup(&hdev->pend_le_reports, + ¶ms->addr, params->addr_type); + if (p) + WRITE_ONCE(p->privacy_mode, HCI_NETWORK_PRIVACY); + rcu_read_unlock(); + done: if (hci_dev_test_flag(hdev, HCI_PRIVACY)) memcpy(cp.local_irk, hdev->irk, 16); @@ -2215,7 +2233,7 @@ done: /* Set Device Privacy Mode. */ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, - struct hci_conn_params *params) + struct conn_params *params) { struct hci_cp_le_set_privacy_mode cp; struct smp_irk *irk; @@ -2240,6 +2258,8 @@ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, bacpy(&cp.bdaddr, &irk->bdaddr); cp.mode = HCI_DEVICE_PRIVACY; + /* Note: params->privacy_mode is not updated since it is a copy */ + return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PRIVACY_MODE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } @@ -2249,7 +2269,7 @@ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, * properly set the privacy mode. */ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, - struct hci_conn_params *params, + struct conn_params *params, u8 *num_entries) { struct hci_cp_le_add_to_accept_list cp; @@ -2447,6 +2467,52 @@ struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, return __hci_cmd_sync_sk(hdev, opcode, 0, NULL, 0, HCI_CMD_TIMEOUT, sk); } +static struct conn_params *conn_params_copy(struct list_head *list, size_t *n) +{ + struct hci_conn_params *params; + struct conn_params *p; + size_t i; + + rcu_read_lock(); + + i = 0; + list_for_each_entry_rcu(params, list, action) + ++i; + *n = i; + + rcu_read_unlock(); + + p = kvcalloc(*n, sizeof(struct conn_params), GFP_KERNEL); + if (!p) + return NULL; + + rcu_read_lock(); + + i = 0; + list_for_each_entry_rcu(params, list, action) { + /* Racing adds are handled in next scan update */ + if (i >= *n) + break; + + /* No hdev->lock, but: addr, addr_type are immutable. + * privacy_mode is only written by us or in + * hci_cc_le_set_privacy_mode that we wait for. + * We should be idempotent so MGMT updating flags + * while we are processing is OK. + */ + bacpy(&p[i].addr, ¶ms->addr); + p[i].addr_type = params->addr_type; + p[i].flags = READ_ONCE(params->flags); + p[i].privacy_mode = READ_ONCE(params->privacy_mode); + ++i; + } + + rcu_read_unlock(); + + *n = i; + return p; +} + /* Device must not be scanning when updating the accept list. * * Update is done using the following sequence: @@ -2466,11 +2532,12 @@ struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, */ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) { - struct hci_conn_params *params; + struct conn_params *params; struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; u8 filter_policy; + size_t i, n; int err; /* Pause advertising if resolving list can be used as controllers @@ -2504,6 +2571,7 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) if (hci_conn_hash_lookup_le(hdev, &b->bdaddr, b->bdaddr_type)) continue; + /* Pointers not dereferenced, no locks needed */ pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns, &b->bdaddr, b->bdaddr_type); @@ -2532,23 +2600,50 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) * available accept list entries in the controller, then * just abort and return filer policy value to not use the * accept list. + * + * The list and params may be mutated while we wait for events, + * so make a copy and iterate it. */ - list_for_each_entry(params, &hdev->pend_le_conns, action) { - err = hci_le_add_accept_list_sync(hdev, params, &num_entries); - if (err) + + params = conn_params_copy(&hdev->pend_le_conns, &n); + if (!params) { + err = -ENOMEM; + goto done; + } + + for (i = 0; i < n; ++i) { + err = hci_le_add_accept_list_sync(hdev, ¶ms[i], + &num_entries); + if (err) { + kvfree(params); goto done; + } } + kvfree(params); + /* After adding all new pending connections, walk through * the list of pending reports and also add these to the * accept list if there is still space. Abort if space runs out. */ - list_for_each_entry(params, &hdev->pend_le_reports, action) { - err = hci_le_add_accept_list_sync(hdev, params, &num_entries); - if (err) + + params = conn_params_copy(&hdev->pend_le_reports, &n); + if (!params) { + err = -ENOMEM; + goto done; + } + + for (i = 0; i < n; ++i) { + err = hci_le_add_accept_list_sync(hdev, ¶ms[i], + &num_entries); + if (err) { + kvfree(params); goto done; + } } + kvfree(params); + /* Use the allowlist unless the following conditions are all true: * - We are not currently suspending * - There are 1 or more ADV monitors registered and it's not offloaded @@ -4837,12 +4932,12 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) struct hci_conn_params *p; list_for_each_entry(p, &hdev->le_conn_params, list) { + hci_pend_le_list_del_init(p); if (p->conn) { hci_conn_drop(p->conn); hci_conn_put(p->conn); p->conn = NULL; } - list_del_init(&p->action); } BT_DBG("All LE pending actions cleared"); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f7b2d0971f24..1e07d0f28972 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1297,15 +1297,15 @@ static void restart_le_actions(struct hci_dev *hdev) /* Needed for AUTO_OFF case where might not "really" * have been powered off. */ - list_del_init(&p->action); + hci_pend_le_list_del_init(p); switch (p->auto_connect) { case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - list_add(&p->action, &hdev->pend_le_conns); + hci_pend_le_list_add(p, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: - list_add(&p->action, &hdev->pend_le_reports); + hci_pend_le_list_add(p, &hdev->pend_le_reports); break; default: break; @@ -5169,7 +5169,7 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - params->flags = current_flags; + WRITE_ONCE(params->flags, current_flags); status = MGMT_STATUS_SUCCESS; /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY @@ -7580,7 +7580,7 @@ static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, if (params->auto_connect == auto_connect) return 0; - list_del_init(¶ms->action); + hci_pend_le_list_del_init(params); switch (auto_connect) { case HCI_AUTO_CONN_DISABLED: @@ -7589,18 +7589,18 @@ static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, * connect to device, keep connecting. */ if (params->explicit_connect) - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_add(params, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: if (params->explicit_connect) - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_add(params, &hdev->pend_le_conns); else - list_add(¶ms->action, &hdev->pend_le_reports); + hci_pend_le_list_add(params, &hdev->pend_le_reports); break; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: if (!is_connected(hdev, addr, addr_type)) - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_add(params, &hdev->pend_le_conns); break; } @@ -7823,9 +7823,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - list_del(¶ms->action); - list_del(¶ms->list); - kfree(params); + hci_conn_params_free(params); device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type); } else { @@ -7856,9 +7854,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, p->auto_connect = HCI_AUTO_CONN_EXPLICIT; continue; } - list_del(&p->action); - list_del(&p->list); - kfree(p); + hci_conn_params_free(p); } bt_dev_dbg(hdev, "All LE connection parameters were removed"); -- cgit v1.2.3 From 6910e2eb39254d279bce5bc0f8eb6af45b59357c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 3 Jul 2023 13:30:48 +0200 Subject: Bluetooth: coredump: fix building with coredump disabled The btmtk driver uses an IS_ENABLED() check to conditionally compile the coredump support, but this fails to build because the hdev->dump member is in an #ifdef: drivers/bluetooth/btmtk.c: In function 'btmtk_process_coredump': drivers/bluetooth/btmtk.c:386:30: error: 'struct hci_dev' has no member named 'dump' 386 | schedule_delayed_work(&hdev->dump.dump_timeout, | ^~ The struct member doesn't really make a huge difference in the total size, so just remove the #ifdef around it to avoid adding similar checks around each user. Fixes: 872f8c253cb9e ("Bluetooth: btusb: mediatek: add MediaTek devcoredump support") Fixes: 9695ef876fd12 ("Bluetooth: Add support for hci devcoredump") Signed-off-by: Arnd Bergmann Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 870b6d3c5146..e01d52cb668c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -593,9 +593,7 @@ struct hci_dev { const char *fw_info; struct dentry *debugfs; -#ifdef CONFIG_DEV_COREDUMP struct hci_devcoredump dump; -#endif struct device dev; -- cgit v1.2.3 From 3641c90c4e369c8d0af5483e879174400a152cf8 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 20 Jul 2023 17:55:12 +0800 Subject: blk-mq: delete dead struct blk_mq_hw_ctx->queued field This counter is not used anywhere, so delete it. Signed-off-by: Chengming Zhou Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20230720095512.1403123-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b96e00499f9e..495ca198775f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -397,8 +397,6 @@ struct blk_mq_hw_ctx { */ struct blk_mq_tags *sched_tags; - /** @queued: Number of queued requests. */ - unsigned long queued; /** @run: Number of dispatched requests. */ unsigned long run; -- cgit v1.2.3 From 4164245c76ff906c9086758e1c3f87082a7f5ef5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:49 +0000 Subject: tcp: annotate data-races around tp->keepalive_time do_tcp_getsockopt() reads tp->keepalive_time while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 7 +++++-- net/ipv4/tcp.c | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 226bce6d1e8c..9e1c3337cdfe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1517,9 +1517,12 @@ static inline int keepalive_intvl_when(const struct tcp_sock *tp) static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); + int val; - return tp->keepalive_time ? : - READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); + /* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */ + val = READ_ONCE(tp->keepalive_time); + + return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 03ae6554c78d..b4f7856dfb16 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3312,7 +3312,8 @@ int tcp_sock_set_keepidle_locked(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_KEEPIDLE) return -EINVAL; - tp->keepalive_time = val * HZ; + /* Paired with WRITE_ONCE() in keepalive_time_when() */ + WRITE_ONCE(tp->keepalive_time, val * HZ); if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { u32 elapsed = keepalive_time_elapsed(tp); -- cgit v1.2.3 From 5ecf9d4f52ff2f1d4d44c9b68bc75688e82f13b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:50 +0000 Subject: tcp: annotate data-races around tp->keepalive_intvl do_tcp_getsockopt() reads tp->keepalive_intvl while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 9 +++++++-- net/ipv4/tcp.c | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9e1c3337cdfe..2d8d58dec652 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1509,9 +1509,14 @@ void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); + int val; + + /* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl() + * and do_tcp_setsockopt(). + */ + val = READ_ONCE(tp->keepalive_intvl); - return tp->keepalive_intvl ? : - READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); + return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b4f7856dfb16..d55fe014e7c9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3345,7 +3345,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val) return -EINVAL; lock_sock(sk); - tcp_sk(sk)->keepalive_intvl = val * HZ; + WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); release_sock(sk); return 0; } @@ -3559,7 +3559,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (val < 1 || val > MAX_TCP_KEEPINTVL) err = -EINVAL; else - tp->keepalive_intvl = val * HZ; + WRITE_ONCE(tp->keepalive_intvl, val * HZ); break; case TCP_KEEPCNT: if (val < 1 || val > MAX_TCP_KEEPCNT) -- cgit v1.2.3 From 6e5e1de616bf5f3df1769abc9292191dfad9110a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:51 +0000 Subject: tcp: annotate data-races around tp->keepalive_probes do_tcp_getsockopt() reads tp->keepalive_probes while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 9 +++++++-- net/ipv4/tcp.c | 5 +++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 2d8d58dec652..a8b93b2875ea 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1533,9 +1533,14 @@ static inline int keepalive_time_when(const struct tcp_sock *tp) static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); + int val; + + /* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt() + * and do_tcp_setsockopt(). + */ + val = READ_ONCE(tp->keepalive_probes); - return tp->keepalive_probes ? : - READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); + return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d55fe014e7c9..574fd0da1673 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3357,7 +3357,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val) return -EINVAL; lock_sock(sk); - tcp_sk(sk)->keepalive_probes = val; + /* Paired with READ_ONCE() in keepalive_probes() */ + WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); release_sock(sk); return 0; } @@ -3565,7 +3566,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (val < 1 || val > MAX_TCP_KEEPCNT) err = -EINVAL; else - tp->keepalive_probes = val; + WRITE_ONCE(tp->keepalive_probes, val); break; case TCP_SYNCNT: if (val < 1 || val > MAX_TCP_SYNCNT) -- cgit v1.2.3 From 1aeb87bc1440c5447a7fa2d6e3c2cca52cbd206b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:55 +0000 Subject: tcp: annotate data-races around tp->notsent_lowat tp->notsent_lowat can be read locklessly from do_tcp_getsockopt() and tcp_poll(). Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 6 +++++- net/ipv4/tcp.c | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index a8b93b2875ea..0ca972ebd3dd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2061,7 +2061,11 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->notsent_lowat ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); + u32 val; + + val = READ_ONCE(tp->notsent_lowat); + + return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } bool tcp_stream_memory_free(const struct sock *sk, int wake); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5beec71a5c41..2b2241e9b492 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3664,7 +3664,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: - tp->notsent_lowat = val; + WRITE_ONCE(tp->notsent_lowat, val); sk->sk_write_space(sk); break; case TCP_INQ: @@ -4164,7 +4164,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset); break; case TCP_NOTSENT_LOWAT: - val = tp->notsent_lowat; + val = READ_ONCE(tp->notsent_lowat); break; case TCP_INQ: val = tp->recvmsg_inq; -- cgit v1.2.3 From 70f360dd7042cb843635ece9d28335a4addff9eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:57 +0000 Subject: tcp: annotate data-races around fastopenq.max_qlen This field can be read locklessly. Fixes: 1536e2857bd3 ("tcp: Add a TCP_FASTOPEN socket option to get a max backlog on its listner") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-12-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b4c08ac86983..91a37c99ba66 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -513,7 +513,7 @@ static inline void fastopen_queue_tune(struct sock *sk, int backlog) struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn); - queue->fastopenq.max_qlen = min_t(unsigned int, backlog, somaxconn); + WRITE_ONCE(queue->fastopenq.max_qlen, min_t(unsigned int, backlog, somaxconn)); } static inline void tcp_move_syn(struct tcp_sock *tp, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3e137e9a18f5..8ed52e1e3c99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4145,7 +4145,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - val = icsk->icsk_accept_queue.fastopenq.max_qlen; + val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); break; case TCP_FASTOPEN_CONNECT: diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 45cc7f1ca296..85e4953f1182 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -296,6 +296,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, static bool tcp_fastopen_queue_check(struct sock *sk) { struct fastopen_queue *fastopenq; + int max_qlen; /* Make sure the listener has enabled fastopen, and we don't * exceed the max # of pending TFO requests allowed before trying @@ -308,10 +309,11 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * temporarily vs a server not supporting Fast Open at all. */ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; - if (fastopenq->max_qlen == 0) + max_qlen = READ_ONCE(fastopenq->max_qlen); + if (max_qlen == 0) return false; - if (fastopenq->qlen >= fastopenq->max_qlen) { + if (fastopenq->qlen >= max_qlen) { struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; -- cgit v1.2.3 From 80ddce5f2dbd0e83eadc9f9d373439180d599fe5 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Sat, 8 Jul 2023 13:27:19 +0200 Subject: thermal: core: constify params in thermal_zone_device_register Since commit 3d439b1a2ad3 ("thermal/core: Alloc-copy-free the thermal zone parameters structure"), thermal_zone_device_register() allocates a copy of the tzp argument and callers need not explicitly manage its lifetime. This means the function no longer cares about the parameter being mutable, so constify it. No functional change. Signed-off-by: Ahmad Fatoum Acked-by: Daniel Lezcano Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 4 ++-- include/linux/thermal.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 842f678c1c3e..cc2b5e81c620 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1203,7 +1203,7 @@ EXPORT_SYMBOL_GPL(thermal_zone_get_crit_temp); struct thermal_zone_device * thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *trips, int num_trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, - struct thermal_zone_params *tzp, int passive_delay, + const struct thermal_zone_params *tzp, int passive_delay, int polling_delay) { struct thermal_zone_device *tz; @@ -1371,7 +1371,7 @@ EXPORT_SYMBOL_GPL(thermal_zone_device_register_with_trips); struct thermal_zone_device *thermal_zone_device_register(const char *type, int ntrips, int mask, void *devdata, struct thermal_zone_device_ops *ops, - struct thermal_zone_params *tzp, int passive_delay, + const struct thermal_zone_params *tzp, int passive_delay, int polling_delay) { return thermal_zone_device_register_with_trips(type, NULL, ntrips, mask, diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 87837094d549..dee66ade89a0 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -301,14 +301,14 @@ int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); #ifdef CONFIG_THERMAL struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, - struct thermal_zone_params *, int, int); + const struct thermal_zone_params *, int, int); void thermal_zone_device_unregister(struct thermal_zone_device *); struct thermal_zone_device * thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int, int, void *, struct thermal_zone_device_ops *, - struct thermal_zone_params *, int, int); + const struct thermal_zone_params *, int, int); void *thermal_zone_device_priv(struct thermal_zone_device *tzd); const char *thermal_zone_device_type(struct thermal_zone_device *tzd); @@ -348,7 +348,7 @@ void thermal_zone_device_critical(struct thermal_zone_device *tz); static inline struct thermal_zone_device *thermal_zone_device_register( const char *type, int trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, - struct thermal_zone_params *tzp, + const struct thermal_zone_params *tzp, int passive_delay, int polling_delay) { return ERR_PTR(-ENODEV); } static inline void thermal_zone_device_unregister( -- cgit v1.2.3 From e7b915219baa4b2bf30afe1dbaca7a24ff627ad2 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 13 Jul 2023 16:57:40 +0200 Subject: PM: sleep: wakeirq: drop unused enable helpers Drop the wake-irq enable and disable helpers which have not been used since commit bed570307ed7 ("PM / wakeirq: Fix dedicated wakeirq for drivers not using autosuspend"). Note that these functions are essentially just leftovers from the first iteration of the wake-irq implementation where device drivers were supposed to call these functions themselves instead of PM core (as is also indicated by the bogus kernel doc comments). Signed-off-by: Johan Hovold Reviewed-by: Tony Lindgren Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeirq.c | 49 -------------------------------------------- include/linux/pm_wakeirq.h | 10 --------- 2 files changed, 59 deletions(-) (limited to 'include') diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c index afd094dec5ca..42171f766dcb 100644 --- a/drivers/base/power/wakeirq.c +++ b/drivers/base/power/wakeirq.c @@ -194,7 +194,6 @@ err_free: return err; } - /** * dev_pm_set_dedicated_wake_irq - Request a dedicated wake-up interrupt * @dev: Device entry @@ -206,11 +205,6 @@ err_free: * Sets up a threaded interrupt handler for a device that has * a dedicated wake-up interrupt in addition to the device IO * interrupt. - * - * The interrupt starts disabled, and needs to be managed for - * the device by the bus code or the device driver using - * dev_pm_enable_wake_irq*() and dev_pm_disable_wake_irq*() - * functions. */ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) { @@ -232,11 +226,6 @@ EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq); * the status of WAKE_IRQ_DEDICATED_REVERSE to tell rpm_suspend() * to enable dedicated wake-up interrupt after running the runtime suspend * callback for @dev. - * - * The interrupt starts disabled, and needs to be managed for - * the device by the bus code or the device driver using - * dev_pm_enable_wake_irq*() and dev_pm_disable_wake_irq*() - * functions. */ int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq) { @@ -244,44 +233,6 @@ int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq) } EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq_reverse); -/** - * dev_pm_enable_wake_irq - Enable device wake-up interrupt - * @dev: Device - * - * Optionally called from the bus code or the device driver for - * runtime_resume() to override the PM runtime core managed wake-up - * interrupt handling to enable the wake-up interrupt. - * - * Note that for runtime_suspend()) the wake-up interrupts - * should be unconditionally enabled unlike for suspend() - * that is conditional. - */ -void dev_pm_enable_wake_irq(struct device *dev) -{ - struct wake_irq *wirq = dev->power.wakeirq; - - if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED)) - enable_irq(wirq->irq); -} -EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq); - -/** - * dev_pm_disable_wake_irq - Disable device wake-up interrupt - * @dev: Device - * - * Optionally called from the bus code or the device driver for - * runtime_suspend() to override the PM runtime core managed wake-up - * interrupt handling to disable the wake-up interrupt. - */ -void dev_pm_disable_wake_irq(struct device *dev) -{ - struct wake_irq *wirq = dev->power.wakeirq; - - if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED)) - disable_irq_nosync(wirq->irq); -} -EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq); - /** * dev_pm_enable_wake_irq_check - Checks and enables wake-up interrupt * @dev: Device diff --git a/include/linux/pm_wakeirq.h b/include/linux/pm_wakeirq.h index dd42d16945d0..d9642c6cf852 100644 --- a/include/linux/pm_wakeirq.h +++ b/include/linux/pm_wakeirq.h @@ -10,8 +10,6 @@ extern int dev_pm_set_wake_irq(struct device *dev, int irq); extern int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq); extern int dev_pm_set_dedicated_wake_irq_reverse(struct device *dev, int irq); extern void dev_pm_clear_wake_irq(struct device *dev); -extern void dev_pm_enable_wake_irq(struct device *dev); -extern void dev_pm_disable_wake_irq(struct device *dev); #else /* !CONFIG_PM */ @@ -34,13 +32,5 @@ static inline void dev_pm_clear_wake_irq(struct device *dev) { } -static inline void dev_pm_enable_wake_irq(struct device *dev) -{ -} - -static inline void dev_pm_disable_wake_irq(struct device *dev) -{ -} - #endif /* CONFIG_PM */ #endif /* _LINUX_PM_WAKEIRQ_H */ -- cgit v1.2.3 From 94d166c5318c6edd1e079df8552233443e909c33 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Jul 2023 11:05:56 +0200 Subject: vxlan: calculate correct header length for GPE VXLAN-GPE does not add an extra inner Ethernet header. Take that into account when calculating header length. This causes problems in skb_tunnel_check_pmtu, where incorrect PMTU is cached. In the collect_md mode (which is the only mode that VXLAN-GPE supports), there's no magic auto-setting of the tunnel interface MTU. It can't be, since the destination and thus the underlying interface may be different for each packet. So, the administrator is responsible for setting the correct tunnel interface MTU. Apparently, the administrators are capable enough to calculate that the maximum MTU for VXLAN-GPE is (their_lower_MTU - 36). They set the tunnel interface MTU to 1464. If you run a TCP stream over such interface, it's then segmented according to the MTU 1464, i.e. producing 1514 bytes frames. Which is okay, this still fits the lower MTU. However, skb_tunnel_check_pmtu (called from vxlan_xmit_one) uses 50 as the header size and thus incorrectly calculates the frame size to be 1528. This leads to ICMP too big message being generated (locally), PMTU of 1450 to be cached and the TCP stream to be resegmented. The fix is to use the correct actual header size, especially for skb_tunnel_check_pmtu calculation. Fixes: e1e5314de08ba ("vxlan: implement GPE") Signed-off-by: Jiri Benc Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/vxlan/vxlan_core.c | 23 ++++++++++------------- include/net/vxlan.h | 13 +++++++++---- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 1726297f2e0d..8eb9839a3ca6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8479,7 +8479,7 @@ static void ixgbe_atr(struct ixgbe_ring *ring, struct ixgbe_adapter *adapter = q_vector->adapter; if (unlikely(skb_tail_pointer(skb) < hdr.network + - VXLAN_HEADROOM)) + vxlan_headroom(0))) return; /* verify the port is recognized as VXLAN */ diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 78744549c1b3..42be8a26c171 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2516,7 +2516,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ndst = &rt->dst; - err = skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM, + err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom(flags & VXLAN_F_GPE), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; @@ -2577,7 +2577,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto out_unlock; } - err = skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM, + err = skb_tunnel_check_pmtu(skb, ndst, + vxlan_headroom((flags & VXLAN_F_GPE) | VXLAN_F_IPV6), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; @@ -2989,14 +2990,12 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu) struct vxlan_rdst *dst = &vxlan->default_dst; struct net_device *lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); - bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6); /* This check is different than dev->max_mtu, because it looks at * the lowerdev->mtu, rather than the static dev->max_mtu */ if (lowerdev) { - int max_mtu = lowerdev->mtu - - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); + int max_mtu = lowerdev->mtu - vxlan_headroom(vxlan->cfg.flags); if (new_mtu > max_mtu) return -EINVAL; } @@ -3644,11 +3643,11 @@ static void vxlan_config_apply(struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; unsigned short needed_headroom = ETH_HLEN; - bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6); int max_mtu = ETH_MAX_MTU; + u32 flags = conf->flags; if (!changelink) { - if (conf->flags & VXLAN_F_GPE) + if (flags & VXLAN_F_GPE) vxlan_raw_setup(dev); else vxlan_ether_setup(dev); @@ -3673,8 +3672,7 @@ static void vxlan_config_apply(struct net_device *dev, dev->needed_tailroom = lowerdev->needed_tailroom; - max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : - VXLAN_HEADROOM); + max_mtu = lowerdev->mtu - vxlan_headroom(flags); if (max_mtu < ETH_MIN_MTU) max_mtu = ETH_MIN_MTU; @@ -3685,10 +3683,9 @@ static void vxlan_config_apply(struct net_device *dev, if (dev->mtu > max_mtu) dev->mtu = max_mtu; - if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA) - needed_headroom += VXLAN6_HEADROOM; - else - needed_headroom += VXLAN_HEADROOM; + if (flags & VXLAN_F_COLLECT_METADATA) + flags |= VXLAN_F_IPV6; + needed_headroom += vxlan_headroom(flags); dev->needed_headroom = needed_headroom; memcpy(&vxlan->cfg, conf, sizeof(*conf)); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 0be91ca78d3a..1648240c9668 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -386,10 +386,15 @@ static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, return features; } -/* IP header + UDP + VXLAN + Ethernet header */ -#define VXLAN_HEADROOM (20 + 8 + 8 + 14) -/* IPv6 header + UDP + VXLAN + Ethernet header */ -#define VXLAN6_HEADROOM (40 + 8 + 8 + 14) +static inline int vxlan_headroom(u32 flags) +{ + /* VXLAN: IP4/6 header + UDP + VXLAN + Ethernet header */ + /* VXLAN-GPE: IP4/6 header + UDP + VXLAN */ + return (flags & VXLAN_F_IPV6 ? sizeof(struct ipv6hdr) : + sizeof(struct iphdr)) + + sizeof(struct udphdr) + sizeof(struct vxlanhdr) + + (flags & VXLAN_F_GPE ? 0 : ETH_HLEN); +} static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) { -- cgit v1.2.3 From d11b0df7ddf1831f3e170972f43186dad520bfcc Mon Sep 17 00:00:00 2001 From: Stewart Smith Date: Fri, 21 Jul 2023 15:24:10 -0700 Subject: tcp: Reduce chance of collisions in inet6_hashfn(). For both IPv4 and IPv6 incoming TCP connections are tracked in a hash table with a hash over the source & destination addresses and ports. However, the IPv6 hash is insufficient and can lead to a high rate of collisions. The IPv6 hash used an XOR to fit everything into the 96 bits for the fast jenkins hash, meaning it is possible for an external entity to ensure the hash collides, thus falling back to a linear search in the bucket, which is slow. We take the approach of hash the full length of IPv6 address in __ipv6_addr_jhash() so that all users can benefit from a more secure version. While this may look like it adds overhead, the reality of modern CPUs means that this is unmeasurable in real world scenarios. In simulating with llvm-mca, the increase in cycles for the hashing code was ~16 cycles on Skylake (from a base of ~155), and an extra ~9 on Nehalem (base of ~173). In commit dd6d2910c5e0 ("netfilter: conntrack: switch to siphash") netfilter switched from a jenkins hash to a siphash, but even the faster hsiphash is a more significant overhead (~20-30%) in some preliminary testing. So, in this patch, we keep to the more conservative approach to ensure we don't add much overhead per SYN. In testing, this results in a consistently even spread across the connection buckets. In both testing and real-world scenarios, we have not found any measurable performance impact. Fixes: 08dcdbf6a7b9 ("ipv6: use a stronger hash for tcp") Signed-off-by: Stewart Smith Signed-off-by: Samuel Mendoza-Jonas Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230721222410.17914-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7332296eca44..2acc4c808d45 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -752,12 +752,8 @@ static inline u32 ipv6_addr_hash(const struct in6_addr *a) /* more secured version of ipv6_addr_hash() */ static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { - u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; - - return jhash_3words(v, - (__force u32)a->s6_addr32[2], - (__force u32)a->s6_addr32[3], - initval); + return jhash2((__force const u32 *)a->s6_addr32, + ARRAY_SIZE(a->s6_addr32), initval); } static inline bool ipv6_addr_loopback(const struct in6_addr *a) -- cgit v1.2.3 From e0933b526fbfd937c4a8f4e35fcdd49f0e22d411 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 6 Jul 2023 13:14:12 -0700 Subject: block: Fix a source code comment in include/uapi/linux/blkzoned.h Fix the symbolic names for zone conditions in the blkzoned.h header file. Cc: Hannes Reinecke Cc: Damien Le Moal Fixes: 6a0cb1bc106f ("block: Implement support for zoned block devices") Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230706201422.3987341-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index b80fcc9ea525..f85743ef6e7d 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -51,13 +51,13 @@ enum blk_zone_type { * * The Zone Condition state machine in the ZBC/ZAC standards maps the above * deinitions as: - * - ZC1: Empty | BLK_ZONE_EMPTY + * - ZC1: Empty | BLK_ZONE_COND_EMPTY * - ZC2: Implicit Open | BLK_ZONE_COND_IMP_OPEN * - ZC3: Explicit Open | BLK_ZONE_COND_EXP_OPEN - * - ZC4: Closed | BLK_ZONE_CLOSED - * - ZC5: Full | BLK_ZONE_FULL - * - ZC6: Read Only | BLK_ZONE_READONLY - * - ZC7: Offline | BLK_ZONE_OFFLINE + * - ZC4: Closed | BLK_ZONE_COND_CLOSED + * - ZC5: Full | BLK_ZONE_COND_FULL + * - ZC6: Read Only | BLK_ZONE_COND_READONLY + * - ZC7: Offline | BLK_ZONE_COND_OFFLINE * * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should * be considered invalid. -- cgit v1.2.3 From c1ed39ec116272935528ca9b348b8ee79b0791da Mon Sep 17 00:00:00 2001 From: Winston Wen Date: Mon, 24 Jul 2023 10:10:56 +0800 Subject: fs/nls: make load_nls() take a const parameter load_nls() take a char * parameter, use it to find nls module in list or construct the module name to load it. This change make load_nls() take a const parameter, so we don't need do some cast like this: ses->local_nls = load_nls((char *)ctx->local_nls->charset); Suggested-by: Stephen Rothwell Signed-off-by: Winston Wen Reviewed-by: Paulo Alcantara Reviewed-by: Christian Brauner Signed-off-by: Steve French --- fs/nls/nls_base.c | 4 ++-- include/linux/nls.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 52ccd34b1e79..a026dbd3593f 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -272,7 +272,7 @@ int unregister_nls(struct nls_table * nls) return -EINVAL; } -static struct nls_table *find_nls(char *charset) +static struct nls_table *find_nls(const char *charset) { struct nls_table *nls; spin_lock(&nls_lock); @@ -288,7 +288,7 @@ static struct nls_table *find_nls(char *charset) return nls; } -struct nls_table *load_nls(char *charset) +struct nls_table *load_nls(const char *charset) { return try_then_request_module(find_nls(charset), "nls_%s", charset); } diff --git a/include/linux/nls.h b/include/linux/nls.h index 499e486b3722..e0bf8367b274 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -47,7 +47,7 @@ enum utf16_endian { /* nls_base.c */ extern int __register_nls(struct nls_table *, struct module *); extern int unregister_nls(struct nls_table *); -extern struct nls_table *load_nls(char *); +extern struct nls_table *load_nls(const char *charset); extern void unload_nls(struct nls_table *); extern struct nls_table *load_nls_default(void); #define register_nls(nls) __register_nls((nls), THIS_MODULE) -- cgit v1.2.3 From 39b1320e5dc2b707dfb5c25b0298ce9d4fc05aea Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 25 Jul 2023 10:13:17 +0800 Subject: drm/fb-helper: Remove unused inline function drm_fb_helper_defio_init() Since commit 8e86dee02253 ("drm/fb-helper: Remove drm_fb_helper_defio_init() and update docs") this inline helper not used anymore. Fixes: 8e86dee02253 ("drm/fb-helper: Remove drm_fb_helper_defio_init() and update docs") Signed-off-by: YueHaibing Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20230725021317.8080-1-yuehaibing@huawei.com --- include/drm/drm_fb_helper.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index 4863b0f8299e..375737fd6c36 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -368,11 +368,6 @@ static inline void drm_fb_helper_deferred_io(struct fb_info *info, { } -static inline int drm_fb_helper_defio_init(struct drm_fb_helper *fb_helper) -{ - return -ENODEV; -} - static inline void drm_fb_helper_set_suspend(struct drm_fb_helper *fb_helper, bool suspend) { -- cgit v1.2.3 From a0ade8404c3bc2bf2631cb0f20d372eed22d9d96 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 24 Jul 2023 14:34:25 -0700 Subject: af_packet: Fix warning of fortified memcpy() in packet_getname(). syzkaller found a warning in packet_getname() [0], where we try to copy 16 bytes to sockaddr_ll.sll_addr[8]. Some devices (ip6gre, vti6, ip6tnl) have 16 bytes address expressed by struct in6_addr. Also, Infiniband has 32 bytes as MAX_ADDR_LEN. The write seems to overflow, but actually not since we use struct sockaddr_storage defined in __sys_getsockname() and its size is 128 (_K_SS_MAXSIZE) bytes. Thus, we have sufficient room after sll_addr[] as __data[]. To avoid the warning, let's add a flex array member union-ed with sll_addr. Another option would be to use strncpy() and limit the copied length to sizeof(sll_addr), but it will return the partial address and break an application that passes sockaddr_storage to getsockname(). [0]: memcpy: detected field-spanning write (size 16) of single field "sll->sll_addr" at net/packet/af_packet.c:3604 (size 8) WARNING: CPU: 0 PID: 255 at net/packet/af_packet.c:3604 packet_getname+0x25c/0x3a0 net/packet/af_packet.c:3604 Modules linked in: CPU: 0 PID: 255 Comm: syz-executor750 Not tainted 6.5.0-rc1-00330-g60cc1f7d0605 #4 Hardware name: linux,dummy-virt (DT) pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : packet_getname+0x25c/0x3a0 net/packet/af_packet.c:3604 lr : packet_getname+0x25c/0x3a0 net/packet/af_packet.c:3604 sp : ffff800089887bc0 x29: ffff800089887bc0 x28: ffff000010f80f80 x27: 0000000000000003 x26: dfff800000000000 x25: ffff700011310f80 x24: ffff800087d55000 x23: dfff800000000000 x22: ffff800089887c2c x21: 0000000000000010 x20: ffff00000de08310 x19: ffff800089887c20 x18: ffff800086ab1630 x17: 20646c6569662065 x16: 6c676e697320666f x15: 0000000000000001 x14: 1fffe0000d56d7ca x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : 3e60944c3da92b00 x8 : 3e60944c3da92b00 x7 : 0000000000000001 x6 : 0000000000000001 x5 : ffff8000898874f8 x4 : ffff800086ac99e0 x3 : ffff8000803f8808 x2 : 0000000000000001 x1 : 0000000100000000 x0 : 0000000000000000 Call trace: packet_getname+0x25c/0x3a0 net/packet/af_packet.c:3604 __sys_getsockname+0x168/0x24c net/socket.c:2042 __do_sys_getsockname net/socket.c:2057 [inline] __se_sys_getsockname net/socket.c:2054 [inline] __arm64_sys_getsockname+0x7c/0x94 net/socket.c:2054 __invoke_syscall arch/arm64/kernel/syscall.c:38 [inline] invoke_syscall+0x98/0x2c0 arch/arm64/kernel/syscall.c:52 el0_svc_common+0x134/0x240 arch/arm64/kernel/syscall.c:139 do_el0_svc+0x64/0x198 arch/arm64/kernel/syscall.c:188 el0_svc+0x2c/0x7c arch/arm64/kernel/entry-common.c:647 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:665 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:591 Fixes: df8fc4e934c1 ("kbuild: Enable -fstrict-flex-arrays=3") Reported-by: syzkaller Suggested-by: Kees Cook Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230724213425.22920-3-kuniyu@amazon.com Reviewed-by: Simon Horman Reviewed-by: Kees Cook Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_packet.h | 6 +++++- net/packet/af_packet.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 9efc42382fdb..4d0ad22f83b5 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -18,7 +18,11 @@ struct sockaddr_ll { unsigned short sll_hatype; unsigned char sll_pkttype; unsigned char sll_halen; - unsigned char sll_addr[8]; + union { + unsigned char sll_addr[8]; + /* Actual length is in sll_halen. */ + __DECLARE_FLEX_ARRAY(unsigned char, sll_addr_flex); + }; }; /* Packet types */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 85ff90a03b0c..8e3ddec4c3d5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3601,7 +3601,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; - memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); + memcpy(sll->sll_addr_flex, dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; -- cgit v1.2.3 From 58f6259b7a08f8d47d4629609703d358b042f0fd Mon Sep 17 00:00:00 2001 From: Rahul Singh Date: Tue, 18 Jul 2023 12:31:07 +0100 Subject: xen/evtchn: Introduce new IOCTL to bind static evtchn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Xen 4.17 supports the creation of static evtchns. To allow user space application to bind static evtchns introduce new ioctl "IOCTL_EVTCHN_BIND_STATIC". Existing IOCTL doing more than binding that’s why we need to introduce the new IOCTL to only bind the static event channels. Static evtchns to be available for use during the lifetime of the guest. When the application exits, __unbind_from_irq() ends up being called from release() file operations because of that static evtchns are getting closed. To avoid closing the static event channel, add the new bool variable "is_static" in "struct irq_info" to mark the event channel static when creating the event channel to avoid closing the static evtchn. Also, take this opportunity to remove the open-coded version of the evtchn close in drivers/xen/evtchn.c file and use xen_evtchn_close(). Signed-off-by: Rahul Singh Reviewed-by: Oleksandr Tyshchenko Acked-by: Stefano Stabellini Link: https://lore.kernel.org/r/ae7329bf1713f83e4aad4f3fa0f316258c40a3e9.1689677042.git.rahul.singh@arm.com Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 16 +++++----------- drivers/xen/evtchn.c | 35 ++++++++++++++++++++++++++--------- include/uapi/xen/evtchn.h | 9 +++++++++ include/xen/events.h | 11 ++++++++++- 4 files changed, 50 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index c7715f8bd452..3bdd5b59661d 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -112,6 +112,7 @@ struct irq_info { unsigned int irq_epoch; /* If eoi_cpu valid: irq_epoch of event */ u64 eoi_time; /* Time in jiffies when to EOI. */ raw_spinlock_t lock; + bool is_static; /* Is event channel static */ union { unsigned short virq; @@ -815,15 +816,6 @@ static void xen_free_irq(unsigned irq) irq_free_desc(irq); } -static void xen_evtchn_close(evtchn_port_t port) -{ - struct evtchn_close close; - - close.port = port; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); -} - /* Not called for lateeoi events. */ static void event_handler_exit(struct irq_info *info) { @@ -982,7 +974,8 @@ static void __unbind_from_irq(unsigned int irq) unsigned int cpu = cpu_from_irq(irq); struct xenbus_device *dev; - xen_evtchn_close(evtchn); + if (!info->is_static) + xen_evtchn_close(evtchn); switch (type_from_irq(irq)) { case IRQT_VIRQ: @@ -1574,7 +1567,7 @@ int xen_set_irq_priority(unsigned irq, unsigned priority) } EXPORT_SYMBOL_GPL(xen_set_irq_priority); -int evtchn_make_refcounted(evtchn_port_t evtchn) +int evtchn_make_refcounted(evtchn_port_t evtchn, bool is_static) { int irq = get_evtchn_to_irq(evtchn); struct irq_info *info; @@ -1590,6 +1583,7 @@ int evtchn_make_refcounted(evtchn_port_t evtchn) WARN_ON(info->refcnt != -1); info->refcnt = 1; + info->is_static = is_static; return 0; } diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index c99415a70051..9139a7364df5 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -366,10 +366,10 @@ static int evtchn_resize_ring(struct per_user_data *u) return 0; } -static int evtchn_bind_to_user(struct per_user_data *u, evtchn_port_t port) +static int evtchn_bind_to_user(struct per_user_data *u, evtchn_port_t port, + bool is_static) { struct user_evtchn *evtchn; - struct evtchn_close close; int rc = 0; /* @@ -402,14 +402,14 @@ static int evtchn_bind_to_user(struct per_user_data *u, evtchn_port_t port) if (rc < 0) goto err; - rc = evtchn_make_refcounted(port); + rc = evtchn_make_refcounted(port, is_static); return rc; err: /* bind failed, should close the port now */ - close.port = port; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); + if (!is_static) + xen_evtchn_close(port); + del_evtchn(u, evtchn); return rc; } @@ -456,7 +456,7 @@ static long evtchn_ioctl(struct file *file, if (rc != 0) break; - rc = evtchn_bind_to_user(u, bind_virq.port); + rc = evtchn_bind_to_user(u, bind_virq.port, false); if (rc == 0) rc = bind_virq.port; break; @@ -482,7 +482,7 @@ static long evtchn_ioctl(struct file *file, if (rc != 0) break; - rc = evtchn_bind_to_user(u, bind_interdomain.local_port); + rc = evtchn_bind_to_user(u, bind_interdomain.local_port, false); if (rc == 0) rc = bind_interdomain.local_port; break; @@ -507,7 +507,7 @@ static long evtchn_ioctl(struct file *file, if (rc != 0) break; - rc = evtchn_bind_to_user(u, alloc_unbound.port); + rc = evtchn_bind_to_user(u, alloc_unbound.port, false); if (rc == 0) rc = alloc_unbound.port; break; @@ -536,6 +536,23 @@ static long evtchn_ioctl(struct file *file, break; } + case IOCTL_EVTCHN_BIND_STATIC: { + struct ioctl_evtchn_bind bind; + struct user_evtchn *evtchn; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + rc = -EISCONN; + evtchn = find_evtchn(u, bind.port); + if (evtchn) + break; + + rc = evtchn_bind_to_user(u, bind.port, true); + break; + } + case IOCTL_EVTCHN_NOTIFY: { struct ioctl_evtchn_notify notify; struct user_evtchn *evtchn; diff --git a/include/uapi/xen/evtchn.h b/include/uapi/xen/evtchn.h index 7fbf732f168f..aef2b75f3413 100644 --- a/include/uapi/xen/evtchn.h +++ b/include/uapi/xen/evtchn.h @@ -101,4 +101,13 @@ struct ioctl_evtchn_restrict_domid { domid_t domid; }; +/* + * Bind statically allocated @port. + */ +#define IOCTL_EVTCHN_BIND_STATIC \ + _IOC(_IOC_NONE, 'E', 7, sizeof(struct ioctl_evtchn_bind)) +struct ioctl_evtchn_bind { + unsigned int port; +}; + #endif /* __LINUX_PUBLIC_EVTCHN_H__ */ diff --git a/include/xen/events.h b/include/xen/events.h index ac1281c5ead6..95970a2f7695 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -69,7 +69,7 @@ int xen_set_irq_priority(unsigned irq, unsigned priority); /* * Allow extra references to event channels exposed to userspace by evtchn */ -int evtchn_make_refcounted(evtchn_port_t evtchn); +int evtchn_make_refcounted(evtchn_port_t evtchn, bool is_static); int evtchn_get(evtchn_port_t evtchn); void evtchn_put(evtchn_port_t evtchn); @@ -141,4 +141,13 @@ void xen_init_IRQ(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); +static inline void xen_evtchn_close(evtchn_port_t port) +{ + struct evtchn_close close; + + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); +} + #endif /* _XEN_EVENTS_H */ -- cgit v1.2.3 From fb3bd914b3ec28f5fb697ac55c4846ac2d542855 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 28 Jun 2023 11:02:39 +0200 Subject: x86/srso: Add a Speculative RAS Overflow mitigation Add a mitigation for the speculative return address stack overflow vulnerability found on AMD processors. The mitigation works by ensuring all RET instructions speculate to a controlled location, similar to how speculation is controlled in the retpoline sequence. To accomplish this, the __x86_return_thunk forces the CPU to mispredict every function return using a 'safe return' sequence. To ensure the safety of this mitigation, the kernel must ensure that the safe return sequence is itself free from attacker interference. In Zen3 and Zen4, this is accomplished by creating a BTB alias between the untraining function srso_untrain_ret_alias() and the safe return function srso_safe_ret_alias() which results in evicting a potentially poisoned BTB entry and using that safe one for all function returns. In older Zen1 and Zen2, this is accomplished using a reinterpretation technique similar to Retbleed one: srso_untrain_ret() and srso_safe_ret(). Signed-off-by: Borislav Petkov (AMD) --- Documentation/admin-guide/hw-vuln/index.rst | 1 + Documentation/admin-guide/hw-vuln/srso.rst | 133 ++++++++++++++++++++++++ Documentation/admin-guide/kernel-parameters.txt | 11 ++ arch/x86/Kconfig | 7 ++ arch/x86/include/asm/cpufeatures.h | 5 + arch/x86/include/asm/nospec-branch.h | 15 ++- arch/x86/include/asm/processor.h | 2 + arch/x86/kernel/alternative.c | 4 +- arch/x86/kernel/cpu/amd.c | 14 +++ arch/x86/kernel/cpu/bugs.c | 106 +++++++++++++++++++ arch/x86/kernel/cpu/common.c | 8 +- arch/x86/kernel/vmlinux.lds.S | 29 +++++- arch/x86/lib/retpoline.S | 82 ++++++++++++++- drivers/base/cpu.c | 8 ++ include/linux/cpu.h | 2 + tools/objtool/arch/x86/decode.c | 5 +- 16 files changed, 422 insertions(+), 10 deletions(-) create mode 100644 Documentation/admin-guide/hw-vuln/srso.rst (limited to 'include') diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index e0614760a99e..ff4d3fa2a75c 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -19,3 +19,4 @@ are configurable at compile, boot or run time. l1d_flush.rst processor_mmio_stale_data.rst cross-thread-rsb.rst + srso diff --git a/Documentation/admin-guide/hw-vuln/srso.rst b/Documentation/admin-guide/hw-vuln/srso.rst new file mode 100644 index 000000000000..32eb5e6db272 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/srso.rst @@ -0,0 +1,133 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Speculative Return Stack Overflow (SRSO) +======================================== + +This is a mitigation for the speculative return stack overflow (SRSO) +vulnerability found on AMD processors. The mechanism is by now the well +known scenario of poisoning CPU functional units - the Branch Target +Buffer (BTB) and Return Address Predictor (RAP) in this case - and then +tricking the elevated privilege domain (the kernel) into leaking +sensitive data. + +AMD CPUs predict RET instructions using a Return Address Predictor (aka +Return Address Stack/Return Stack Buffer). In some cases, a non-architectural +CALL instruction (i.e., an instruction predicted to be a CALL but is +not actually a CALL) can create an entry in the RAP which may be used +to predict the target of a subsequent RET instruction. + +The specific circumstances that lead to this varies by microarchitecture +but the concern is that an attacker can mis-train the CPU BTB to predict +non-architectural CALL instructions in kernel space and use this to +control the speculative target of a subsequent kernel RET, potentially +leading to information disclosure via a speculative side-channel. + +The issue is tracked under CVE-2023-20569. + +Affected processors +------------------- + +AMD Zen, generations 1-4. That is, all families 0x17 and 0x19. Older +processors have not been investigated. + +System information and options +------------------------------ + +First of all, it is required that the latest microcode be loaded for +mitigations to be effective. + +The sysfs file showing SRSO mitigation status is: + + /sys/devices/system/cpu/vulnerabilities/spec_rstack_overflow + +The possible values in this file are: + + - 'Not affected' The processor is not vulnerable + + - 'Vulnerable: no microcode' The processor is vulnerable, no + microcode extending IBPB functionality + to address the vulnerability has been + applied. + + - 'Mitigation: microcode' Extended IBPB functionality microcode + patch has been applied. It does not + address User->Kernel and Guest->Host + transitions protection but it does + address User->User and VM->VM attack + vectors. + + (spec_rstack_overflow=microcode) + + - 'Mitigation: safe RET' Software-only mitigation. It complements + the extended IBPB microcode patch + functionality by addressing User->Kernel + and Guest->Host transitions protection. + + Selected by default or by + spec_rstack_overflow=safe-ret + + - 'Mitigation: IBPB' Similar protection as "safe RET" above + but employs an IBPB barrier on privilege + domain crossings (User->Kernel, + Guest->Host). + + (spec_rstack_overflow=ibpb) + + - 'Mitigation: IBPB on VMEXIT' Mitigation addressing the cloud provider + scenario - the Guest->Host transitions + only. + + (spec_rstack_overflow=ibpb-vmexit) + +In order to exploit vulnerability, an attacker needs to: + + - gain local access on the machine + + - break kASLR + + - find gadgets in the running kernel in order to use them in the exploit + + - potentially create and pin an additional workload on the sibling + thread, depending on the microarchitecture (not necessary on fam 0x19) + + - run the exploit + +Considering the performance implications of each mitigation type, the +default one is 'Mitigation: safe RET' which should take care of most +attack vectors, including the local User->Kernel one. + +As always, the user is advised to keep her/his system up-to-date by +applying software updates regularly. + +The default setting will be reevaluated when needed and especially when +new attack vectors appear. + +As one can surmise, 'Mitigation: safe RET' does come at the cost of some +performance depending on the workload. If one trusts her/his userspace +and does not want to suffer the performance impact, one can always +disable the mitigation with spec_rstack_overflow=off. + +Similarly, 'Mitigation: IBPB' is another full mitigation type employing +an indrect branch prediction barrier after having applied the required +microcode patch for one's system. This mitigation comes also at +a performance cost. + +Mitigation: safe RET +-------------------- + +The mitigation works by ensuring all RET instructions speculate to +a controlled location, similar to how speculation is controlled in the +retpoline sequence. To accomplish this, the __x86_return_thunk forces +the CPU to mispredict every function return using a 'safe return' +sequence. + +To ensure the safety of this mitigation, the kernel must ensure that the +safe return sequence is itself free from attacker interference. In Zen3 +and Zen4, this is accomplished by creating a BTB alias between the +untraining function srso_untrain_ret_alias() and the safe return +function srso_safe_ret_alias() which results in evicting a potentially +poisoned BTB entry and using that safe one for all function returns. + +In older Zen1 and Zen2, this is accomplished using a reinterpretation +technique similar to Retbleed one: srso_untrain_ret() and +srso_safe_ret(). diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a1457995fd41..f5ec3dade58e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5875,6 +5875,17 @@ Not specifying this option is equivalent to spectre_v2_user=auto. + spec_rstack_overflow= + [X86] Control RAS overflow mitigation on AMD Zen CPUs + + off - Disable mitigation + microcode - Enable microcode mitigation only + safe-ret - Enable sw-only safe RET mitigation (default) + ibpb - Enable mitigation by issuing IBPB on + kernel entry + ibpb-vmexit - Issue IBPB only on VMEXIT + (cloud-specific mitigation) + spec_store_bypass_disable= [HW] Control Speculative Store Bypass (SSB) Disable mitigation (Speculative Store Bypass vulnerability) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7422db409770..d29f1e28a936 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2593,6 +2593,13 @@ config CPU_IBRS_ENTRY This mitigates both spectre_v2 and retbleed at great cost to performance. +config CPU_SRSO + bool "Mitigate speculative RAS overflow on AMD" + depends on CPU_SUP_AMD && X86_64 && RETHUNK + default y + help + Enable the SRSO mitigation needed on AMD Zen1-4 machines. + config SLS bool "Mitigate Straight-Line-Speculation" depends on CC_HAS_SLS && X86_64 diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1f6d904c6481..bc1b4d68e616 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -309,6 +309,9 @@ #define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */ #define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */ +#define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */ +#define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ + /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ @@ -484,4 +487,6 @@ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */ +/* BUG word 2 */ +#define X86_BUG_SRSO X86_BUG(1*32 + 0) /* AMD SRSO bug */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1a65cf4acb2b..43fe1c747085 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -211,7 +211,8 @@ * eventually turn into it's own annotation. */ .macro VALIDATE_UNRET_END -#if defined(CONFIG_NOINSTR_VALIDATION) && defined(CONFIG_CPU_UNRET_ENTRY) +#if defined(CONFIG_NOINSTR_VALIDATION) && \ + (defined(CONFIG_CPU_UNRET_ENTRY) || defined(CONFIG_CPU_SRSO)) ANNOTATE_RETPOLINE_SAFE nop #endif @@ -296,6 +297,11 @@ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ __stringify(RESET_CALL_DEPTH), X86_FEATURE_CALL_DEPTH #endif + +#ifdef CONFIG_CPU_SRSO + ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \ + "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS +#endif .endm .macro UNTRAIN_RET_FROM_CALL @@ -307,6 +313,11 @@ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB, \ __stringify(RESET_CALL_DEPTH_FROM_CALL), X86_FEATURE_CALL_DEPTH #endif + +#ifdef CONFIG_CPU_SRSO + ALTERNATIVE_2 "", "call srso_untrain_ret", X86_FEATURE_SRSO, \ + "call srso_untrain_ret_alias", X86_FEATURE_SRSO_ALIAS +#endif .endm @@ -332,6 +343,8 @@ extern retpoline_thunk_t __x86_indirect_jump_thunk_array[]; extern void __x86_return_thunk(void); extern void zen_untrain_ret(void); +extern void srso_untrain_ret(void); +extern void srso_untrain_ret_alias(void); extern void entry_ibpb(void); #ifdef CONFIG_CALL_THUNKS diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d46300e94f85..7c67db7c9f53 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -682,9 +682,11 @@ extern u16 get_llc_id(unsigned int cpu); #ifdef CONFIG_CPU_SUP_AMD extern u32 amd_get_nodes_per_socket(void); extern u32 amd_get_highest_perf(void); +extern bool cpu_has_ibpb_brtype_microcode(void); #else static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } +static inline bool cpu_has_ibpb_brtype_microcode(void) { return false; } #endif extern unsigned long arch_align_stack(unsigned long sp); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2dcf3a06af09..920a8ca7a8f8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -707,7 +707,9 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) int i = 0; /* Patch the custom return thunks... */ - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_feature_enabled(X86_FEATURE_RETHUNK) || + cpu_feature_enabled(X86_FEATURE_SRSO) || + cpu_feature_enabled(X86_FEATURE_SRSO_ALIAS)) { i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 571abf808ea3..169cb255c483 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1235,3 +1235,17 @@ u32 amd_get_highest_perf(void) return 255; } EXPORT_SYMBOL_GPL(amd_get_highest_perf); + +bool cpu_has_ibpb_brtype_microcode(void) +{ + u8 fam = boot_cpu_data.x86; + + if (fam == 0x17) { + /* Zen1/2 IBPB flushes branch type predictions too. */ + return boot_cpu_has(X86_FEATURE_AMD_IBPB); + } else if (fam == 0x19) { + return false; + } + + return false; +} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 9e2a91830f72..31cef61da03a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -47,6 +47,7 @@ static void __init taa_select_mitigation(void); static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); +static void __init srso_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -160,6 +161,7 @@ void __init cpu_select_mitigations(void) md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); + srso_select_mitigation(); } /* @@ -2185,6 +2187,95 @@ static int __init l1tf_cmdline(char *str) } early_param("l1tf", l1tf_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt + +enum srso_mitigation { + SRSO_MITIGATION_NONE, + SRSO_MITIGATION_MICROCODE, + SRSO_MITIGATION_SAFE_RET, +}; + +enum srso_mitigation_cmd { + SRSO_CMD_OFF, + SRSO_CMD_MICROCODE, + SRSO_CMD_SAFE_RET, +}; + +static const char * const srso_strings[] = { + [SRSO_MITIGATION_NONE] = "Vulnerable", + [SRSO_MITIGATION_MICROCODE] = "Mitigation: microcode", + [SRSO_MITIGATION_SAFE_RET] = "Mitigation: safe RET", +}; + +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; +static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; + +static int __init srso_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) + srso_cmd = SRSO_CMD_OFF; + else if (!strcmp(str, "microcode")) + srso_cmd = SRSO_CMD_MICROCODE; + else if (!strcmp(str, "safe-ret")) + srso_cmd = SRSO_CMD_SAFE_RET; + else + pr_err("Ignoring unknown SRSO option (%s).", str); + + return 0; +} +early_param("spec_rstack_overflow", srso_parse_cmdline); + +#define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options." + +static void __init srso_select_mitigation(void) +{ + bool has_microcode; + + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + return; + + has_microcode = cpu_has_ibpb_brtype_microcode(); + if (!has_microcode) { + pr_warn("IBPB-extending microcode not applied!\n"); + pr_warn(SRSO_NOTICE); + } + + switch (srso_cmd) { + case SRSO_CMD_OFF: + return; + + case SRSO_CMD_MICROCODE: + if (has_microcode) { + srso_mitigation = SRSO_MITIGATION_MICROCODE; + pr_warn(SRSO_NOTICE); + } + break; + + case SRSO_CMD_SAFE_RET: + if (IS_ENABLED(CONFIG_CPU_SRSO)) { + if (boot_cpu_data.x86 == 0x19) + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + else + setup_force_cpu_cap(X86_FEATURE_SRSO); + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + } else { + pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); + return; + } + break; + + default: + break; + + } + + pr_info("%s%s\n", srso_strings[srso_mitigation], (has_microcode ? "" : ", no microcode")); +} + #undef pr_fmt #define pr_fmt(fmt) fmt @@ -2382,6 +2473,13 @@ static ssize_t retbleed_show_state(char *buf) return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); } +static ssize_t srso_show_state(char *buf) +{ + return sysfs_emit(buf, "%s%s\n", + srso_strings[srso_mitigation], + (cpu_has_ibpb_brtype_microcode() ? "" : ", no microcode")); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -2431,6 +2529,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RETBLEED: return retbleed_show_state(buf); + case X86_BUG_SRSO: + return srso_show_state(buf); + default: break; } @@ -2495,4 +2596,9 @@ ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, cha { return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); } + +ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRSO); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 52683fddafaf..d4d823eae0fc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1250,6 +1250,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define RETBLEED BIT(3) /* CPU is affected by SMT (cross-thread) return predictions */ #define SMT_RSB BIT(4) +/* CPU is affected by SRSO */ +#define SRSO BIT(5) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1281,8 +1283,9 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_AMD(0x15, RETBLEED), VULNBL_AMD(0x16, RETBLEED), - VULNBL_AMD(0x17, RETBLEED | SMT_RSB), + VULNBL_AMD(0x17, RETBLEED | SMT_RSB | SRSO), VULNBL_HYGON(0x18, RETBLEED | SMT_RSB), + VULNBL_AMD(0x19, SRSO), {} }; @@ -1406,6 +1409,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_matches(cpu_vuln_blacklist, SMT_RSB)) setup_force_cpu_bug(X86_BUG_SMT_RSB); + if (cpu_matches(cpu_vuln_blacklist, SRSO)) + setup_force_cpu_bug(X86_BUG_SRSO); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 03c885d3640f..e76813230192 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -134,13 +134,27 @@ SECTIONS SOFTIRQENTRY_TEXT #ifdef CONFIG_RETPOLINE __indirect_thunk_start = .; - *(.text.__x86.*) + *(.text.__x86.indirect_thunk) + *(.text.__x86.return_thunk) __indirect_thunk_end = .; #endif STATIC_CALL_TEXT ALIGN_ENTRY_TEXT_BEGIN +#ifdef CONFIG_CPU_SRSO + *(.text.__x86.rethunk_untrain) +#endif + ENTRY_TEXT + +#ifdef CONFIG_CPU_SRSO + /* + * See the comment above srso_untrain_ret_alias()'s + * definition. + */ + . = srso_untrain_ret_alias | (1 << 2) | (1 << 8) | (1 << 14) | (1 << 20); + *(.text.__x86.rethunk_safe) +#endif ALIGN_ENTRY_TEXT_END *(.gnu.warning) @@ -509,7 +523,18 @@ INIT_PER_CPU(irq_stack_backing_store); #endif #ifdef CONFIG_RETHUNK -. = ASSERT((__x86_return_thunk & 0x3f) == 0, "__x86_return_thunk not cacheline-aligned"); +. = ASSERT((__ret & 0x3f) == 0, "__ret not cacheline-aligned"); +. = ASSERT((srso_safe_ret & 0x3f) == 0, "srso_safe_ret not cacheline-aligned"); +#endif + +#ifdef CONFIG_CPU_SRSO +/* + * GNU ld cannot do XOR so do: (A | B) - (A & B) in order to compute the XOR + * of the two function addresses: + */ +. = ASSERT(((srso_untrain_ret_alias | srso_safe_ret_alias) - + (srso_untrain_ret_alias & srso_safe_ret_alias)) == ((1 << 2) | (1 << 8) | (1 << 14) | (1 << 20)), + "SRSO function pair won't alias"); #endif #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 3fd066d42ec0..845cfb0d748f 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -11,6 +11,7 @@ #include #include #include +#include .section .text.__x86.indirect_thunk @@ -131,6 +132,45 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) */ #ifdef CONFIG_RETHUNK +/* + * srso_untrain_ret_alias() and srso_safe_ret_alias() are placed at + * special addresses: + * + * - srso_untrain_ret_alias() is 2M aligned + * - srso_safe_ret_alias() is also in the same 2M page but bits 2, 8, 14 + * and 20 in its virtual address are set (while those bits in the + * srso_untrain_ret_alias() function are cleared). + * + * This guarantees that those two addresses will alias in the branch + * target buffer of Zen3/4 generations, leading to any potential + * poisoned entries at that BTB slot to get evicted. + * + * As a result, srso_safe_ret_alias() becomes a safe return. + */ +#ifdef CONFIG_CPU_SRSO + .section .text.__x86.rethunk_untrain + +SYM_START(srso_untrain_ret_alias, SYM_L_GLOBAL, SYM_A_NONE) + ASM_NOP2 + lfence + jmp __x86_return_thunk +SYM_FUNC_END(srso_untrain_ret_alias) +__EXPORT_THUNK(srso_untrain_ret_alias) + + .section .text.__x86.rethunk_safe +#endif + +/* Needs a definition for the __x86_return_thunk alternative below. */ +SYM_START(srso_safe_ret_alias, SYM_L_GLOBAL, SYM_A_NONE) +#ifdef CONFIG_CPU_SRSO + add $8, %_ASM_SP + UNWIND_HINT_FUNC +#endif + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_FUNC_END(srso_safe_ret_alias) + .section .text.__x86.return_thunk /* @@ -143,7 +183,7 @@ SYM_CODE_END(__x86_indirect_jump_thunk_array) * from re-poisioning the BTB prediction. */ .align 64 - .skip 64 - (__x86_return_thunk - zen_untrain_ret), 0xcc + .skip 64 - (__ret - zen_untrain_ret), 0xcc SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) ANNOTATE_NOENDBR /* @@ -175,10 +215,10 @@ SYM_START(zen_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) * evicted, __x86_return_thunk will suffer Straight Line Speculation * which will be contained safely by the INT3. */ -SYM_INNER_LABEL(__x86_return_thunk, SYM_L_GLOBAL) +SYM_INNER_LABEL(__ret, SYM_L_GLOBAL) ret int3 -SYM_CODE_END(__x86_return_thunk) +SYM_CODE_END(__ret) /* * Ensure the TEST decoding / BTB invalidation is complete. @@ -189,11 +229,45 @@ SYM_CODE_END(__x86_return_thunk) * Jump back and execute the RET in the middle of the TEST instruction. * INT3 is for SLS protection. */ - jmp __x86_return_thunk + jmp __ret int3 SYM_FUNC_END(zen_untrain_ret) __EXPORT_THUNK(zen_untrain_ret) +/* + * SRSO untraining sequence for Zen1/2, similar to zen_untrain_ret() + * above. On kernel entry, srso_untrain_ret() is executed which is a + * + * movabs $0xccccccc308c48348,%rax + * + * and when the return thunk executes the inner label srso_safe_ret() + * later, it is a stack manipulation and a RET which is mispredicted and + * thus a "safe" one to use. + */ + .align 64 + .skip 64 - (srso_safe_ret - srso_untrain_ret), 0xcc +SYM_START(srso_untrain_ret, SYM_L_GLOBAL, SYM_A_NONE) + ANNOTATE_NOENDBR + .byte 0x48, 0xb8 + +SYM_INNER_LABEL(srso_safe_ret, SYM_L_GLOBAL) + add $8, %_ASM_SP + ret + int3 + int3 + int3 + lfence + call srso_safe_ret + int3 +SYM_CODE_END(srso_safe_ret) +SYM_FUNC_END(srso_untrain_ret) +__EXPORT_THUNK(srso_untrain_ret) + +SYM_FUNC_START(__x86_return_thunk) + ALTERNATIVE_2 "jmp __ret", "call srso_safe_ret", X86_FEATURE_SRSO, \ + "call srso_safe_ret_alias", X86_FEATURE_SRSO_ALIAS + int3 +SYM_CODE_END(__x86_return_thunk) EXPORT_SYMBOL(__x86_return_thunk) #endif /* CONFIG_RETHUNK */ diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index c1815b9dae68..f111586d1cce 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -577,6 +577,12 @@ ssize_t __weak cpu_show_retbleed(struct device *dev, return sysfs_emit(buf, "Not affected\n"); } +ssize_t __weak cpu_show_spec_rstack_overflow(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -588,6 +594,7 @@ static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL); static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL); static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL); +static DEVICE_ATTR(spec_rstack_overflow, 0444, cpu_show_spec_rstack_overflow, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -601,6 +608,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_srbds.attr, &dev_attr_mmio_stale_data.attr, &dev_attr_retbleed.attr, + &dev_attr_spec_rstack_overflow.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6e6e57ec69e8..23ac87be1ff1 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -70,6 +70,8 @@ extern ssize_t cpu_show_mmio_stale_data(struct device *dev, char *buf); extern ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 2e1caabecb18..2d51fa8da9e8 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -824,5 +824,8 @@ bool arch_is_retpoline(struct symbol *sym) bool arch_is_rethunk(struct symbol *sym) { - return !strcmp(sym->name, "__x86_return_thunk"); + return !strcmp(sym->name, "__x86_return_thunk") || + !strcmp(sym->name, "srso_untrain_ret") || + !strcmp(sym->name, "srso_safe_ret") || + !strcmp(sym->name, "__ret"); } -- cgit v1.2.3 From 09eadda27ca4afc3f560efea265bbe7a93ef786d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 19 Mar 2023 22:29:28 +0100 Subject: backlight: corgi_lcd: fix missing prototype The corgi_lcd_limit_intensity() function is called from platform and defined in a driver, but the driver does not see the declaration: drivers/video/backlight/corgi_lcd.c:434:6: error: no previous prototype for 'corgi_lcd_limit_intensity' [-Werror=missing-prototypes] 434 | void corgi_lcd_limit_intensity(int limit) Move the prototype into a header that can be included from both sides to shut up the warning. Reviewed-by: Daniel Thompson Signed-off-by: Arnd Bergmann --- arch/arm/mach-pxa/sharpsl_pm.h | 1 - arch/arm/mach-pxa/spitz_pm.c | 1 + include/linux/spi/corgi_lcd.h | 2 ++ 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/arm/mach-pxa/sharpsl_pm.h b/arch/arm/mach-pxa/sharpsl_pm.h index 20e4cab64d85..623167f30ec2 100644 --- a/arch/arm/mach-pxa/sharpsl_pm.h +++ b/arch/arm/mach-pxa/sharpsl_pm.h @@ -105,5 +105,4 @@ void sharpsl_pm_led(int val); #define MAX1111_ACIN_VOLT 6u int sharpsl_pm_pxa_read_max1111(int channel); -void corgi_lcd_limit_intensity(int limit); #endif diff --git a/arch/arm/mach-pxa/spitz_pm.c b/arch/arm/mach-pxa/spitz_pm.c index 1c021cef965f..8bc4ea51a0c1 100644 --- a/arch/arm/mach-pxa/spitz_pm.c +++ b/arch/arm/mach-pxa/spitz_pm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/spi/corgi_lcd.h b/include/linux/spi/corgi_lcd.h index 0b857616919c..fc6c1515dc54 100644 --- a/include/linux/spi/corgi_lcd.h +++ b/include/linux/spi/corgi_lcd.h @@ -15,4 +15,6 @@ struct corgi_lcd_platform_data { void (*kick_battery)(void); }; +void corgi_lcd_limit_intensity(int limit); + #endif /* __LINUX_SPI_CORGI_LCD_H */ -- cgit v1.2.3 From 71c8f9cf2623d0db79665f876b95afcdd8214aec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jul 2023 21:00:25 +0200 Subject: mtd: spi-nor: avoid holes in struct spi_mem_op gcc gets confused when -ftrivial-auto-var-init=pattern is used on sparse bit fields such as 'struct spi_mem_op', which caused the previous false positive warning about an uninitialized variable: drivers/mtd/spi-nor/spansion.c: error: 'op' is used uninitialized [-Werror=uninitialized] In fact, the variable is fully initialized and gcc does not see it being used, so the warning is entirely bogus. The problem appears to be a misoptimization in the initialization of single bit fields when the rest of the bytes are not initialized. A previous workaround added another initialization, which ended up shutting up the warning in spansion.c, though it apparently still happens in other files as reported by Peter Foley in the gcc bugzilla. The workaround of adding a fake initialization seems particularly bad because it would set values that can never be correct but prevent the compiler from warning about actually missing initializations. Revert the broken workaround and instead pad the structure to only have bitfields that add up to full bytes, which should avoid this behavior in all drivers. I also filed a new bug against gcc with what I found, so this can hopefully be addressed in future gcc releases. At the moment, only gcc-12 and gcc-13 are affected. Cc: Peter Foley Cc: Pedro Falcato Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110743 Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108402 Link: https://godbolt.org/z/efMMsG1Kx Fixes: 420c4495b5e56 ("mtd: spi-nor: spansion: make sure local struct does not contain garbage") Signed-off-by: Arnd Bergmann Acked-by: Mark Brown Acked-by: Tudor Ambarus Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20230719190045.4007391-1-arnd@kernel.org --- drivers/mtd/spi-nor/spansion.c | 4 ++-- include/linux/spi/spi-mem.h | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/mtd/spi-nor/spansion.c b/drivers/mtd/spi-nor/spansion.c index 36876aa849ed..15f9a80c10b9 100644 --- a/drivers/mtd/spi-nor/spansion.c +++ b/drivers/mtd/spi-nor/spansion.c @@ -361,7 +361,7 @@ static int cypress_nor_determine_addr_mode_by_sr1(struct spi_nor *nor, */ static int cypress_nor_set_addr_mode_nbytes(struct spi_nor *nor) { - struct spi_mem_op op = {}; + struct spi_mem_op op; u8 addr_mode; int ret; @@ -492,7 +492,7 @@ s25fs256t_post_bfpt_fixup(struct spi_nor *nor, const struct sfdp_parameter_header *bfpt_header, const struct sfdp_bfpt *bfpt) { - struct spi_mem_op op = {}; + struct spi_mem_op op; int ret; ret = cypress_nor_set_addr_mode_nbytes(nor); diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 8e984d75f5b6..6b0a7dc48a4b 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -101,6 +101,7 @@ struct spi_mem_op { u8 nbytes; u8 buswidth; u8 dtr : 1; + u8 __pad : 7; u16 opcode; } cmd; @@ -108,6 +109,7 @@ struct spi_mem_op { u8 nbytes; u8 buswidth; u8 dtr : 1; + u8 __pad : 7; u64 val; } addr; @@ -115,12 +117,14 @@ struct spi_mem_op { u8 nbytes; u8 buswidth; u8 dtr : 1; + u8 __pad : 7; } dummy; struct { u8 buswidth; u8 dtr : 1; u8 ecc : 1; + u8 __pad : 6; enum spi_mem_data_dir dir; unsigned int nbytes; union { -- cgit v1.2.3 From b1f02b95758d05b799731d939e76a0bd6da312db Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Sat, 22 Jul 2023 00:51:07 +0200 Subject: mm: fix memory ordering for mm_lock_seq and vm_lock_seq mm->mm_lock_seq effectively functions as a read/write lock; therefore it must be used with acquire/release semantics. A specific example is the interaction between userfaultfd_register() and lock_vma_under_rcu(). userfaultfd_register() does the following from the point where it changes a VMA's flags to the point where concurrent readers are permitted again (in a simple scenario where only a single private VMA is accessed and no merging/splitting is involved): userfaultfd_register userfaultfd_set_vm_flags vm_flags_reset vma_start_write down_write(&vma->vm_lock->lock) vma->vm_lock_seq = mm_lock_seq [marks VMA as busy] up_write(&vma->vm_lock->lock) vm_flags_init [sets VM_UFFD_* in __vm_flags] vma->vm_userfaultfd_ctx.ctx = ctx mmap_write_unlock vma_end_write_all WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1) [unlocks VMA] There are no memory barriers in between the __vm_flags update and the mm->mm_lock_seq update that unlocks the VMA, so the unlock can be reordered to above the `vm_flags_init()` call, which means from the perspective of a concurrent reader, a VMA can be marked as a userfaultfd VMA while it is not VMA-locked. That's bad, we definitely need a store-release for the unlock operation. The non-atomic write to vma->vm_lock_seq in vma_start_write() is mostly fine because all accesses to vma->vm_lock_seq that matter are always protected by the VMA lock. There is a racy read in vma_start_read() though that can tolerate false-positives, so we should be using WRITE_ONCE() to keep things tidy and data-race-free (including for KCSAN). On the other side, lock_vma_under_rcu() works as follows in the relevant region for locking and userfaultfd check: lock_vma_under_rcu vma_start_read vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [early bailout] down_read_trylock(&vma->vm_lock->lock) vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [main check] userfaultfd_armed checks vma->vm_flags & __VM_UFFD_FLAGS Here, the interesting aspect is how far down the mm->mm_lock_seq read can be reordered - if this read is reordered down below the vma->vm_flags access, this could cause lock_vma_under_rcu() to partly operate on information that was read while the VMA was supposed to be locked. To prevent this kind of downwards bleeding of the mm->mm_lock_seq read, we need to read it with a load-acquire. Some of the comment wording is based on suggestions by Suren. BACKPORT WARNING: One of the functions changed by this patch (which I've written against Linus' tree) is vma_try_start_write(), but this function no longer exists in mm/mm-everything. I don't know whether the merged version of this patch will be ordered before or after the patch that removes vma_try_start_write(). If you're backporting this patch to a tree with vma_try_start_write(), make sure this patch changes that function. Link: https://lkml.kernel.org/r/20230721225107.942336-1-jannh@google.com Fixes: 5e31275cc997 ("mm: add per-VMA lock and helper functions to control it") Signed-off-by: Jann Horn Reviewed-by: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 29 +++++++++++++++++++++++------ include/linux/mm_types.h | 28 ++++++++++++++++++++++++++++ include/linux/mmap_lock.h | 10 ++++++++-- 3 files changed, 59 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2dd73e4f3d8e..406ab9ea818f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -641,8 +641,14 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} */ static inline bool vma_start_read(struct vm_area_struct *vma) { - /* Check before locking. A race might cause false locked result. */ - if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) + /* + * Check before locking. A race might cause false locked result. + * We can use READ_ONCE() for the mm_lock_seq here, and don't need + * ACQUIRE semantics, because this is just a lockless check whose result + * we don't rely on for anything - the mm_lock_seq read against which we + * need ordering is below. + */ + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -653,8 +659,13 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq * modification invalidates all existing locks. + * + * We must use ACQUIRE semantics for the mm_lock_seq so that if we are + * racing with vma_end_write_all(), we only start reading from the VMA + * after it has been unlocked. + * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -676,7 +687,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); + *mm_lock_seq = vma->vm_mm->mm_lock_seq; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -688,7 +699,13 @@ static inline void vma_start_write(struct vm_area_struct *vma) return; down_write(&vma->vm_lock->lock); - vma->vm_lock_seq = mm_lock_seq; + /* + * We should use WRITE_ONCE() here because we can have concurrent reads + * from the early lockless pessimistic check in vma_start_read(). + * We don't really care about the correctness of that early check, but + * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. + */ + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); up_write(&vma->vm_lock->lock); } @@ -702,7 +719,7 @@ static inline bool vma_try_start_write(struct vm_area_struct *vma) if (!down_write_trylock(&vma->vm_lock->lock)) return false; - vma->vm_lock_seq = mm_lock_seq; + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); up_write(&vma->vm_lock->lock); return true; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index de10fc797c8e..5e74ce4a28cd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -514,6 +514,20 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK + /* + * Can only be written (using WRITE_ONCE()) while holding both: + * - mmap_lock (in write mode) + * - vm_lock->lock (in write mode) + * Can be read reliably while holding one of: + * - mmap_lock (in read or write mode) + * - vm_lock->lock (in read or write mode) + * Can be read unreliably (using READ_ONCE()) for pessimistic bailout + * while holding nothing (except RCU to keep the VMA struct allocated). + * + * This sequence counter is explicitly allowed to overflow; sequence + * counter reuse can only lead to occasional unnecessary use of the + * slowpath. + */ int vm_lock_seq; struct vma_lock *vm_lock; @@ -679,6 +693,20 @@ struct mm_struct { * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK + /* + * This field has lock-like semantics, meaning it is sometimes + * accessed with ACQUIRE/RELEASE semantics. + * Roughly speaking, incrementing the sequence number is + * equivalent to releasing locks on VMAs; reading the sequence + * number can be part of taking a read lock on a VMA. + * + * Can be modified under write mmap_lock using RELEASE + * semantics. + * Can be read with no other protection when holding write + * mmap_lock. + * Can be read with ACQUIRE semantics if not holding write + * mmap_lock. + */ int mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index aab8f1b28d26..e05e167dbd16 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -76,8 +76,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); - /* No races during update due to exclusive mmap_lock being held */ - WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); + /* + * Nobody can concurrently modify mm->mm_lock_seq due to exclusive + * mmap_lock being held. + * We need RELEASE semantics here to ensure that preceding stores into + * the VMA take effect before we unlock it with this store. + * Pairs with ACQUIRE semantics in vma_start_read(). + */ + smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); } #else static inline void vma_end_write_all(struct mm_struct *mm) {} -- cgit v1.2.3 From 4d50e50045aa46d9f3e578ed2edea9bd0a123d24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jul 2023 14:58:15 +0000 Subject: net: flower: fix stack-out-of-bounds in fl_set_key_cfm() Typical misuse of nla_parse_nested(array, XXX_MAX, ...); array must be declared as struct nlattr *array[XXX_MAX + 1]; v2: Based on feedbacks from Ido Schimmel and Zahari Doychev, I also changed TCA_FLOWER_KEY_CFM_OPT_MAX and cfm_opt_policy definitions. syzbot reported: BUG: KASAN: stack-out-of-bounds in __nla_validate_parse+0x136/0x2bd0 lib/nlattr.c:588 Write of size 32 at addr ffffc90003a0ee20 by task syz-executor296/5014 CPU: 0 PID: 5014 Comm: syz-executor296 Not tainted 6.5.0-rc2-syzkaller-00307-gd192f5382581 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/12/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x1e7/0x2d0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0x163/0x540 mm/kasan/report.c:475 kasan_report+0x175/0x1b0 mm/kasan/report.c:588 kasan_check_range+0x27e/0x290 mm/kasan/generic.c:187 __asan_memset+0x23/0x40 mm/kasan/shadow.c:84 __nla_validate_parse+0x136/0x2bd0 lib/nlattr.c:588 __nla_parse+0x40/0x50 lib/nlattr.c:700 nla_parse_nested include/net/netlink.h:1262 [inline] fl_set_key_cfm+0x1e3/0x440 net/sched/cls_flower.c:1718 fl_set_key+0x2168/0x6620 net/sched/cls_flower.c:1884 fl_tmplt_create+0x1fe/0x510 net/sched/cls_flower.c:2666 tc_chain_tmplt_add net/sched/cls_api.c:2959 [inline] tc_ctl_chain+0x131d/0x1ac0 net/sched/cls_api.c:3068 rtnetlink_rcv_msg+0x82b/0xf50 net/core/rtnetlink.c:6424 netlink_rcv_skb+0x1df/0x430 net/netlink/af_netlink.c:2549 netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] netlink_unicast+0x7c3/0x990 net/netlink/af_netlink.c:1365 netlink_sendmsg+0xa2a/0xd60 net/netlink/af_netlink.c:1914 sock_sendmsg_nosec net/socket.c:725 [inline] sock_sendmsg net/socket.c:748 [inline] ____sys_sendmsg+0x592/0x890 net/socket.c:2494 ___sys_sendmsg net/socket.c:2548 [inline] __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2577 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f54c6150759 Code: 48 83 c4 28 c3 e8 d7 19 00 00 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffe06c30578 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f54c619902d RCX: 00007f54c6150759 RDX: 0000000000000000 RSI: 0000000020000280 RDI: 0000000000000003 RBP: 00007ffe06c30590 R08: 0000000000000000 R09: 00007ffe06c305f0 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f54c61c35f0 R13: 00007ffe06c30778 R14: 0000000000000001 R15: 0000000000000001 The buggy address belongs to stack of task syz-executor296/5014 and is located at offset 32 in frame: fl_set_key_cfm+0x0/0x440 net/sched/cls_flower.c:374 This frame has 1 object: [32, 56) 'nla_cfm_opt' The buggy address belongs to the virtual mapping at [ffffc90003a08000, ffffc90003a11000) created by: copy_process+0x5c8/0x4290 kernel/fork.c:2330 Fixes: 7cfffd5fed3e ("net: flower: add support for matching cfm fields") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Simon Horman Reviewed-by: Ido Schimmel Reviewed-by: Zahari Doychev Link: https://lore.kernel.org/r/20230726145815.943910-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 4 +++- net/sched/cls_flower.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7865f5a9885b..4f3932bb712d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -710,9 +710,11 @@ enum { TCA_FLOWER_KEY_CFM_OPT_UNSPEC, TCA_FLOWER_KEY_CFM_MD_LEVEL, TCA_FLOWER_KEY_CFM_OPCODE, - TCA_FLOWER_KEY_CFM_OPT_MAX, + __TCA_FLOWER_KEY_CFM_OPT_MAX, }; +#define TCA_FLOWER_KEY_CFM_OPT_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1) + #define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */ /* Match-all classifier */ diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8da9d039d964..9f0711da9c95 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -776,7 +776,8 @@ mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, }; -static const struct nla_policy cfm_opt_policy[TCA_FLOWER_KEY_CFM_OPT_MAX] = { +static const struct nla_policy +cfm_opt_policy[TCA_FLOWER_KEY_CFM_OPT_MAX + 1] = { [TCA_FLOWER_KEY_CFM_MD_LEVEL] = NLA_POLICY_MAX(NLA_U8, FLOW_DIS_CFM_MDL_MAX), [TCA_FLOWER_KEY_CFM_OPCODE] = { .type = NLA_U8 }, @@ -1709,7 +1710,7 @@ static int fl_set_key_cfm(struct nlattr **tb, struct fl_flow_key *mask, struct netlink_ext_ack *extack) { - struct nlattr *nla_cfm_opt[TCA_FLOWER_KEY_CFM_OPT_MAX]; + struct nlattr *nla_cfm_opt[TCA_FLOWER_KEY_CFM_OPT_MAX + 1]; int err; if (!tb[TCA_FLOWER_KEY_CFM]) -- cgit v1.2.3 From 800959e697de0c55613a2ce40bca52520a421c9f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 25 Jul 2023 21:48:08 +0800 Subject: ftrace: Remove unused extern declarations commit 6a9c981b1e96 ("ftrace: Remove unused function ftrace_arch_read_dyn_info()") left ftrace_arch_read_dyn_info() extern declaration. And commit 1d74f2a0f64b ("ftrace: remove ftrace_ip_converted()") leave ftrace_ip_converted() declaration. Link: https://lore.kernel.org/linux-trace-kernel/20230725134808.9716-1-yuehaibing@huawei.com Cc: Cc: Signed-off-by: YueHaibing Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ce156c7704ee..aad9cf8876b5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -684,7 +684,6 @@ void __init ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); /* defined in arch */ -extern int ftrace_ip_converted(unsigned long ip); extern int ftrace_dyn_arch_init(void); extern void ftrace_replace_code(int enable); extern int ftrace_update_ftrace_func(ftrace_func_t func); @@ -859,9 +858,6 @@ static inline int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_a } #endif -/* May be defined in arch */ -extern int ftrace_arch_read_dyn_info(char *buf, int size); - extern int skip_trace(unsigned long ip); extern void ftrace_module_init(struct module *mod); extern void ftrace_module_enable(struct module *mod); -- cgit v1.2.3 From 7938cd15436873f649f31cb867bac2d88ca564d0 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Thu, 27 Jul 2023 17:33:56 +0200 Subject: net: gro: fix misuse of CB in udp socket lookup This patch fixes a misuse of IP{6}CB(skb) in GRO, while calling to `udp6_lib_lookup2` when handling udp tunnels. `udp6_lib_lookup2` fetch the device from CB. The fix changes it to fetch the device from `skb->dev`. l3mdev case requires special attention since it has a master and a slave device. Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket") Reported-by: Gal Pressman Signed-off-by: Richard Gobert Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/gro.h | 43 +++++++++++++++++++++++++++++++++++++++++++ net/ipv4/udp.c | 8 ++++++-- net/ipv4/udp_offload.c | 7 +++++-- net/ipv6/udp.c | 8 ++++++-- net/ipv6/udp_offload.c | 7 +++++-- 5 files changed, 65 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index 75efa6fb8441..88644b3ca660 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -452,6 +452,49 @@ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, gro_normal_list(napi); } +/* This function is the alternative of 'inet_iif' and 'inet_sdif' + * functions in case we can not rely on fields of IPCB. + * + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. + * The caller must hold the RCU read lock. + */ +static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) +{ + *iif = inet_iif(skb) ?: skb->dev->ifindex; + *sdif = 0; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (netif_is_l3_slave(skb->dev)) { + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); + + *sdif = *iif; + *iif = master ? master->ifindex : 0; + } +#endif +} + +/* This function is the alternative of 'inet6_iif' and 'inet6_sdif' + * functions in case we can not rely on fields of IP6CB. + * + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. + * The caller must hold the RCU read lock. + */ +static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) +{ + /* using skb->dev->ifindex because skb_dst(skb) is not initialized */ + *iif = skb->dev->ifindex; + *sdif = 0; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (netif_is_l3_slave(skb->dev)) { + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); + + *sdif = *iif; + *iif = master ? master->ifindex : 0; + } +#endif +} + extern struct list_head offload_base; #endif /* _NET_IPV6_GRO_H */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 42a96b3547c9..abfa860367aa 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -114,6 +114,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include #endif @@ -555,10 +556,13 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); + int iif, sdif; + + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - inet_sdif(skb), net->ipv4.udp_table, NULL); + iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index f402946da344..0f46b3c2e4ac 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -609,10 +609,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net(skb->dev); + int iif, sdif; + + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, - iph->daddr, dport, inet_iif(skb), - inet_sdif(skb), net->ipv4.udp_table, NULL); + iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } INDIRECT_CALLABLE_SCOPE diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b7c972aa09a7..e5da5d1cb215 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -51,6 +51,7 @@ #include #include #include +#include #include #include @@ -300,10 +301,13 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); + int iif, sdif; + + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, - &iph->daddr, dport, inet6_iif(skb), - inet6_sdif(skb), net->ipv4.udp_table, NULL); + &iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 09fa7a42cb93..6b95ba241ebe 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,10 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net(skb->dev); + int iif, sdif; + + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, - &iph->daddr, dport, inet6_iif(skb), - inet6_sdif(skb), net->ipv4.udp_table, NULL); + &iph->daddr, dport, iif, + sdif, net->ipv4.udp_table, NULL); } INDIRECT_CALLABLE_SCOPE -- cgit v1.2.3 From 3c5b4d69c358a9275a8de98f87caf6eda644b086 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 28 Jul 2023 15:03:15 +0000 Subject: net: annotate data-races around sk->sk_mark sk->sk_mark is often read while another thread could change the value. Fixes: 4a19ec5800fc ("[NET]: Introducing socket mark socket option.") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_sock.h | 7 ++++--- include/net/ip.h | 2 +- include/net/route.h | 4 ++-- net/can/raw.c | 2 +- net/core/sock.c | 4 ++-- net/dccp/ipv6.c | 4 ++-- net/ipv4/inet_diag.c | 4 ++-- net/ipv4/ip_output.c | 4 ++-- net/ipv4/route.c | 4 ++-- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 4 ++-- net/ipv6/route.c | 7 ++++--- net/ipv6/tcp_ipv6.c | 6 +++--- net/ipv6/udp.c | 4 ++-- net/l2tp/l2tp_ip6.c | 2 +- net/mptcp/sockopt.c | 2 +- net/netfilter/nft_socket.c | 2 +- net/netfilter/xt_socket.c | 4 ++-- net/packet/af_packet.c | 6 +++--- net/smc/af_smc.c | 2 +- net/xdp/xsk.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 23 files changed, 42 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index caa20a905531..0bb32bfc6183 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -107,11 +107,12 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { - if (!sk->sk_mark && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) + u32 mark = READ_ONCE(sk->sk_mark); + + if (!mark && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) return skb->mark; - return sk->sk_mark; + return mark; } static inline int inet_request_bound_dev_if(const struct sock *sk, diff --git a/include/net/ip.h b/include/net/ip.h index 50d435855ae2..332521170d9b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -93,7 +93,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, { ipcm_init(ipcm); - ipcm->sockc.mark = inet->sk.sk_mark; + ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); ipcm->sockc.tsflags = inet->sk.sk_tsflags; ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; diff --git a/include/net/route.h b/include/net/route.h index 5a5c726472bd..8c2a8e7d8f8e 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -168,7 +168,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi __be16 dport, __be16 sport, __u8 proto, __u8 tos, int oif) { - flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, + flowi4_init_output(fl4, oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos, RT_SCOPE_UNIVERSE, proto, sk ? inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); @@ -301,7 +301,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_sk(sk)->transparent) flow_flags |= FLOWI_FLAG_ANYSRC; - flowi4_init_output(fl4, oif, sk->sk_mark, ip_sock_rt_tos(sk), + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); } diff --git a/net/can/raw.c b/net/can/raw.c index ba6b52b1d776..e10f59375659 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -865,7 +865,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->dev = dev; skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, sockc.tsflags); diff --git a/net/core/sock.c b/net/core/sock.c index 96616eb3869d..d831a3df2cef 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -990,7 +990,7 @@ EXPORT_SYMBOL(sock_set_rcvbuf); static void __sock_set_mark(struct sock *sk, u32 val) { if (val != sk->sk_mark) { - sk->sk_mark = val; + WRITE_ONCE(sk->sk_mark, val); sk_dst_reset(sk); } } @@ -1851,7 +1851,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, optval, optlen, len); case SO_MARK: - v.val = sk->sk_mark; + v.val = READ_ONCE(sk->sk_mark); break; case SO_RCVMARK: diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 7249ef218178..d29d1163203d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -238,8 +238,8 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass, - sk->sk_priority); + err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt, + np->tclass, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index b812eb36f0e3..f7426926a104 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -150,7 +150,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, } #endif - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark)) + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, READ_ONCE(sk->sk_mark))) goto errout; if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || @@ -799,7 +799,7 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; if (sk_fullsock(sk)) - entry.mark = sk->sk_mark; + entry.mark = READ_ONCE(sk->sk_mark); else if (sk->sk_state == TCP_NEW_SYN_RECV) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else if (sk->sk_state == TCP_TIME_WAIT) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e70839257f7..bcdbf448324a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -186,7 +186,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, skb->priority = sk->sk_priority; if (!skb->mark) - skb->mark = sk->sk_mark; + skb->mark = READ_ONCE(sk->sk_mark); /* Send it out. */ return ip_local_out(net, skb->sk, skb); @@ -529,7 +529,7 @@ packet_routed: /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = READ_ONCE(sk->sk_mark); res = ip_local_out(net, sk, skb); rcu_read_unlock(); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 98d7e6ba7493..92fede388d52 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -518,7 +518,7 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct inet_sock *inet = inet_sk(sk); oif = sk->sk_bound_dev_if; - mark = sk->sk_mark; + mark = READ_ONCE(sk->sk_mark); tos = ip_sock_rt_tos(sk); scope = ip_sock_rt_scope(sk); prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol; @@ -552,7 +552,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; - flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, + flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk) & IPTOS_RT_MASK, ip_sock_rt_scope(sk), inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 069642014636..894653be033a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -931,7 +931,7 @@ static void tcp_v4_send_ack(const struct sock *sk, ctl_sk = this_cpu_read(ipv4_tcp_sk); sock_net_set(ctl_sk, net); ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : sk->sk_priority; transmit_time = tcp_transmit_time(sk); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index f804c11e2146..c2c291827a2c 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -120,7 +120,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init_sk(&ipc6, np); ipc6.sockc.tsflags = sk->sk_tsflags; - ipc6.sockc.mark = sk->sk_mark; + ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ac1cef094c5f..39b7d727ba40 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -774,12 +774,12 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); ipc6.sockc.tsflags = sk->sk_tsflags; - ipc6.sockc.mark = sk->sk_mark; + ipc6.sockc.mark = fl6.flowi6_mark; if (sin6) { if (addr_len < SIN6_LEN_RFC2133) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 64e873f5895f..56a55585eb79 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2951,7 +2951,8 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) if (!oif && skb->dev) oif = l3mdev_master_ifindex(skb->dev); - ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid); + ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), + sk->sk_uid); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || @@ -3172,8 +3173,8 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { - ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, - sk->sk_uid); + ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, + READ_ONCE(sk->sk_mark), sk->sk_uid); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4714eb695913..3ec563742ac4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -564,8 +564,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, fl6, skb->mark ? : sk->sk_mark, opt, - tclass, sk->sk_priority); + err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark), + opt, tclass, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } @@ -939,7 +939,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else - mark = sk->sk_mark; + mark = READ_ONCE(sk->sk_mark); skb_set_delivery_time(buff, tcp_transmit_time(sk), true); } if (txhash) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e5da5d1cb215..f787e6b8424c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -628,7 +628,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) { if (tunnel) { ip6_redirect(skb, sock_net(sk), inet6_iif(skb), - sk->sk_mark, sk->sk_uid); + READ_ONCE(sk->sk_mark), sk->sk_uid); } else { ip6_sk_redirect(skb, sk); } @@ -1360,7 +1360,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipcm6_init(&ipc6); ipc6.gso_size = READ_ONCE(up->gso_size); ipc6.sockc.tsflags = sk->sk_tsflags; - ipc6.sockc.mark = sk->sk_mark; + ipc6.sockc.mark = READ_ONCE(sk->sk_mark); /* destination address check */ if (sin6) { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index b1623f9c4f92..ff78217f0cb1 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -519,7 +519,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* Get and verify the address */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = sk->sk_mark; + fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; ipcm6_init(&ipc6); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 63f7a09335c5..a3f1fe810cc9 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -103,7 +103,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in break; case SO_MARK: if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) { - ssk->sk_mark = sk->sk_mark; + WRITE_ONCE(ssk->sk_mark, sk->sk_mark); sk_dst_reset(ssk); } break; diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 84def74698b7..9ed85be79452 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -107,7 +107,7 @@ static void nft_socket_eval(const struct nft_expr *expr, break; case NFT_SOCKET_MARK: if (sk_fullsock(sk)) { - *dest = sk->sk_mark; + *dest = READ_ONCE(sk->sk_mark); } else { regs->verdict.code = NFT_BREAK; return; diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 7013f55f05d1..76e01f292aaf 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -77,7 +77,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) - pskb->mark = sk->sk_mark; + pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); @@ -138,7 +138,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && transparent && sk_fullsock(sk)) - pskb->mark = sk->sk_mark; + pskb->mark = READ_ONCE(sk->sk_mark); if (sk != skb->sk) sock_gen_put(sk); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8e3ddec4c3d5..d9aa21a2b3a1 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2051,7 +2051,7 @@ retry: skb->protocol = proto; skb->dev = dev; skb->priority = sk->sk_priority; - skb->mark = sk->sk_mark; + skb->mark = READ_ONCE(sk->sk_mark); skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, sockc.tsflags); @@ -2586,7 +2586,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->protocol = proto; skb->dev = dev; skb->priority = po->sk.sk_priority; - skb->mark = po->sk.sk_mark; + skb->mark = READ_ONCE(po->sk.sk_mark); skb->tstamp = sockc->transmit_time; skb_setup_tx_timestamp(skb, sockc->tsflags); skb_zcopy_set_nouarg(skb, ph.raw); @@ -2988,7 +2988,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; sockcm_init(&sockc, sk); - sockc.mark = sk->sk_mark; + sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a7f887d91d89..0c013d2b5d8f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -445,7 +445,7 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, nsk->sk_rcvbuf = osk->sk_rcvbuf; nsk->sk_sndtimeo = osk->sk_sndtimeo; nsk->sk_rcvtimeo = osk->sk_rcvtimeo; - nsk->sk_mark = osk->sk_mark; + nsk->sk_mark = READ_ONCE(osk->sk_mark); nsk->sk_priority = osk->sk_priority; nsk->sk_rcvlowat = osk->sk_rcvlowat; nsk->sk_bound_dev_if = osk->sk_bound_dev_if; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 31dca4ecb2c5..b89adb52a977 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -505,7 +505,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb->dev = dev; skb->priority = xs->sk.sk_priority; - skb->mark = xs->sk.sk_mark; + skb->mark = READ_ONCE(xs->sk.sk_mark); skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; skb->destructor = xsk_destruct_skb; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e7617c9959c3..d6b405782b63 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2250,7 +2250,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, match = xfrm_selector_match(&pol->selector, fl, family); if (match) { - if ((sk->sk_mark & pol->mark.m) != pol->mark.v || + if ((READ_ONCE(sk->sk_mark) & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; -- cgit v1.2.3 From 83c35180abfdfb22f3d7703b0c85ad2d442ed2c5 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 25 Jul 2023 08:42:10 +0300 Subject: serial: core: Controller id cannot be negative The controller id cannot be negative. Let's fix the ctrl_id in preparation for adding port_id to fix the device name. Fixes: 84a9582fd203 ("serial: core: Start managing serial controllers to enable runtime PM") Reported-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20230725054216.45696-2-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 6d58c57acdaa..201813d888df 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -459,7 +459,7 @@ struct uart_port { struct serial_rs485 *rs485); int (*iso7816_config)(struct uart_port *, struct serial_iso7816 *iso7816); - int ctrl_id; /* optional serial core controller id */ + unsigned int ctrl_id; /* optional serial core controller id */ unsigned int irq; /* irq number */ unsigned long irqflags; /* irq flags */ unsigned int uartclk; /* base uart clock */ -- cgit v1.2.3 From d962de6ae51f9b76ad736220077cda83084090b1 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Tue, 25 Jul 2023 08:42:11 +0300 Subject: serial: core: Fix serial core port id to not use port->line The serial core port id should be serial core controller specific port instance, which is not always the port->line index. For example, 8250 driver maps a number of legacy ports, and when a hardware specific device driver takes over, we typically have one driver instance for each port. Let's instead add port->port_id to keep track serial ports mapped to each serial core controller instance. Currently this is only a cosmetic issue for the serial core port device names. The issue can be noticed looking at /sys/bus/serial-base/devices for example though. Let's fix the issue to avoid port addressing issues later on. Fixes: 84a9582fd203 ("serial: core: Start managing serial controllers to enable runtime PM") Reviewed-by: Andy Shevchenko Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20230725054216.45696-3-tony@atomide.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 2 ++ drivers/tty/serial/serial_base_bus.c | 2 +- include/linux/serial_core.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index 914e0e6251bf..6fd9ebff81a4 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -497,6 +497,7 @@ static struct uart_8250_port *serial8250_setup_port(int index) up = &serial8250_ports[index]; up->port.line = index; + up->port.port_id = index; serial8250_init_port(up); if (!base_ops) @@ -1040,6 +1041,7 @@ int serial8250_register_8250_port(const struct uart_8250_port *up) uart_remove_one_port(&serial8250_reg, &uart->port); uart->port.ctrl_id = up->port.ctrl_id; + uart->port.port_id = up->port.port_id; uart->port.iobase = up->port.iobase; uart->port.membase = up->port.membase; uart->port.irq = up->port.irq; diff --git a/drivers/tty/serial/serial_base_bus.c b/drivers/tty/serial/serial_base_bus.c index 6ff59c89d867..e6f457b268d9 100644 --- a/drivers/tty/serial/serial_base_bus.c +++ b/drivers/tty/serial/serial_base_bus.c @@ -136,7 +136,7 @@ struct serial_port_device *serial_base_port_add(struct uart_port *port, err = serial_base_device_init(port, &port_dev->dev, &ctrl_dev->dev, &serial_port_type, serial_base_port_release, - port->line); + port->port_id); if (err) goto err_put_device; diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 201813d888df..a156d2ed8d9e 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -460,6 +460,7 @@ struct uart_port { int (*iso7816_config)(struct uart_port *, struct serial_iso7816 *iso7816); unsigned int ctrl_id; /* optional serial core controller id */ + unsigned int port_id; /* optional serial core port id */ unsigned int irq; /* irq number */ unsigned long irqflags; /* irq flags */ unsigned int uartclk; /* base uart clock */ -- cgit v1.2.3 From f3ec2b5d879ef5bbcb24678914641343cb6399a2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 31 Jul 2023 14:38:27 +0300 Subject: xfrm: don't skip free of empty state in acquire policy In destruction flow, the assignment of NULL to xso->dev caused to skip of xfrm_dev_state_free() call, which was called in xfrm_state_put(to_put) routine. Instead of open-coded variant of xfrm_dev_state_delete() and xfrm_dev_state_free(), let's use them directly. Fixes: f8a70afafc17 ("xfrm: add TX datapath support for IPsec packet offload mode") Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 + net/xfrm/xfrm_state.c | 8 ++------ 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 151ca95dd08d..363c7d510554 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1984,6 +1984,7 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) if (dev->xfrmdev_ops->xdo_dev_state_free) dev->xfrmdev_ops->xdo_dev_state_free(x); xso->dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); } } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 49e63eea841d..bda5327bf34d 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1324,12 +1324,8 @@ found: struct xfrm_dev_offload *xso = &x->xso; if (xso->type == XFRM_DEV_OFFLOAD_PACKET) { - xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); - xso->dir = 0; - netdev_put(xso->dev, &xso->dev_tracker); - xso->dev = NULL; - xso->real_dev = NULL; - xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + xfrm_dev_state_delete(x); + xfrm_dev_state_free(x); } #endif x->km.state = XFRM_STATE_DEAD; -- cgit v1.2.3 From 16e95a62eed18864aecac404f1e4eed764c363f2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 25 Jul 2023 13:39:12 +0800 Subject: powercap: intel_rapl: Fix a sparse warning in TPMI interface Depends on the interface used, the RAPL registers can be either MSR indexes or memory mapped IO addresses. Current RAPL common code uses u64 to save both MSR and memory mapped IO registers. With this, when handling register address with an __iomem annotation, it triggers a sparse warning like below: sparse warnings: (new ones prefixed by >>) >> drivers/powercap/intel_rapl_tpmi.c:141:41: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected unsigned long long [usertype] *tpmi_rapl_regs @@ got void [noderef] __iomem * @@ drivers/powercap/intel_rapl_tpmi.c:141:41: sparse: expected unsigned long long [usertype] *tpmi_rapl_regs drivers/powercap/intel_rapl_tpmi.c:141:41: sparse: got void [noderef] __iomem * Fix the problem by using a union to save the registers instead. Suggested-by: David Laight Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202307031405.dy3druuy-lkp@intel.com/ Tested-by: Wang Wendy Signed-off-by: Zhang Rui [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 14 +++---- drivers/powercap/intel_rapl_msr.c | 49 +++++++++++----------- drivers/powercap/intel_rapl_tpmi.c | 17 ++++---- .../intel/int340x_thermal/processor_thermal_rapl.c | 16 +++---- include/linux/intel_rapl.h | 14 +++++-- 5 files changed, 58 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 4e646e5e48f6..8fac57b28f8a 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -818,7 +818,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, return -EINVAL; ra.reg = rd->regs[rpi->id]; - if (!ra.reg) + if (!ra.reg.val) return -EINVAL; /* non-hardware data are collected by the polling thread */ @@ -830,7 +830,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, ra.mask = rpi->mask; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { - pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg, rd->rp->name, rd->name); + pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name); return -EIO; } @@ -920,7 +920,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) ra.mask = ~0; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", - ra.reg, rd->rp->name, rd->name); + ra.reg.val, rd->rp->name, rd->name); return -ENODEV; } @@ -948,7 +948,7 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) ra.mask = ~0; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", - ra.reg, rd->rp->name, rd->name); + ra.reg.val, rd->rp->name, rd->name); return -ENODEV; } @@ -1135,7 +1135,7 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) ra.mask = ~0; if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", - ra.reg, rd->rp->name, rd->name); + ra.reg.val, rd->rp->name, rd->name); return -ENODEV; } @@ -1411,8 +1411,8 @@ static int rapl_get_domain_unit(struct rapl_domain *rd) struct rapl_defaults *defaults = get_defaults(rd->rp); int ret; - if (!rd->regs[RAPL_DOMAIN_REG_UNIT]) { - if (!rd->rp->priv->reg_unit) { + if (!rd->regs[RAPL_DOMAIN_REG_UNIT].val) { + if (!rd->rp->priv->reg_unit.val) { pr_err("No valid Unit register found\n"); return -ENODEV; } diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 569e25eab1e1..dd471021f237 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -34,28 +34,32 @@ static struct rapl_if_priv *rapl_msr_priv; static struct rapl_if_priv rapl_msr_priv_intel = { .type = RAPL_IF_MSR, - .reg_unit = MSR_RAPL_POWER_UNIT, - .regs[RAPL_DOMAIN_PACKAGE] = { - MSR_PKG_POWER_LIMIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_PERF_STATUS, 0, MSR_PKG_POWER_INFO }, - .regs[RAPL_DOMAIN_PP0] = { - MSR_PP0_POWER_LIMIT, MSR_PP0_ENERGY_STATUS, 0, MSR_PP0_POLICY, 0 }, - .regs[RAPL_DOMAIN_PP1] = { - MSR_PP1_POWER_LIMIT, MSR_PP1_ENERGY_STATUS, 0, MSR_PP1_POLICY, 0 }, - .regs[RAPL_DOMAIN_DRAM] = { - MSR_DRAM_POWER_LIMIT, MSR_DRAM_ENERGY_STATUS, MSR_DRAM_PERF_STATUS, 0, MSR_DRAM_POWER_INFO }, - .regs[RAPL_DOMAIN_PLATFORM] = { - MSR_PLATFORM_POWER_LIMIT, MSR_PLATFORM_ENERGY_STATUS, 0, 0, 0}, + .reg_unit.msr = MSR_RAPL_POWER_UNIT, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_LIMIT].msr = MSR_PKG_POWER_LIMIT, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_STATUS].msr = MSR_PKG_ENERGY_STATUS, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_PERF].msr = MSR_PKG_PERF_STATUS, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_INFO].msr = MSR_PKG_POWER_INFO, + .regs[RAPL_DOMAIN_PP0][RAPL_DOMAIN_REG_LIMIT].msr = MSR_PP0_POWER_LIMIT, + .regs[RAPL_DOMAIN_PP0][RAPL_DOMAIN_REG_STATUS].msr = MSR_PP0_ENERGY_STATUS, + .regs[RAPL_DOMAIN_PP0][RAPL_DOMAIN_REG_POLICY].msr = MSR_PP0_POLICY, + .regs[RAPL_DOMAIN_PP1][RAPL_DOMAIN_REG_LIMIT].msr = MSR_PP1_POWER_LIMIT, + .regs[RAPL_DOMAIN_PP1][RAPL_DOMAIN_REG_STATUS].msr = MSR_PP1_ENERGY_STATUS, + .regs[RAPL_DOMAIN_PP1][RAPL_DOMAIN_REG_POLICY].msr = MSR_PP1_POLICY, + .regs[RAPL_DOMAIN_DRAM][RAPL_DOMAIN_REG_LIMIT].msr = MSR_DRAM_POWER_LIMIT, + .regs[RAPL_DOMAIN_DRAM][RAPL_DOMAIN_REG_STATUS].msr = MSR_DRAM_ENERGY_STATUS, + .regs[RAPL_DOMAIN_DRAM][RAPL_DOMAIN_REG_PERF].msr = MSR_DRAM_PERF_STATUS, + .regs[RAPL_DOMAIN_DRAM][RAPL_DOMAIN_REG_INFO].msr = MSR_DRAM_POWER_INFO, + .regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_LIMIT].msr = MSR_PLATFORM_POWER_LIMIT, + .regs[RAPL_DOMAIN_PLATFORM][RAPL_DOMAIN_REG_STATUS].msr = MSR_PLATFORM_ENERGY_STATUS, .limits[RAPL_DOMAIN_PACKAGE] = BIT(POWER_LIMIT2), .limits[RAPL_DOMAIN_PLATFORM] = BIT(POWER_LIMIT2), }; static struct rapl_if_priv rapl_msr_priv_amd = { .type = RAPL_IF_MSR, - .reg_unit = MSR_AMD_RAPL_POWER_UNIT, - .regs[RAPL_DOMAIN_PACKAGE] = { - 0, MSR_AMD_PKG_ENERGY_STATUS, 0, 0, 0 }, - .regs[RAPL_DOMAIN_PP0] = { - 0, MSR_AMD_CORE_ENERGY_STATUS, 0, 0, 0 }, + .reg_unit.msr = MSR_AMD_RAPL_POWER_UNIT, + .regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_STATUS].msr = MSR_AMD_PKG_ENERGY_STATUS, + .regs[RAPL_DOMAIN_PP0][RAPL_DOMAIN_REG_STATUS].msr = MSR_AMD_CORE_ENERGY_STATUS, }; /* Handles CPU hotplug on multi-socket systems. @@ -99,10 +103,8 @@ static int rapl_cpu_down_prep(unsigned int cpu) static int rapl_msr_read_raw(int cpu, struct reg_action *ra) { - u32 msr = (u32)ra->reg; - - if (rdmsrl_safe_on_cpu(cpu, msr, &ra->value)) { - pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu); + if (rdmsrl_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) { + pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg.msr, cpu); return -EIO; } ra->value &= ra->mask; @@ -112,17 +114,16 @@ static int rapl_msr_read_raw(int cpu, struct reg_action *ra) static void rapl_msr_update_func(void *info) { struct reg_action *ra = info; - u32 msr = (u32)ra->reg; u64 val; - ra->err = rdmsrl_safe(msr, &val); + ra->err = rdmsrl_safe(ra->reg.msr, &val); if (ra->err) return; val &= ~ra->mask; val |= ra->value; - ra->err = wrmsrl_safe(msr, val); + ra->err = wrmsrl_safe(ra->reg.msr, val); } static int rapl_msr_write_raw(int cpu, struct reg_action *ra) @@ -171,7 +172,7 @@ static int rapl_msr_probe(struct platform_device *pdev) if (id) { rapl_msr_priv->limits[RAPL_DOMAIN_PACKAGE] |= BIT(POWER_LIMIT4); - rapl_msr_priv->regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_PL4] = + rapl_msr_priv->regs[RAPL_DOMAIN_PACKAGE][RAPL_DOMAIN_REG_PL4].msr = MSR_VR_CURRENT_CONFIG; pr_info("PL4 support detected.\n"); } diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 4f4f13ded225..891c90fefd8b 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -59,10 +59,10 @@ static struct powercap_control_type *tpmi_control_type; static int tpmi_rapl_read_raw(int id, struct reg_action *ra) { - if (!ra->reg) + if (!ra->reg.mmio) return -EINVAL; - ra->value = readq((void __iomem *)ra->reg); + ra->value = readq(ra->reg.mmio); ra->value &= ra->mask; return 0; @@ -72,15 +72,15 @@ static int tpmi_rapl_write_raw(int id, struct reg_action *ra) { u64 val; - if (!ra->reg) + if (!ra->reg.mmio) return -EINVAL; - val = readq((void __iomem *)ra->reg); + val = readq(ra->reg.mmio); val &= ~ra->mask; val |= ra->value; - writeq(val, (void __iomem *)ra->reg); + writeq(val, ra->reg.mmio); return 0; } @@ -138,8 +138,7 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) enum tpmi_rapl_register reg_index; enum rapl_domain_reg_id reg_id; int tpmi_domain_size, tpmi_domain_flags; - u64 *tpmi_rapl_regs = trp->base + offset; - u64 tpmi_domain_header = readq((void __iomem *)tpmi_rapl_regs); + u64 tpmi_domain_header = readq(trp->base + offset); /* Domain Parent bits are ignored for now */ tpmi_domain_version = tpmi_domain_header & 0xff; @@ -180,7 +179,7 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) return -EINVAL; } - if (trp->priv.regs[domain_type][RAPL_DOMAIN_REG_UNIT]) { + if (trp->priv.regs[domain_type][RAPL_DOMAIN_REG_UNIT].mmio) { pr_warn(FW_BUG "Duplicate Domain type %d\n", tpmi_domain_type); return -EINVAL; } @@ -218,7 +217,7 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) default: continue; } - trp->priv.regs[domain_type][reg_id] = (u64)&tpmi_rapl_regs[reg_index]; + trp->priv.regs[domain_type][reg_id].mmio = trp->base + offset + reg_index * 8; } return 0; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c index 013f1633f082..2f00fc3bf274 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c @@ -57,10 +57,10 @@ static int rapl_mmio_cpu_down_prep(unsigned int cpu) static int rapl_mmio_read_raw(int cpu, struct reg_action *ra) { - if (!ra->reg) + if (!ra->reg.mmio) return -EINVAL; - ra->value = readq((void __iomem *)ra->reg); + ra->value = readq(ra->reg.mmio); ra->value &= ra->mask; return 0; } @@ -69,13 +69,13 @@ static int rapl_mmio_write_raw(int cpu, struct reg_action *ra) { u64 val; - if (!ra->reg) + if (!ra->reg.mmio) return -EINVAL; - val = readq((void __iomem *)ra->reg); + val = readq(ra->reg.mmio); val &= ~ra->mask; val |= ra->value; - writeq(val, (void __iomem *)ra->reg); + writeq(val, ra->reg.mmio); return 0; } @@ -92,13 +92,13 @@ int proc_thermal_rapl_add(struct pci_dev *pdev, struct proc_thermal_device *proc for (domain = RAPL_DOMAIN_PACKAGE; domain < RAPL_DOMAIN_MAX; domain++) { for (reg = RAPL_DOMAIN_REG_LIMIT; reg < RAPL_DOMAIN_REG_MAX; reg++) if (rapl_regs->regs[domain][reg]) - rapl_mmio_priv.regs[domain][reg] = - (u64)proc_priv->mmio_base + + rapl_mmio_priv.regs[domain][reg].mmio = + proc_priv->mmio_base + rapl_regs->regs[domain][reg]; rapl_mmio_priv.limits[domain] = rapl_regs->limits[domain]; } rapl_mmio_priv.type = RAPL_IF_MMIO; - rapl_mmio_priv.reg_unit = (u64)proc_priv->mmio_base + rapl_regs->reg_unit; + rapl_mmio_priv.reg_unit.mmio = proc_priv->mmio_base + rapl_regs->reg_unit; rapl_mmio_priv.read_raw = rapl_mmio_read_raw; rapl_mmio_priv.write_raw = rapl_mmio_write_raw; diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index e6936cb25047..33f21bd85dbf 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -100,10 +100,16 @@ struct rapl_package; #define RAPL_DOMAIN_NAME_LENGTH 16 +union rapl_reg { + void __iomem *mmio; + u32 msr; + u64 val; +}; + struct rapl_domain { char name[RAPL_DOMAIN_NAME_LENGTH]; enum rapl_domain_type id; - u64 regs[RAPL_DOMAIN_REG_MAX]; + union rapl_reg regs[RAPL_DOMAIN_REG_MAX]; struct powercap_zone power_zone; struct rapl_domain_data rdd; struct rapl_power_limit rpl[NR_POWER_LIMITS]; @@ -116,7 +122,7 @@ struct rapl_domain { }; struct reg_action { - u64 reg; + union rapl_reg reg; u64 mask; u64 value; int err; @@ -143,8 +149,8 @@ struct rapl_if_priv { enum rapl_if_type type; struct powercap_control_type *control_type; enum cpuhp_state pcap_rapl_online; - u64 reg_unit; - u64 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; + union rapl_reg reg_unit; + union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; int limits[RAPL_DOMAIN_MAX]; int (*read_raw)(int id, struct reg_action *ra); int (*write_raw)(int id, struct reg_action *ra); -- cgit v1.2.3 From 0a8589055936d8feb56477123a8373ac634018fa Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 24 Jul 2023 13:23:14 +0900 Subject: ata,scsi: do not issue START STOP UNIT on resume During system resume, ata_port_pm_resume() triggers ata EH to 1) Resume the controller 2) Reset and rescan the ports 3) Revalidate devices This EH execution is started asynchronously from ata_port_pm_resume(), which means that when sd_resume() is executed, none or only part of the above processing may have been executed. However, sd_resume() issues a START STOP UNIT to wake up the drive from sleep mode. This command is translated to ATA with ata_scsi_start_stop_xlat() and issued to the device. However, depending on the state of execution of the EH process and revalidation triggerred by ata_port_pm_resume(), two things may happen: 1) The START STOP UNIT fails if it is received before the controller has been reenabled at the beginning of the EH execution. This is visible with error messages like: ata10.00: device reported invalid CHS sector 0 sd 9:0:0:0: [sdc] Start/Stop Unit failed: Result: hostbyte=DID_OK driverbyte=DRIVER_OK sd 9:0:0:0: [sdc] Sense Key : Illegal Request [current] sd 9:0:0:0: [sdc] Add. Sense: Unaligned write command sd 9:0:0:0: PM: dpm_run_callback(): scsi_bus_resume+0x0/0x90 returns -5 sd 9:0:0:0: PM: failed to resume async: error -5 2) The START STOP UNIT command is received while the EH process is on-going, which mean that it is stopped and must wait for its completion, at which point the command is rather useless as the drive is already fully spun up already. This case results also in a significant delay in sd_resume() which is observable by users as the entire system resume completion is delayed. Given that ATA devices will be woken up by libata activity on resume, sd_resume() has no need to issue a START STOP UNIT command, which solves the above mentioned problems. Do not issue this command by introducing the new scsi_device flag no_start_on_resume and setting this flag to 1 in ata_scsi_dev_config(). sd_resume() is modified to issue a START STOP UNIT command only if this flag is not set. Reported-by: Paul Ausbeck Closes: https://bugzilla.kernel.org/show_bug.cgi?id=215880 Fixes: a19a93e4c6a9 ("scsi: core: pm: Rely on the device driver core for async power management") Signed-off-by: Damien Le Moal Tested-by: Tanner Watkins Tested-by: Paul Ausbeck Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche --- drivers/ata/libata-scsi.c | 7 +++++++ drivers/scsi/sd.c | 9 ++++++--- include/scsi/scsi_device.h | 1 + 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 370d18aca71e..c6ece32de8e3 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1100,7 +1100,14 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev) } } else { sdev->sector_size = ata_id_logical_sector_size(dev->id); + /* + * Stop the drive on suspend but do not issue START STOP UNIT + * on resume as this is not necessary and may fail: the device + * will be woken up by ata_port_pm_resume() with a port reset + * and device revalidation. + */ sdev->manage_start_stop = 1; + sdev->no_start_on_resume = 1; } /* diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 68b12afa0721..3c668cfb146d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3876,7 +3876,7 @@ static int sd_suspend_runtime(struct device *dev) static int sd_resume(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); - int ret; + int ret = 0; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; @@ -3884,8 +3884,11 @@ static int sd_resume(struct device *dev) if (!sdkp->device->manage_start_stop) return 0; - sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); - ret = sd_start_stop_device(sdkp, 1); + if (!sdkp->device->no_start_on_resume) { + sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); + ret = sd_start_stop_device(sdkp, 1); + } + if (!ret) opal_unlock_from_suspend(sdkp->opal_dev); return ret; diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 75b2235b99e2..b9230b6add04 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -194,6 +194,7 @@ struct scsi_device { unsigned no_start_on_add:1; /* do not issue start on add */ unsigned allow_restart:1; /* issue START_UNIT in error handler */ unsigned manage_start_stop:1; /* Let HLD (sd) manage start/stop */ + unsigned no_start_on_resume:1; /* Do not issue START_STOP_UNIT on resume */ unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */ unsigned no_uld_attach:1; /* disable connecting to upper level drivers */ unsigned select_no_atn:1; -- cgit v1.2.3 From 0756384fb1bd38adb2ebcfd1307422f433a1d772 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 31 Jul 2023 16:02:08 -0400 Subject: vxlan: Fix nexthop hash size The nexthop code expects a 31 bit hash, such as what is returned by fib_multipath_hash() and rt6_multipath_hash(). Passing the 32 bit hash returned by skb_get_hash() can lead to problems related to the fact that 'int hash' is a negative number when the MSB is set. In the case of hash threshold nexthop groups, nexthop_select_path_hthr() will disproportionately select the first nexthop group entry. In the case of resilient nexthop groups, nexthop_select_path_res() may do an out of bounds access in nh_buckets[], for example: hash = -912054133 num_nh_buckets = 2 bucket_index = 65535 which leads to the following panic: BUG: unable to handle page fault for address: ffffc900025910c8 PGD 100000067 P4D 100000067 PUD 10026b067 PMD 0 Oops: 0002 [#1] PREEMPT SMP KASAN NOPTI CPU: 4 PID: 856 Comm: kworker/4:3 Not tainted 6.5.0-rc2+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 Workqueue: ipv6_addrconf addrconf_dad_work RIP: 0010:nexthop_select_path+0x197/0xbf0 Code: c1 e4 05 be 08 00 00 00 4c 8b 35 a4 14 7e 01 4e 8d 6c 25 00 4a 8d 7c 25 08 48 01 dd e8 c2 25 15 ff 49 8d 7d 08 e8 39 13 15 ff <4d> 89 75 08 48 89 ef e8 7d 12 15 ff 48 8b 5d 00 e8 14 55 2f 00 85 RSP: 0018:ffff88810c36f260 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000002000c0 RCX: ffffffffaf02dd77 RDX: dffffc0000000000 RSI: 0000000000000008 RDI: ffffc900025910c8 RBP: ffffc900025910c0 R08: 0000000000000001 R09: fffff520004b2219 R10: ffffc900025910cf R11: 31392d2068736168 R12: 00000000002000c0 R13: ffffc900025910c0 R14: 00000000fffef608 R15: ffff88811840e900 FS: 0000000000000000(0000) GS:ffff8881f7000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc900025910c8 CR3: 0000000129d00000 CR4: 0000000000750ee0 PKRU: 55555554 Call Trace: ? __die+0x23/0x70 ? page_fault_oops+0x1ee/0x5c0 ? __pfx_is_prefetch.constprop.0+0x10/0x10 ? __pfx_page_fault_oops+0x10/0x10 ? search_bpf_extables+0xfe/0x1c0 ? fixup_exception+0x3b/0x470 ? exc_page_fault+0xf6/0x110 ? asm_exc_page_fault+0x26/0x30 ? nexthop_select_path+0x197/0xbf0 ? nexthop_select_path+0x197/0xbf0 ? lock_is_held_type+0xe7/0x140 vxlan_xmit+0x5b2/0x2340 ? __lock_acquire+0x92b/0x3370 ? __pfx_vxlan_xmit+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 ? __pfx_register_lock_class+0x10/0x10 ? skb_network_protocol+0xce/0x2d0 ? dev_hard_start_xmit+0xca/0x350 ? __pfx_vxlan_xmit+0x10/0x10 dev_hard_start_xmit+0xca/0x350 __dev_queue_xmit+0x513/0x1e20 ? __pfx___dev_queue_xmit+0x10/0x10 ? __pfx_lock_release+0x10/0x10 ? mark_held_locks+0x44/0x90 ? skb_push+0x4c/0x80 ? eth_header+0x81/0xe0 ? __pfx_eth_header+0x10/0x10 ? neigh_resolve_output+0x215/0x310 ? ip6_finish_output2+0x2ba/0xc90 ip6_finish_output2+0x2ba/0xc90 ? lock_release+0x236/0x3e0 ? ip6_mtu+0xbb/0x240 ? __pfx_ip6_finish_output2+0x10/0x10 ? find_held_lock+0x83/0xa0 ? lock_is_held_type+0xe7/0x140 ip6_finish_output+0x1ee/0x780 ip6_output+0x138/0x460 ? __pfx_ip6_output+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 ? __pfx_ip6_finish_output+0x10/0x10 NF_HOOK.constprop.0+0xc0/0x420 ? __pfx_NF_HOOK.constprop.0+0x10/0x10 ? ndisc_send_skb+0x2c0/0x960 ? __pfx_lock_release+0x10/0x10 ? __local_bh_enable_ip+0x93/0x110 ? lock_is_held_type+0xe7/0x140 ndisc_send_skb+0x4be/0x960 ? __pfx_ndisc_send_skb+0x10/0x10 ? mark_held_locks+0x65/0x90 ? find_held_lock+0x83/0xa0 ndisc_send_ns+0xb0/0x110 ? __pfx_ndisc_send_ns+0x10/0x10 addrconf_dad_work+0x631/0x8e0 ? lock_acquire+0x180/0x3f0 ? __pfx_addrconf_dad_work+0x10/0x10 ? mark_held_locks+0x24/0x90 process_one_work+0x582/0x9c0 ? __pfx_process_one_work+0x10/0x10 ? __pfx_do_raw_spin_lock+0x10/0x10 ? mark_held_locks+0x24/0x90 worker_thread+0x93/0x630 ? __kthread_parkme+0xdc/0x100 ? __pfx_worker_thread+0x10/0x10 kthread+0x1a5/0x1e0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x60 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 RIP: 0000:0x0 Code: Unable to access opcode bytes at 0xffffffffffffffd6. RSP: 0000:0000000000000000 EFLAGS: 00000000 ORIG_RAX: 0000000000000000 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: CR2: ffffc900025910c8 ---[ end trace 0000000000000000 ]--- RIP: 0010:nexthop_select_path+0x197/0xbf0 Code: c1 e4 05 be 08 00 00 00 4c 8b 35 a4 14 7e 01 4e 8d 6c 25 00 4a 8d 7c 25 08 48 01 dd e8 c2 25 15 ff 49 8d 7d 08 e8 39 13 15 ff <4d> 89 75 08 48 89 ef e8 7d 12 15 ff 48 8b 5d 00 e8 14 55 2f 00 85 RSP: 0018:ffff88810c36f260 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000002000c0 RCX: ffffffffaf02dd77 RDX: dffffc0000000000 RSI: 0000000000000008 RDI: ffffc900025910c8 RBP: ffffc900025910c0 R08: 0000000000000001 R09: fffff520004b2219 R10: ffffc900025910cf R11: 31392d2068736168 R12: 00000000002000c0 R13: ffffc900025910c0 R14: 00000000fffef608 R15: ffff88811840e900 FS: 0000000000000000(0000) GS:ffff8881f7000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffffffffd6 CR3: 0000000129d00000 CR4: 0000000000750ee0 PKRU: 55555554 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: 0x2ca00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fix this problem by ensuring the MSB of hash is 0 using a right shift - the same approach used in fib_multipath_hash() and rt6_multipath_hash(). Fixes: 1274e1cc4226 ("vxlan: ecmp support for mac fdb entries") Signed-off-by: Benjamin Poirier Reviewed-by: Ido Schimmel Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/vxlan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 1648240c9668..6a9f8a5f387c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -556,12 +556,12 @@ static inline void vxlan_flag_attr_error(int attrtype, } static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, - int hash, + u32 hash, struct vxlan_rdst *rdst) { struct fib_nh_common *nhc; - nhc = nexthop_path_fdb_result(nh, hash); + nhc = nexthop_path_fdb_result(nh, hash >> 1); if (unlikely(!nhc)) return false; -- cgit v1.2.3 From 79e8328e5acbe691bbde029a52c89d70dcbc22f3 Mon Sep 17 00:00:00 2001 From: "ndesaulniers@google.com" Date: Tue, 1 Aug 2023 15:22:17 -0700 Subject: word-at-a-time: use the same return type for has_zero regardless of endianness Compiling big-endian targets with Clang produces the diagnostic: fs/namei.c:2173:13: warning: use of bitwise '|' with boolean operands [-Wbitwise-instead-of-logical] } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); ~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ || fs/namei.c:2173:13: note: cast one or both operands to int to silence this warning It appears that when has_zero was introduced, two definitions were produced with different signatures (in particular different return types). Looking at the usage in hash_name() in fs/namei.c, I suspect that has_zero() is meant to be invoked twice per while loop iteration; using logical-or would not update `bdata` when `a` did not have zeros. So I think it's preferred to always return an unsigned long rather than a bool than update the while loop in hash_name() to use a logical-or rather than bitwise-or. [ Also changed powerpc version to do the same - Linus ] Link: https://github.com/ClangBuiltLinux/linux/issues/1832 Link: https://lore.kernel.org/lkml/20230801-bitwise-v1-1-799bec468dc4@google.com/ Fixes: 36126f8f2ed8 ("word-at-a-time: make the interfaces truly generic") Debugged-by: Nathan Chancellor Signed-off-by: Nick Desaulniers Acked-by: Heiko Carstens Cc: Arnd Bergmann Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/word-at-a-time.h | 2 +- include/asm-generic/word-at-a-time.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/asm/word-at-a-time.h b/arch/powerpc/include/asm/word-at-a-time.h index 46c31fb8748d..30a12d208687 100644 --- a/arch/powerpc/include/asm/word-at-a-time.h +++ b/arch/powerpc/include/asm/word-at-a-time.h @@ -34,7 +34,7 @@ static inline long find_zero(unsigned long mask) return leading_zero_bits >> 3; } -static inline bool has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c) +static inline unsigned long has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c) { unsigned long rhs = val | c->low_bits; *data = rhs; diff --git a/include/asm-generic/word-at-a-time.h b/include/asm-generic/word-at-a-time.h index 20c93f08c993..95a1d214108a 100644 --- a/include/asm-generic/word-at-a-time.h +++ b/include/asm-generic/word-at-a-time.h @@ -38,7 +38,7 @@ static inline long find_zero(unsigned long mask) return (mask >> 8) ? byte : byte + 1; } -static inline bool has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c) +static inline unsigned long has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c) { unsigned long rhs = val | c->low_bits; *data = rhs; -- cgit v1.2.3 From 6ad0f2f91ad14ba0a3c2990c054fd6fbe8100429 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 25 Jul 2023 22:21:08 +0800 Subject: Drivers: hv: vmbus: Remove unused extern declaration vmbus_ontimer() Since commit 30fbee49b071 ("Staging: hv: vmbus: Get rid of the unused function vmbus_ontimer()") this is not used anymore, so can remove it. Signed-off-by: YueHaibing Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20230725142108.27280-1-yuehaibing@huawei.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index bfbc37ce223b..3ac3974b3c78 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1239,9 +1239,6 @@ extern int vmbus_recvpacket_raw(struct vmbus_channel *channel, u32 *buffer_actual_len, u64 *requestid); - -extern void vmbus_ontimer(unsigned long data); - /* Base driver object */ struct hv_driver { const char *name; -- cgit v1.2.3 From 3e3271549670783be20e233a2b78a87a0b04c715 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Aug 2023 12:25:01 -0700 Subject: vfs: get rid of old '->iterate' directory operation All users now just use '->iterate_shared()', which only takes the directory inode lock for reading. Filesystems that never got convered to shared mode now instead use a wrapper that drops the lock, re-takes it in write mode, calls the old function, and then downgrades the lock back to read mode. This way the VFS layer and other callers no longer need to care about filesystems that never got converted to the modern era. The filesystems that use the new wrapper are ceph, coda, exfat, jfs, ntfs, ocfs2, overlayfs, and vboxsf. Honestly, several of them look like they really could just iterate their directories in shared mode and skip the wrapper entirely, but the point of this change is to not change semantics or fix filesystems that haven't been fixed in the last 7+ years, but to finally get rid of the dual iterators. Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 5 ++- Documentation/filesystems/porting.rst | 25 ++++++------- fs/ceph/dir.c | 5 +-- fs/coda/dir.c | 20 ++++------- fs/exfat/dir.c | 3 +- fs/exportfs/expfs.c | 2 +- fs/jfs/namei.c | 3 +- fs/ntfs/dir.c | 3 +- fs/ocfs2/file.c | 5 +-- fs/overlayfs/readdir.c | 3 +- fs/readdir.c | 68 ++++++++++++++++++++++++++--------- fs/vboxsf/dir.c | 3 +- include/linux/fs.h | 8 ++++- 13 files changed, 95 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index ed148919e11a..0ca479dbb1cd 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -551,9 +551,8 @@ mutex or just to use i_size_read() instead. Note: this does not protect the file->f_pos against concurrent modifications since this is something the userspace has to take care about. -->iterate() is called with i_rwsem exclusive. - -->iterate_shared() is called with i_rwsem at least shared. +->iterate_shared() is called with i_rwsem held for reading, and with the +file f_pos_lock held exclusively ->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags. Most instances call fasync_helper(), which does that maintenance, so it's diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index d2d684ae7798..0f5da78ef4f9 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -537,7 +537,7 @@ vfs_readdir() is gone; switch to iterate_dir() instead **mandatory** -->readdir() is gone now; switch to ->iterate() +->readdir() is gone now; switch to ->iterate_shared() **mandatory** @@ -693,24 +693,19 @@ parallel now. --- -**recommended** +**mandatory** -->iterate_shared() is added; it's a parallel variant of ->iterate(). +->iterate_shared() is added. Exclusion on struct file level is still provided (as well as that between it and lseek on the same struct file), but if your directory has been opened several times, you can get these called in parallel. Exclusion between that method and all directory-modifying ones is still provided, of course. -Often enough ->iterate() can serve as ->iterate_shared() without any -changes - it is a read-only operation, after all. If you have any -per-inode or per-dentry in-core data structures modified by ->iterate(), -you might need something to serialize the access to them. If you -do dcache pre-seeding, you'll need to switch to d_alloc_parallel() for -that; look for in-tree examples. - -Old method is only used if the new one is absent; eventually it will -be removed. Switch while you still can; the old one won't stay. +If you have any per-inode or per-dentry in-core data structures modified +by ->iterate_shared(), you might need something to serialize the access +to them. If you do dcache pre-seeding, you'll need to switch to +d_alloc_parallel() for that; look for in-tree examples. --- @@ -930,9 +925,9 @@ should be done by looking at FMODE_LSEEK in file->f_mode. filldir_t (readdir callbacks) calling conventions have changed. Instead of returning 0 or -E... it returns bool now. false means "no more" (as -E... used to) and true - "keep going" (as 0 in old calling conventions). Rationale: -callers never looked at specific -E... values anyway. ->iterate() and -->iterate_shared() instance require no changes at all, all filldir_t ones in -the tree converted. +callers never looked at specific -E... values anyway. -> iterate_shared() +instances require no changes at all, all filldir_t ones in the tree +converted. --- diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4a2b39d9a61a..bdcffb04513f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -2019,9 +2019,10 @@ unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn) } } +WRAP_DIR_ITER(ceph_readdir) // FIXME! const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, - .iterate = ceph_readdir, + .iterate_shared = shared_ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, @@ -2033,7 +2034,7 @@ const struct file_operations ceph_dir_fops = { }; const struct file_operations ceph_snapdir_fops = { - .iterate = ceph_readdir, + .iterate_shared = shared_ceph_readdir, .llseek = ceph_dir_llseek, .open = ceph_open, .release = ceph_release, diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 8450b1bd354b..1b960de2bf39 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -429,21 +429,14 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) cfi = coda_ftoc(coda_file); host_file = cfi->cfi_container; - if (host_file->f_op->iterate || host_file->f_op->iterate_shared) { + if (host_file->f_op->iterate_shared) { struct inode *host_inode = file_inode(host_file); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { - if (host_file->f_op->iterate_shared) { - inode_lock_shared(host_inode); - ret = host_file->f_op->iterate_shared(host_file, ctx); - file_accessed(host_file); - inode_unlock_shared(host_inode); - } else { - inode_lock(host_inode); - ret = host_file->f_op->iterate(host_file, ctx); - file_accessed(host_file); - inode_unlock(host_inode); - } + inode_lock_shared(host_inode); + ret = host_file->f_op->iterate_shared(host_file, ctx); + file_accessed(host_file); + inode_unlock_shared(host_inode); } return ret; } @@ -585,10 +578,11 @@ const struct inode_operations coda_dir_inode_operations = { .setattr = coda_setattr, }; +WRAP_DIR_ITER(coda_readdir) // FIXME! const struct file_operations coda_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = coda_readdir, + .iterate_shared = shared_coda_readdir, .open = coda_open, .release = coda_release, .fsync = coda_fsync, diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 598081d0d059..e1586bba6d86 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -306,10 +306,11 @@ out: return err; } +WRAP_DIR_ITER(exfat_iterate) // FIXME! const struct file_operations exfat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = exfat_iterate, + .iterate_shared = shared_exfat_iterate, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = exfat_compat_ioctl, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 40e624cf7e92..d1dbe47c7975 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -315,7 +315,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child) goto out; error = -EINVAL; - if (!file->f_op->iterate && !file->f_op->iterate_shared) + if (!file->f_op->iterate_shared) goto out_close; buffer.sequence = 0; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9b030297aa64..e98ddb2b1cf2 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1535,9 +1535,10 @@ const struct inode_operations jfs_dir_inode_operations = { #endif }; +WRAP_DIR_ITER(jfs_readdir) // FIXME! const struct file_operations jfs_dir_operations = { .read = generic_read_dir, - .iterate = jfs_readdir, + .iterate_shared = shared_jfs_readdir, .fsync = jfs_fsync, .unlocked_ioctl = jfs_ioctl, .compat_ioctl = compat_ptr_ioctl, diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 518c3a21a556..4596c90e7b7c 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1525,10 +1525,11 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, #endif /* NTFS_RW */ +WRAP_DIR_ITER(ntfs_readdir) // FIXME! const struct file_operations ntfs_dir_ops = { .llseek = generic_file_llseek, /* Seek inside directory. */ .read = generic_read_dir, /* Return -EISDIR. */ - .iterate = ntfs_readdir, /* Read directory contents. */ + .iterate_shared = shared_ntfs_readdir, /* Read directory contents. */ #ifdef NTFS_RW .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ #endif /* NTFS_RW */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 91a194596552..bf2c17ea96a0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2793,10 +2793,11 @@ const struct file_operations ocfs2_fops = { .remap_file_range = ocfs2_remap_file_range, }; +WRAP_DIR_ITER(ocfs2_readdir) // FIXME! const struct file_operations ocfs2_dops = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, @@ -2842,7 +2843,7 @@ const struct file_operations ocfs2_fops_no_plocks = { const struct file_operations ocfs2_dops_no_plocks = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = ocfs2_readdir, + .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, .release = ocfs2_dir_release, .open = ocfs2_dir_open, diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index ee5c4736480f..de39e067ae65 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -954,10 +954,11 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return 0; } +WRAP_DIR_ITER(ovl_iterate) // FIXME! const struct file_operations ovl_dir_operations = { .read = generic_read_dir, .open = ovl_dir_open, - .iterate = ovl_iterate, + .iterate_shared = shared_ovl_iterate, .llseek = ovl_dir_llseek, .fsync = ovl_dir_fsync, .release = ovl_dir_release, diff --git a/fs/readdir.c b/fs/readdir.c index b264ce60114d..c8c46e294431 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -24,6 +24,53 @@ #include +/* + * Some filesystems were never converted to '->iterate_shared()' + * and their directory iterators want the inode lock held for + * writing. This wrapper allows for converting from the shared + * semantics to the exclusive inode use. + */ +int wrap_directory_iterator(struct file *file, + struct dir_context *ctx, + int (*iter)(struct file *, struct dir_context *)) +{ + struct inode *inode = file_inode(file); + int ret; + + /* + * We'd love to have an 'inode_upgrade_trylock()' operation, + * see the comment in mmap_upgrade_trylock() in mm/memory.c. + * + * But considering this is for "filesystems that never got + * converted", it really doesn't matter. + * + * Also note that since we have to return with the lock held + * for reading, we can't use the "killable()" locking here, + * since we do need to get the lock even if we're dying. + * + * We could do the write part killably and then get the read + * lock unconditionally if it mattered, but see above on why + * this does the very simplistic conversion. + */ + up_read(&inode->i_rwsem); + down_write(&inode->i_rwsem); + + /* + * Since we dropped the inode lock, we should do the + * DEADDIR test again. See 'iterate_dir()' below. + * + * Note that we don't need to re-do the f_pos games, + * since the file must be locked wrt f_pos anyway. + */ + ret = -ENOENT; + if (!IS_DEADDIR(inode)) + ret = iter(file, ctx); + + downgrade_write(&inode->i_rwsem); + return ret; +} +EXPORT_SYMBOL(wrap_directory_iterator); + /* * Note the "unsafe_put_user() semantics: we goto a * label for errors. @@ -40,39 +87,28 @@ int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); - bool shared = false; int res = -ENOTDIR; - if (file->f_op->iterate_shared) - shared = true; - else if (!file->f_op->iterate) + + if (!file->f_op->iterate_shared) goto out; res = security_file_permission(file, MAY_READ); if (res) goto out; - if (shared) - res = down_read_killable(&inode->i_rwsem); - else - res = down_write_killable(&inode->i_rwsem); + res = down_read_killable(&inode->i_rwsem); if (res) goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { ctx->pos = file->f_pos; - if (shared) - res = file->f_op->iterate_shared(file, ctx); - else - res = file->f_op->iterate(file, ctx); + res = file->f_op->iterate_shared(file, ctx); file->f_pos = ctx->pos; fsnotify_access(file); file_accessed(file); } - if (shared) - inode_unlock_shared(inode); - else - inode_unlock(inode); + inode_unlock_shared(inode); out: return res; } diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 075f15c43c78..5f1a14d5b927 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -179,9 +179,10 @@ static int vboxsf_dir_iterate(struct file *dir, struct dir_context *ctx) return 0; } +WRAP_DIR_ITER(vboxsf_dir_iterate) // FIXME! const struct file_operations vboxsf_dir_fops = { .open = vboxsf_dir_open, - .iterate = vboxsf_dir_iterate, + .iterate_shared = shared_vboxsf_dir_iterate, .release = vboxsf_dir_release, .read = generic_read_dir, .llseek = generic_file_llseek, diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6..562f2623c9c9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1780,7 +1780,6 @@ struct file_operations { ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags); - int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); @@ -1817,6 +1816,13 @@ struct file_operations { unsigned int poll_flags); } __randomize_layout; +/* Wrap a directory iterator that needs exclusive inode access */ +int wrap_directory_iterator(struct file *, struct dir_context *, + int (*) (struct file *, struct dir_context *)); +#define WRAP_DIR_ITER(x) \ + static int shared_##x(struct file *file , struct dir_context *ctx) \ + { return wrap_directory_iterator(file, ctx, x); } + struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); -- cgit v1.2.3 From 554b841d470338a3b1d6335b14ee1cd0c8f5d754 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 2 Aug 2023 07:25:33 -0500 Subject: tpm: Disable RNG for all AMD fTPMs The TPM RNG functionality is not necessary for entropy when the CPU already supports the RDRAND instruction. The TPM RNG functionality was previously disabled on a subset of AMD fTPM series, but reports continue to show problems on some systems causing stutter root caused to TPM RNG functionality. Expand disabling TPM RNG use for all AMD fTPMs whether they have versions that claim to have fixed or not. To accomplish this, move the detection into part of the TPM CRB registration and add a flag indicating that the TPM should opt-out of registration to hwrng. Cc: stable@vger.kernel.org # 6.1.y+ Fixes: b006c439d58d ("hwrng: core - start hwrng kthread also for untrusted sources") Fixes: f1324bbc4011 ("tpm: disable hwrng for fTPM on some AMD designs") Reported-by: daniil.stas@posteo.net Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217719 Reported-by: bitlord0xff@gmail.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217212 Signed-off-by: Mario Limonciello Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-chip.c | 68 ++------------------------------------------- drivers/char/tpm/tpm_crb.c | 30 ++++++++++++++++++++ include/linux/tpm.h | 1 + 3 files changed, 33 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index cf5499e51999..e904aae9771b 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -510,70 +510,6 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) return 0; } -/* - * Some AMD fTPM versions may cause stutter - * https://www.amd.com/en/support/kb/faq/pa-410 - * - * Fixes are available in two series of fTPM firmware: - * 6.x.y.z series: 6.0.18.6 + - * 3.x.y.z series: 3.57.y.5 + - */ -#ifdef CONFIG_X86 -static bool tpm_amd_is_rng_defective(struct tpm_chip *chip) -{ - u32 val1, val2; - u64 version; - int ret; - - if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) - return false; - - ret = tpm_request_locality(chip); - if (ret) - return false; - - ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL); - if (ret) - goto release; - if (val1 != 0x414D4400U /* AMD */) { - ret = -ENODEV; - goto release; - } - ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_1, &val1, NULL); - if (ret) - goto release; - ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL); - -release: - tpm_relinquish_locality(chip); - - if (ret) - return false; - - version = ((u64)val1 << 32) | val2; - if ((version >> 48) == 6) { - if (version >= 0x0006000000180006ULL) - return false; - } else if ((version >> 48) == 3) { - if (version >= 0x0003005700000005ULL) - return false; - } else { - return false; - } - - dev_warn(&chip->dev, - "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n", - version); - - return true; -} -#else -static inline bool tpm_amd_is_rng_defective(struct tpm_chip *chip) -{ - return false; -} -#endif /* CONFIG_X86 */ - static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) { struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng); @@ -588,7 +524,7 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) static int tpm_add_hwrng(struct tpm_chip *chip) { if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) || - tpm_amd_is_rng_defective(chip)) + chip->flags & TPM_CHIP_FLAG_HWRNG_DISABLED) return 0; snprintf(chip->hwrng_name, sizeof(chip->hwrng_name), @@ -719,7 +655,7 @@ void tpm_chip_unregister(struct tpm_chip *chip) { tpm_del_legacy_sysfs(chip); if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip) && - !tpm_amd_is_rng_defective(chip)) + !(chip->flags & TPM_CHIP_FLAG_HWRNG_DISABLED)) hwrng_unregister(&chip->hwrng); tpm_bios_log_teardown(chip); if (chip->flags & TPM_CHIP_FLAG_TPM2 && !tpm_is_firmware_upgrade(chip)) diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index 1a5d09b18513..9eb1a1859012 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -463,6 +463,28 @@ static bool crb_req_canceled(struct tpm_chip *chip, u8 status) return (cancel & CRB_CANCEL_INVOKE) == CRB_CANCEL_INVOKE; } +static int crb_check_flags(struct tpm_chip *chip) +{ + u32 val; + int ret; + + ret = crb_request_locality(chip, 0); + if (ret) + return ret; + + ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val, NULL); + if (ret) + goto release; + + if (val == 0x414D4400U /* AMD */) + chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED; + +release: + crb_relinquish_locality(chip, 0); + + return ret; +} + static const struct tpm_class_ops tpm_crb = { .flags = TPM_OPS_AUTO_STARTUP, .status = crb_status, @@ -800,6 +822,14 @@ static int crb_acpi_add(struct acpi_device *device) chip->acpi_dev_handle = device->handle; chip->flags = TPM_CHIP_FLAG_TPM2; + rc = tpm_chip_bootstrap(chip); + if (rc) + goto out; + + rc = crb_check_flags(chip); + if (rc) + goto out; + rc = tpm_chip_register(chip); out: diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 6a1e8f157255..4ee9d13749ad 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -283,6 +283,7 @@ enum tpm_chip_flags { TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED = BIT(6), TPM_CHIP_FLAG_FIRMWARE_UPGRADE = BIT(7), TPM_CHIP_FLAG_SUSPENDED = BIT(8), + TPM_CHIP_FLAG_HWRNG_DISABLED = BIT(9), }; #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev) -- cgit v1.2.3 From 5fb9a9fb71a33be61d7d8e8ba4597bfb18d604d0 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 22 Jun 2023 18:59:19 +0200 Subject: wifi: cfg80211: fix sband iftype data lookup for AP_VLAN AP_VLAN interfaces are virtual, so doesn't really exist as a type for capabilities. When passed in as a type, AP is the one that's really intended. Fixes: c4cbaf7973a7 ("cfg80211: Add support for HE") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20230622165919.46841-1-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7c7d03aa9d06..d6fa7c8767ad 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -562,6 +562,9 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) return NULL; + if (iftype == NL80211_IFTYPE_AP_VLAN) + iftype = NL80211_IFTYPE_AP; + for (i = 0; i < sband->n_iftype_data; i++) { const struct ieee80211_sband_iftype_data *data = &sband->iftype_data[i]; -- cgit v1.2.3 From d74f714896fd6268882789ba28e52c9145951403 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Aug 2023 11:03:28 -0600 Subject: block: get rid of unused plug->nowait flag This was introduced to add a plug based way of signaling nowait issues, but we have since moved on from that. Kill the old dead code, nobody is setting it anymore. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ------ include/linux/blkdev.h | 1 - 2 files changed, 7 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 90de50082146..9866468c72a2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -722,14 +722,9 @@ void submit_bio_noacct(struct bio *bio) struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; - struct blk_plug *plug; might_sleep(); - plug = blk_mq_plug(bio); - if (plug && plug->nowait) - bio->bi_opf |= REQ_NOWAIT; - /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. @@ -1059,7 +1054,6 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) plug->rq_count = 0; plug->multiple_queues = false; plug->has_elevator = false; - plug->nowait = false; INIT_LIST_HEAD(&plug->cb_list); /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ed44a997f629..87d94be7825a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -969,7 +969,6 @@ struct blk_plug { bool multiple_queues; bool has_elevator; - bool nowait; struct list_head cb_list; /* md requires an unplug callback */ }; -- cgit v1.2.3 From 8a70ed9520c5fafaac91053cacdd44625c39e188 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Aug 2023 08:49:23 +0000 Subject: tcp: add missing family to tcp_set_ca_state() tracepoint Before this code is copied, add the missing family, as we did in commit 3dd344ea84e1 ("net: tracepoint: exposing sk_family in all tcp:tracepoints") Fixes: 15fcdf6ae116 ("tcp: Add tracepoint for tcp_set_ca_state") Signed-off-by: Eric Dumazet Cc: Ping Gan Cc: Manjusaka Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230808084923.2239142-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index bf06db8d2046..7b1ddffa3dfc 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -381,6 +381,7 @@ TRACE_EVENT(tcp_cong_state_set, __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) + __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) @@ -396,6 +397,7 @@ TRACE_EVENT(tcp_cong_state_set, __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); + __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; @@ -409,7 +411,8 @@ TRACE_EVENT(tcp_cong_state_set, __entry->cong_state = ca_state; ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", + TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", + show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, -- cgit v1.2.3 From 2bc057692599a5b3dc93d75a3dff34f72576355d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Aug 2023 11:06:17 -0600 Subject: block: don't make REQ_POLLED imply REQ_NOWAIT Normally these two flags do go together, as the issuer of polled IO generally cannot wait for resources that will get freed as part of IO completion. This is because that very task is the one that will complete the request and free those resources, hence that would introduce a deadlock. But it is possible to have someone else issue the polled IO, eg via io_uring if the request is punted to io-wq. For that case, it's fine to have the task block on IO submission, as it is not the same task that will be completing the IO. It's completely up to the caller to ask for both polled and nowait IO separately! If we don't allow polled IO where IOCB_NOWAIT isn't set in the kiocb, then we can run into repeated -EAGAIN submissions and not make any progress. Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/fops.c | 7 ++++--- include/linux/bio.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/block/fops.c b/block/fops.c index a286bf3325c5..838ffada5341 100644 --- a/block/fops.c +++ b/block/fops.c @@ -358,13 +358,14 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; + if (iocb->ki_flags & IOCB_HIPRI) { - bio->bi_opf |= REQ_POLLED | REQ_NOWAIT; + bio->bi_opf |= REQ_POLLED; submit_bio(bio); WRITE_ONCE(iocb->private, bio); } else { - if (iocb->ki_flags & IOCB_NOWAIT) - bio->bi_opf |= REQ_NOWAIT; submit_bio(bio); } return -EIOCBQUEUED; diff --git a/include/linux/bio.h b/include/linux/bio.h index c4f5b5228105..11984ed29cb8 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -791,7 +791,7 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) { bio->bi_opf |= REQ_POLLED; - if (!is_sync_kiocb(kiocb)) + if (kiocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; } -- cgit v1.2.3 From 809e4dc71a0f2b8d2836035d98603694fff11d5d Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 4 Aug 2023 03:37:38 -0400 Subject: bpf, sockmap: Fix bug that strp_done cannot be called strp_done is only called when psock->progs.stream_parser is not NULL, but stream_parser was set to NULL by sk_psock_stop_strp(), called by sk_psock_drop() earlier. So, strp_done can never be called. Introduce SK_PSOCK_RX_ENABLED to mark whether there is strp on psock. Change the condition for calling strp_done from judging whether stream_parser is set to judging whether this flag is set. This flag is only set once when strp_init() succeeds, and will never be cleared later. Fixes: c0d95d3380ee ("bpf, sockmap: Re-evaluate proto ops when psock is removed from sockmap") Signed-off-by: Xu Kuohai Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20230804073740.194770-3-xukuohai@huaweicloud.com Signed-off-by: Martin KaFai Lau --- include/linux/skmsg.h | 1 + net/core/skmsg.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 054d7911bfc9..c1637515a8a4 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -62,6 +62,7 @@ struct sk_psock_progs { enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, + SK_PSOCK_RX_STRP_ENABLED, }; struct sk_psock_link { diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a29508e1ff35..ef1a2eb6520b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1120,13 +1120,19 @@ static void sk_psock_strp_data_ready(struct sock *sk) int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) { + int ret; + static const struct strp_callbacks cb = { .rcv_msg = sk_psock_strp_read, .read_sock_done = sk_psock_strp_read_done, .parse_msg = sk_psock_strp_parse, }; - return strp_init(&psock->strp, sk, &cb); + ret = strp_init(&psock->strp, sk, &cb); + if (!ret) + sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + + return ret; } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) @@ -1154,7 +1160,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.stream_parser) + if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) strp_done(&psock->strp); } #else -- cgit v1.2.3 From 5f68718b34a531a556f2f50300ead2862278da26 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Aug 2023 14:31:54 +0200 Subject: netfilter: nf_tables: GC transaction API to avoid race with control plane The set types rhashtable and rbtree use a GC worker to reclaim memory. From system work queue, in periodic intervals, a scan of the table is done. The major caveat here is that the nft transaction mutex is not held. This causes a race between control plane and GC when they attempt to delete the same element. We cannot grab the netlink mutex from the work queue, because the control plane has to wait for the GC work queue in case the set is to be removed, so we get following deadlock: cpu 1 cpu2 GC work transaction comes in , lock nft mutex `acquire nft mutex // BLOCKS transaction asks to remove the set set destruction calls cancel_work_sync() cancel_work_sync will now block forever, because it is waiting for the mutex the caller already owns. This patch adds a new API that deals with garbage collection in two steps: 1) Lockless GC of expired elements sets on the NFT_SET_ELEM_DEAD_BIT so they are not visible via lookup. Annotate current GC sequence in the GC transaction. Enqueue GC transaction work as soon as it is full. If ruleset is updated, then GC transaction is aborted and retried later. 2) GC work grabs the mutex. If GC sequence has changed then this GC transaction lost race with control plane, abort it as it contains stale references to objects and let GC try again later. If the ruleset is intact, then this GC transaction deactivates and removes the elements and it uses call_rcu() to destroy elements. Note that no elements are removed from GC lockless path, the _DEAD bit is set and pointers are collected. GC catchall does not remove the elements anymore too. There is a new set->dead flag that is set on to abort the GC transaction to deal with set->ops->destroy() path which removes the remaining elements in the set from commit_release, where no mutex is held. To deal with GC when mutex is held, which allows safe deactivate and removal, add sync GC API which releases the set element object via call_rcu(). This is used by rbtree and pipapo backends which also perform garbage collection from control plane path. Since element removal from sets can happen from control plane and element garbage collection/timeout, it is necessary to keep the set structure alive until all elements have been deactivated and destroyed. We cannot do a cancel_work_sync or flush_work in nft_set_destroy because its called with the transaction mutex held, but the aforementioned async work queue might be blocked on the very mutex that nft_set_destroy() callchain is sitting on. This gives us the choice of ABBA deadlock or UaF. To avoid both, add set->refs refcount_t member. The GC API can then increment the set refcount and release it once the elements have been free'd. Set backends are adapted to use the GC transaction API in a follow up patch entitled: ("netfilter: nf_tables: use gc transaction API in set backends") This is joint work with Florian Westphal. Fixes: cfed7e1b1f8e ("netfilter: nf_tables: add set garbage collection helpers") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 64 +++++++++- net/netfilter/nf_tables_api.c | 248 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 300 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 640441a2f926..7256e9c80477 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -512,6 +512,7 @@ struct nft_set_elem_expr { * * @list: table set list node * @bindings: list of set bindings + * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set @@ -541,6 +542,7 @@ struct nft_set_elem_expr { struct nft_set { struct list_head list; struct list_head bindings; + refcount_t refs; struct nft_table *table; possible_net_t net; char *name; @@ -562,7 +564,8 @@ struct nft_set { struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:14, + u16 flags:13, + dead:1, genmask:2; u8 klen; u8 dlen; @@ -1592,6 +1595,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) clear_bit(NFT_SET_ELEM_BUSY_BIT, word); } +#define NFT_SET_ELEM_DEAD_MASK (1 << 3) + +#if defined(__LITTLE_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT 3 +#elif defined(__BIG_ENDIAN_BITFIELD) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#else +#error +#endif + +static inline void nft_set_elem_dead(struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + set_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + +static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) +{ + unsigned long *word = (unsigned long *)ext; + + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + return test_bit(NFT_SET_ELEM_DEAD_BIT, word); +} + /** * struct nft_trans - nf_tables object update in transaction * @@ -1732,6 +1761,38 @@ struct nft_trans_flowtable { #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) +#define NFT_TRANS_GC_BATCHCOUNT 256 + +struct nft_trans_gc { + struct list_head list; + struct net *net; + struct nft_set *set; + u32 seq; + u8 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; +}; + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_destroy(struct nft_trans_gc *trans); + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + +void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq); + +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); @@ -1758,6 +1819,7 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; unsigned int base_seq; + unsigned int gc_seq; }; extern unsigned int nf_tables_net_id; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b4321869e5c6..c28bacb9479b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -31,7 +31,9 @@ static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static LIST_HEAD(nf_tables_destroy_list); +static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); +static DEFINE_SPINLOCK(nf_tables_gc_list_lock); enum { NFT_VALIDATE_SKIP = 0, @@ -120,6 +122,9 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s static void nf_tables_trans_destroy_work(struct work_struct *w); static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); +static void nft_trans_gc_work(struct work_struct *work); +static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -582,10 +587,6 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, return __nft_trans_set_add(ctx, msg_type, set, NULL); } -static void nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem); - static int nft_mapelem_deactivate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, @@ -5055,6 +5056,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&set->bindings); INIT_LIST_HEAD(&set->catchall_list); + refcount_set(&set->refs, 1); set->table = table; write_pnet(&set->net, net); set->ops = ops; @@ -5122,6 +5124,14 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, } } +static void nft_set_put(struct nft_set *set) +{ + if (refcount_dec_and_test(&set->refs)) { + kfree(set->name); + kvfree(set); + } +} + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) { int i; @@ -5134,8 +5144,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) set->ops->destroy(ctx, set); nft_set_catchall_destroy(ctx, set); - kfree(set->name); - kvfree(set); + nft_set_put(set); } static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, @@ -6278,7 +6287,8 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ext = nft_set_elem_ext(set, catchall->elem); if (nft_set_elem_active(ext, genmask) && - !nft_set_elem_expired(ext)) + !nft_set_elem_expired(ext) && + !nft_set_elem_is_dead(ext)) return ext; } @@ -6933,9 +6943,9 @@ static void nft_setelem_data_activate(const struct net *net, nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); } -static void nft_setelem_data_deactivate(const struct net *net, - const struct nft_set *set, - struct nft_set_elem *elem) +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem) { const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); @@ -9418,6 +9428,207 @@ void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, + struct nft_trans_gc *trans) +{ + void **priv = trans->priv; + unsigned int i; + + for (i = 0; i < trans->count; i++) { + struct nft_set_elem elem = { + .priv = priv[i], + }; + + nft_setelem_data_deactivate(ctx->net, trans->set, &elem); + nft_setelem_remove(ctx->net, trans->set, &elem); + } +} + +void nft_trans_gc_destroy(struct nft_trans_gc *trans) +{ + nft_set_put(trans->set); + put_net(trans->net); + kfree(trans); +} + +static void nft_trans_gc_trans_free(struct rcu_head *rcu) +{ + struct nft_set_elem elem = {}; + struct nft_trans_gc *trans; + struct nft_ctx ctx = {}; + unsigned int i; + + trans = container_of(rcu, struct nft_trans_gc, rcu); + ctx.net = read_pnet(&trans->set->net); + + for (i = 0; i < trans->count; i++) { + elem.priv = trans->priv[i]; + if (!nft_setelem_is_catchall(trans->set, &elem)) + atomic_dec(&trans->set->nelems); + + nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); + } + + nft_trans_gc_destroy(trans); +} + +static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) +{ + struct nftables_pernet *nft_net; + struct nft_ctx ctx = {}; + + nft_net = nft_pernet(trans->net); + + mutex_lock(&nft_net->commit_mutex); + + /* Check for race with transaction, otherwise this batch refers to + * stale objects that might not be there anymore. Skip transaction if + * set has been destroyed from control plane transaction in case gc + * worker loses race. + */ + if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { + mutex_unlock(&nft_net->commit_mutex); + return false; + } + + ctx.net = trans->net; + ctx.table = trans->set->table; + + nft_trans_gc_setelem_remove(&ctx, trans); + mutex_unlock(&nft_net->commit_mutex); + + return true; +} + +static void nft_trans_gc_work(struct work_struct *work) +{ + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + + spin_lock(&nf_tables_destroy_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); + spin_unlock(&nf_tables_destroy_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); + if (!nft_trans_gc_work_done(trans)) { + nft_trans_gc_destroy(trans); + continue; + } + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } +} + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp) +{ + struct net *net = read_pnet(&set->net); + struct nft_trans_gc *trans; + + trans = kzalloc(sizeof(*trans), gfp); + if (!trans) + return NULL; + + refcount_inc(&set->refs); + trans->set = set; + trans->net = get_net(net); + trans->seq = gc_seq; + + return trans; +} + +void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) +{ + trans->priv[trans->count++] = priv; +} + +static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) +{ + spin_lock(&nf_tables_gc_list_lock); + list_add_tail(&trans->list, &nf_tables_gc_list); + spin_unlock(&nf_tables_gc_list_lock); + + schedule_work(&trans_gc_work); +} + +static int nft_trans_gc_space(struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) +{ + if (nft_trans_gc_space(gc)) + return gc; + + nft_trans_gc_queue_work(gc); + + return nft_trans_gc_alloc(gc->set, gc_seq, gfp); +} + +void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +{ + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + nft_trans_gc_queue_work(trans); +} + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) +{ + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + + return nft_trans_gc_alloc(gc->set, 0, gfp); +} + +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +{ + WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); + + if (trans->count == 0) { + nft_trans_gc_destroy(trans); + return; + } + + call_rcu(&trans->rcu, nft_trans_gc_trans_free); +} + +struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq) +{ + struct nft_set_elem_catchall *catchall; + const struct nft_set *set = gc->set; + struct nft_set_ext *ext; + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + + if (!nft_set_elem_expired(ext)) + continue; + if (nft_set_elem_is_dead(ext)) + goto dead_elem; + + nft_set_elem_dead(ext); +dead_elem: + gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); + if (!gc) + return NULL; + + nft_trans_gc_elem_add(gc, catchall->elem); + } + + return gc; +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); @@ -9580,11 +9791,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans, *next; + unsigned int base_seq, gc_seq; LIST_HEAD(set_update_list); struct nft_trans_elem *te; struct nft_chain *chain; struct nft_table *table; - unsigned int base_seq; LIST_HEAD(adl); int err; @@ -9661,6 +9872,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) WRITE_ONCE(nft_net->base_seq, base_seq); + /* Bump gc counter, it becomes odd, this is the busy mark. */ + gc_seq = READ_ONCE(nft_net->gc_seq); + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + /* step 3. Start new generation, rules_gen_X now in use. */ net->nft.gencursor = nft_gencursor_next(net); @@ -9768,6 +9983,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) break; case NFT_MSG_DELSET: case NFT_MSG_DESTROYSET: + nft_trans_set(trans)->dead = 1; list_del_rcu(&nft_trans_set(trans)->list); nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), trans->msg_type, GFP_KERNEL); @@ -9870,6 +10086,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); nf_tables_commit_audit_log(&adl, nft_net->base_seq); + + WRITE_ONCE(nft_net->gc_seq, ++gc_seq); nf_tables_commit_release(net); return 0; @@ -10919,6 +11137,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); nft_net->base_seq = 1; + nft_net->gc_seq = 0; return 0; } @@ -10947,10 +11166,16 @@ static void __net_exit nf_tables_exit_net(struct net *net) WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); } +static void nf_tables_exit_batch(struct list_head *net_exit_list) +{ + flush_work(&trans_gc_work); +} + static struct pernet_operations nf_tables_net_ops = { .init = nf_tables_init_net, .pre_exit = nf_tables_pre_exit_net, .exit = nf_tables_exit_net, + .exit_batch = nf_tables_exit_batch, .id = &nf_tables_net_id, .size = sizeof(struct nftables_pernet), }; @@ -11022,6 +11247,7 @@ static void __exit nf_tables_module_exit(void) nft_chain_filter_fini(); nft_chain_route_fini(); unregister_pernet_subsys(&nf_tables_net_ops); + cancel_work_sync(&trans_gc_work); cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); -- cgit v1.2.3 From a2dd0233cbc4d8a0abb5f64487487ffc9265beb5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Aug 2023 15:00:36 +0200 Subject: netfilter: nf_tables: remove busy mark and gc batch API Ditch it, it has been replace it by the GC transaction API and it has no clients anymore. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 98 ++------------------------------------- net/netfilter/nf_tables_api.c | 48 +------------------ 2 files changed, 4 insertions(+), 142 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 7256e9c80477..35870858ddf2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -599,7 +599,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net, struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); -void *nft_set_catchall_gc(const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { @@ -816,62 +815,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, void *elem); -/** - * struct nft_set_gc_batch_head - nf_tables set garbage collection batch - * - * @rcu: rcu head - * @set: set the elements belong to - * @cnt: count of elements - */ -struct nft_set_gc_batch_head { - struct rcu_head rcu; - const struct nft_set *set; - unsigned int cnt; -}; - -#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \ - sizeof(struct nft_set_gc_batch_head)) / \ - sizeof(void *)) - -/** - * struct nft_set_gc_batch - nf_tables set garbage collection batch - * - * @head: GC batch head - * @elems: garbage collection elements - */ -struct nft_set_gc_batch { - struct nft_set_gc_batch_head head; - void *elems[NFT_SET_GC_BATCH_SIZE]; -}; - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp); -void nft_set_gc_batch_release(struct rcu_head *rcu); - -static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb) -{ - if (gcb != NULL) - call_rcu(&gcb->head.rcu, nft_set_gc_batch_release); -} - -static inline struct nft_set_gc_batch * -nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb, - gfp_t gfp) -{ - if (gcb != NULL) { - if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems)) - return gcb; - nft_set_gc_batch_complete(gcb); - } - return nft_set_gc_batch_alloc(set, gfp); -} - -static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb, - void *elem) -{ - gcb->elems[gcb->head.cnt++] = elem; -} - struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type @@ -1560,47 +1503,12 @@ static inline void nft_set_elem_change_active(const struct net *net, #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ -/* - * We use a free bit in the genmask field to indicate the element - * is busy, meaning it is currently being processed either by - * the netlink API or GC. - * - * Even though the genmask is only a single byte wide, this works - * because the extension structure if fully constant once initialized, - * so there are no non-atomic write accesses unless it is already - * marked busy. - */ -#define NFT_SET_ELEM_BUSY_MASK (1 << 2) - -#if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT 2 -#elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) -#else -#error -#endif - -static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) -{ - unsigned long *word = (unsigned long *)ext; - - BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); - return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); -} - -static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) -{ - unsigned long *word = (unsigned long *)ext; - - clear_bit(NFT_SET_ELEM_BUSY_BIT, word); -} - -#define NFT_SET_ELEM_DEAD_MASK (1 << 3) +#define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_DEAD_BIT 3 +#define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index fd4b5da7ac3c..c62227ae7746 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6296,29 +6296,6 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, } EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); -void *nft_set_catchall_gc(const struct nft_set *set) -{ - struct nft_set_elem_catchall *catchall, *next; - struct nft_set_ext *ext; - void *elem = NULL; - - list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { - ext = nft_set_elem_ext(set, catchall->elem); - - if (!nft_set_elem_expired(ext) || - nft_set_elem_mark_busy(ext)) - continue; - - elem = catchall->elem; - list_del_rcu(&catchall->list); - kfree_rcu(catchall, rcu); - break; - } - - return elem; -} -EXPORT_SYMBOL_GPL(nft_set_catchall_gc); - static int nft_setelem_catchall_insert(const struct net *net, struct nft_set *set, const struct nft_set_elem *elem, @@ -6789,7 +6766,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, goto err_elem_free; } - ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; + ext->genmask = nft_genmask_cur(ctx->net); err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags); if (err) { @@ -7181,29 +7158,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb, return err; } -void nft_set_gc_batch_release(struct rcu_head *rcu) -{ - struct nft_set_gc_batch *gcb; - unsigned int i; - - gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); - for (i = 0; i < gcb->head.cnt; i++) - nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); - kfree(gcb); -} - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp) -{ - struct nft_set_gc_batch *gcb; - - gcb = kzalloc(sizeof(*gcb), gfp); - if (gcb == NULL) - return gcb; - gcb->head.set = set; - return gcb; -} - /* * Stateful objects */ -- cgit v1.2.3 From a57c27c7ad85c420b7de44c6ee56692d51709dda Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 9 Aug 2023 15:04:59 +0200 Subject: x86/speculation: Add cpu_show_gds() prototype The newly added function has two definitions but no prototypes: drivers/base/cpu.c:605:16: error: no previous prototype for 'cpu_show_gds' [-Werror=missing-prototypes] Add a declaration next to the other ones for this file to avoid the warning. Fixes: 8974eb588283b ("x86/speculation: Add Gather Data Sampling mitigation") Signed-off-by: Arnd Bergmann Signed-off-by: Dave Hansen Tested-by: Daniel Sneddon Cc: stable@kernel.org Link: https://lore.kernel.org/all/20230809130530.1913368-1-arnd%40kernel.org --- include/linux/cpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 23ac87be1ff1..e006c719182b 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -72,6 +72,8 @@ extern ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_gds(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From c8afaa1b0f8bc93d013ab2ea6b9649958af3f1d3 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 12 Aug 2023 18:15:54 +0200 Subject: locking: remove spin_lock_prefetch The only remaining consumer is new_inode, where it showed up in 2001 as commit c37fa164f793 ("v2.4.9.9 -> v2.4.9.10") in a historical repo [1] with a changelog which does not mention it. Since then the line got only touched up to keep compiling. While it may have been of benefit back in the day, it is guaranteed to at best not get in the way in the multicore setting -- as the code performs *a lot* of work between the prefetch and actual lock acquire, any contention means the cacheline is already invalid by the time the routine calls spin_lock(). It adds spurious traffic, for short. On top of it prefetch is notoriously tricky to use for single-threaded purposes, making it questionable from the get go. As such, remove it. I admit upfront I did not see value in benchmarking this change, but I can do it if that is deemed appropriate. Removal from new_inode and of the entire thing are in the same patch as requested by Linus, so whatever weird looks can be directed at that guy. Link: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/fs/inode.c?id=c37fa164f793735b32aa3f53154ff1a7659e6442 [1] Signed-off-by: Mateusz Guzik Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/processor.h | 13 ------------- arch/arm64/include/asm/processor.h | 8 -------- arch/ia64/include/asm/processor.h | 3 --- .../include/asm/mach-cavium-octeon/cpu-feature-overrides.h | 2 -- arch/powerpc/include/asm/processor.h | 3 --- arch/sparc/include/asm/processor_64.h | 3 --- arch/x86/include/asm/processor.h | 6 ------ fs/inode.c | 3 --- include/linux/prefetch.h | 7 +------ 9 files changed, 1 insertion(+), 47 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 714abe494e5f..55bb1c09fd39 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -47,12 +47,6 @@ unsigned long __get_wchan(struct task_struct *p); #define ARCH_HAS_PREFETCH #define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH - -#ifndef CONFIG_SMP -/* Nothing to prefetch. */ -#define spin_lock_prefetch(lock) do { } while (0) -#endif extern inline void prefetch(const void *ptr) { @@ -64,11 +58,4 @@ extern inline void prefetchw(const void *ptr) __builtin_prefetch(ptr, 1, 3); } -#ifdef CONFIG_SMP -extern inline void spin_lock_prefetch(const void *ptr) -{ - __builtin_prefetch(ptr, 1, 3); -} -#endif - #endif /* __ASM_ALPHA_PROCESSOR_H */ diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 3918f2a67970..e5bc54522e71 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -359,14 +359,6 @@ static inline void prefetchw(const void *ptr) asm volatile("prfm pstl1keep, %a0\n" : : "p" (ptr)); } -#define ARCH_HAS_SPINLOCK_PREFETCH -static inline void spin_lock_prefetch(const void *ptr) -{ - asm volatile(ARM64_LSE_ATOMIC_INSN( - "prfm pstl1strm, %a0", - "nop") : : "p" (ptr)); -} - extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */ extern void __init minsigstksz_setup(void); diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index d1978e004054..47e3801b526a 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -634,7 +634,6 @@ ia64_imva (void *addr) #define ARCH_HAS_PREFETCH #define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH #define PREFETCH_STRIDE L1_CACHE_BYTES static inline void @@ -649,8 +648,6 @@ prefetchw (const void *x) ia64_lfetch_excl(ia64_lfhint_none, x); } -#define spin_lock_prefetch(x) prefetchw(x) - extern unsigned long boot_option_idle_override; enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_FORCE_MWAIT, diff --git a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h index 9151dcd9d0d5..af9cea21c853 100644 --- a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h +++ b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h @@ -58,8 +58,6 @@ #define cpu_has_rixi (cpu_data[0].cputype != CPU_CAVIUM_OCTEON) -#define ARCH_HAS_SPINLOCK_PREFETCH 1 -#define spin_lock_prefetch(x) prefetch(x) #define PREFETCH_STRIDE 128 #ifdef __OCTEON__ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 8a6754ffdc7e..a6c7069bec5d 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -393,7 +393,6 @@ int validate_sp_size(unsigned long sp, struct task_struct *p, */ #define ARCH_HAS_PREFETCH #define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH static inline void prefetch(const void *x) { @@ -411,8 +410,6 @@ static inline void prefetchw(const void *x) __asm__ __volatile__ ("dcbtst 0,%0" : : "r" (x)); } -#define spin_lock_prefetch(x) prefetchw(x) - /* asm stubs */ extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val); extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val); diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index 2667f35d5ea5..0a0d5c3d184c 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -213,7 +213,6 @@ unsigned long __get_wchan(struct task_struct *task); */ #define ARCH_HAS_PREFETCH #define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH static inline void prefetch(const void *x) { @@ -239,8 +238,6 @@ static inline void prefetchw(const void *x) : "r" (x)); } -#define spin_lock_prefetch(x) prefetchw(x) - #define HAVE_ARCH_PICK_MMAP_LAYOUT int do_mathemu(struct pt_regs *regs, struct fpustate *f, bool illegal_insn_trap); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4ae2773b873d..fd750247ca89 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -586,7 +586,6 @@ extern char ignore_fpu_irq; #define HAVE_ARCH_PICK_MMAP_LAYOUT 1 #define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 # define BASE_PREFETCH "" @@ -620,11 +619,6 @@ static __always_inline void prefetchw(const void *x) "m" (*(const char *)x)); } -static inline void spin_lock_prefetch(const void *x) -{ - prefetchw(x); -} - #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) diff --git a/fs/inode.c b/fs/inode.c index 8fefb69e1f84..67611a360031 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -16,7 +16,6 @@ #include #include #include -#include #include /* for inode_has_buffers */ #include #include @@ -1041,8 +1040,6 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - spin_lock_prefetch(&sb->s_inode_list_lock); - inode = new_inode_pseudo(sb); if (inode) inode_sb_list_add(inode); diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h index b83a3f944f28..b068e2e60939 100644 --- a/include/linux/prefetch.h +++ b/include/linux/prefetch.h @@ -25,11 +25,10 @@ struct page; prefetch() should be defined by the architecture, if not, the #define below provides a no-op define. - There are 3 prefetch() macros: + There are 2 prefetch() macros: prefetch(x) - prefetches the cacheline at "x" for read prefetchw(x) - prefetches the cacheline at "x" for write - spin_lock_prefetch(x) - prefetches the spinlock *x for taking there is also PREFETCH_STRIDE which is the architecure-preferred "lookahead" size for prefetching streamed operations. @@ -44,10 +43,6 @@ struct page; #define prefetchw(x) __builtin_prefetch(x,1) #endif -#ifndef ARCH_HAS_SPINLOCK_PREFETCH -#define spin_lock_prefetch(x) prefetchw(x) -#endif - #ifndef PREFETCH_STRIDE #define PREFETCH_STRIDE (4*L1_CACHE_BYTES) #endif -- cgit v1.2.3 From 082f613127832c6e57ad21fa76500bc08427f04a Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Mon, 14 Aug 2023 22:05:10 +0800 Subject: fbdev: kyro: Remove unused declarations These declarations is never implemented since the beginning of git history. Signed-off-by: Yue Haibing Signed-off-by: Helge Deller --- include/video/kyro.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/video/kyro.h b/include/video/kyro.h index b958c2e9c915..418eef6c5523 100644 --- a/include/video/kyro.h +++ b/include/video/kyro.h @@ -38,18 +38,6 @@ struct kyrofb_info { int wc_cookie; }; -extern int kyro_dev_init(void); -extern void kyro_dev_reset(void); - -extern unsigned char *kyro_dev_physical_fb_ptr(void); -extern unsigned char *kyro_dev_virtual_fb_ptr(void); -extern void *kyro_dev_physical_regs_ptr(void); -extern void *kyro_dev_virtual_regs_ptr(void); -extern unsigned int kyro_dev_fb_size(void); -extern unsigned int kyro_dev_regs_size(void); - -extern u32 kyro_dev_overlay_offset(void); - /* * benedict.gaster@superh.com * Added the follow IOCTLS for the creation of overlay services... -- cgit v1.2.3 From 08713cb006b6f07434f276c5ee214fb20c7fd965 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Aug 2023 23:59:03 +0200 Subject: netfilter: nf_tables: fix kdoc warnings after gc rework Jakub Kicinski says: We've got some new kdoc warnings here: net/netfilter/nft_set_pipapo.c:1557: warning: Function parameter or member '_set' not described in 'pipapo_gc' net/netfilter/nft_set_pipapo.c:1557: warning: Excess function parameter 'set' description in 'pipapo_gc' include/net/netfilter/nf_tables.h:577: warning: Function parameter or member 'dead' not described in 'nft_set' Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") Reported-by: Jakub Kicinski Closes: https://lore.kernel.org/netdev/20230810104638.746e46f1@kernel.org/ Signed-off-by: Florian Westphal --- include/net/netfilter/nf_tables.h | 1 + net/netfilter/nft_set_pipapo.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 35870858ddf2..e9ae567c037d 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -534,6 +534,7 @@ struct nft_set_elem_expr { * @expr: stateful expression * @ops: set ops * @flags: set flags + * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 5fa12cfc7b84..f95b3844162e 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1549,7 +1549,7 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, /** * pipapo_gc() - Drop expired entries from set, destroy start and end elements - * @set: nftables API set representation + * @_set: nftables API set representation * @m: Matching data */ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) -- cgit v1.2.3 From 50b6f2c8297793f7f3315623db78dcff85158e96 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 15 Aug 2023 13:19:07 +0300 Subject: Revert "drm/edid: Fix csync detailed mode parsing" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit ca62297b2085b5b3168bd891ca24862242c635a1. Commit ca62297b2085 ("drm/edid: Fix csync detailed mode parsing") fixed EDID detailed mode sync parsing. Unfortunately, there are quite a few displays out there that have bogus (zero) sync field that are broken by the change. Zero means analog composite sync, which is not right for digital displays, and the modes get rejected. Regardless, it used to work, and it needs to continue to work. Revert the change. Rejecting modes with analog composite sync was the part that fixed the gitlab issue 8146 [1]. We'll need to get back to the drawing board with that. [1] https://gitlab.freedesktop.org/drm/intel/-/issues/8146 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/8789 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/8930 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/9044 Fixes: ca62297b2085 ("drm/edid: Fix csync detailed mode parsing") Cc: Ville Syrjälä Cc: dri-devel@lists.freedesktop.org Cc: # v6.4+ Signed-off-by: Jani Nikula Acked-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20230815101907.2900768-1-jani.nikula@intel.com --- drivers/gpu/drm/drm_edid.c | 29 ++++++++--------------------- include/drm/drm_edid.h | 12 +++--------- 2 files changed, 11 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index e0dbd9140726..1f470968ed14 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -3456,6 +3456,10 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_connector *connecto connector->base.id, connector->name); return NULL; } + if (!(pt->misc & DRM_EDID_PT_SEPARATE_SYNC)) { + drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Composite sync not supported\n", + connector->base.id, connector->name); + } /* it is incorrect if hsync/vsync width is zero */ if (!hsync_pulse_width || !vsync_pulse_width) { @@ -3502,27 +3506,10 @@ static struct drm_display_mode *drm_mode_detailed(struct drm_connector *connecto if (info->quirks & EDID_QUIRK_DETAILED_SYNC_PP) { mode->flags |= DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC; } else { - switch (pt->misc & DRM_EDID_PT_SYNC_MASK) { - case DRM_EDID_PT_ANALOG_CSYNC: - case DRM_EDID_PT_BIPOLAR_ANALOG_CSYNC: - drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Analog composite sync!\n", - connector->base.id, connector->name); - mode->flags |= DRM_MODE_FLAG_CSYNC | DRM_MODE_FLAG_NCSYNC; - break; - case DRM_EDID_PT_DIGITAL_CSYNC: - drm_dbg_kms(dev, "[CONNECTOR:%d:%s] Digital composite sync!\n", - connector->base.id, connector->name); - mode->flags |= DRM_MODE_FLAG_CSYNC; - mode->flags |= (pt->misc & DRM_EDID_PT_HSYNC_POSITIVE) ? - DRM_MODE_FLAG_PCSYNC : DRM_MODE_FLAG_NCSYNC; - break; - case DRM_EDID_PT_DIGITAL_SEPARATE_SYNC: - mode->flags |= (pt->misc & DRM_EDID_PT_HSYNC_POSITIVE) ? - DRM_MODE_FLAG_PHSYNC : DRM_MODE_FLAG_NHSYNC; - mode->flags |= (pt->misc & DRM_EDID_PT_VSYNC_POSITIVE) ? - DRM_MODE_FLAG_PVSYNC : DRM_MODE_FLAG_NVSYNC; - break; - } + mode->flags |= (pt->misc & DRM_EDID_PT_HSYNC_POSITIVE) ? + DRM_MODE_FLAG_PHSYNC : DRM_MODE_FLAG_NHSYNC; + mode->flags |= (pt->misc & DRM_EDID_PT_VSYNC_POSITIVE) ? + DRM_MODE_FLAG_PVSYNC : DRM_MODE_FLAG_NVSYNC; } set_size: diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h index 169755d3de19..48e93f909ef6 100644 --- a/include/drm/drm_edid.h +++ b/include/drm/drm_edid.h @@ -61,15 +61,9 @@ struct std_timing { u8 vfreq_aspect; } __attribute__((packed)); -#define DRM_EDID_PT_SYNC_MASK (3 << 3) -# define DRM_EDID_PT_ANALOG_CSYNC (0 << 3) -# define DRM_EDID_PT_BIPOLAR_ANALOG_CSYNC (1 << 3) -# define DRM_EDID_PT_DIGITAL_CSYNC (2 << 3) -# define DRM_EDID_PT_CSYNC_ON_RGB (1 << 1) /* analog csync only */ -# define DRM_EDID_PT_CSYNC_SERRATE (1 << 2) -# define DRM_EDID_PT_DIGITAL_SEPARATE_SYNC (3 << 3) -# define DRM_EDID_PT_HSYNC_POSITIVE (1 << 1) /* also digital csync */ -# define DRM_EDID_PT_VSYNC_POSITIVE (1 << 2) +#define DRM_EDID_PT_HSYNC_POSITIVE (1 << 1) +#define DRM_EDID_PT_VSYNC_POSITIVE (1 << 2) +#define DRM_EDID_PT_SEPARATE_SYNC (3 << 3) #define DRM_EDID_PT_STEREO (1 << 5) #define DRM_EDID_PT_INTERLACED (1 << 7) -- cgit v1.2.3 From 2d0c88e84e483982067a82073f6125490ddf3614 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Wed, 16 Aug 2023 17:12:22 +0800 Subject: sock: Fix misuse of sk_under_memory_pressure() The status of global socket memory pressure is updated when: a) __sk_mem_raise_allocated(): enter: sk_memory_allocated(sk) > sysctl_mem[1] leave: sk_memory_allocated(sk) <= sysctl_mem[0] b) __sk_mem_reduce_allocated(): leave: sk_under_memory_pressure(sk) && sk_memory_allocated(sk) < sysctl_mem[0] So the conditions of leaving global pressure are inconstant, which may lead to the situation that one pressured net-memcg prevents the global pressure from being cleared when there is indeed no global pressure, thus the global constrains are still in effect unexpectedly on the other sockets. This patch fixes this by ignoring the net-memcg's pressure when deciding whether should leave global memory pressure. Fixes: e1aab161e013 ("socket: initial cgroup code.") Signed-off-by: Abel Wu Acked-by: Shakeel Butt Link: https://lore.kernel.org/r/20230816091226.1542-1-wuyun.abel@bytedance.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 6 ++++++ net/core/sock.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 2eb916d1ff64..e3d987b2ef12 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1420,6 +1420,12 @@ static inline bool sk_has_memory_pressure(const struct sock *sk) return sk->sk_prot->memory_pressure != NULL; } +static inline bool sk_under_global_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure && + !!*sk->sk_prot->memory_pressure; +} + static inline bool sk_under_memory_pressure(const struct sock *sk) { if (!sk->sk_prot->memory_pressure) diff --git a/net/core/sock.c b/net/core/sock.c index 732fc37a4771..c9cffb7acbea 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3159,7 +3159,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) if (mem_cgroup_sockets_enabled && sk->sk_memcg) mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); - if (sk_under_memory_pressure(sk) && + if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) sk_leave_memory_pressure(sk); } -- cgit v1.2.3 From b616be6b97688f2f2bd7c4a47ab32f27f94fb2a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2023 14:21:58 +0000 Subject: net: do not allow gso_size to be set to GSO_BY_FRAGS One missing check in virtio_net_hdr_to_skb() allowed syzbot to crash kernels again [1] Do not allow gso_size to be set to GSO_BY_FRAGS (0xffff), because this magic value is used by the kernel. [1] general protection fault, probably for non-canonical address 0xdffffc000000000e: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000070-0x0000000000000077] CPU: 0 PID: 5039 Comm: syz-executor401 Not tainted 6.5.0-rc5-next-20230809-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 RIP: 0010:skb_segment+0x1a52/0x3ef0 net/core/skbuff.c:4500 Code: 00 00 00 e9 ab eb ff ff e8 6b 96 5d f9 48 8b 84 24 00 01 00 00 48 8d 78 70 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e ea 21 00 00 48 8b 84 24 00 01 RSP: 0018:ffffc90003d3f1c8 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: 000000000001fffe RCX: 0000000000000000 RDX: 000000000000000e RSI: ffffffff882a3115 RDI: 0000000000000070 RBP: ffffc90003d3f378 R08: 0000000000000005 R09: 000000000000ffff R10: 000000000000ffff R11: 5ee4a93e456187d6 R12: 000000000001ffc6 R13: dffffc0000000000 R14: 0000000000000008 R15: 000000000000ffff FS: 00005555563f2380(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020020000 CR3: 000000001626d000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: udp6_ufo_fragment+0x9d2/0xd50 net/ipv6/udp_offload.c:109 ipv6_gso_segment+0x5c4/0x17b0 net/ipv6/ip6_offload.c:120 skb_mac_gso_segment+0x292/0x610 net/core/gso.c:53 __skb_gso_segment+0x339/0x710 net/core/gso.c:124 skb_gso_segment include/net/gso.h:83 [inline] validate_xmit_skb+0x3a5/0xf10 net/core/dev.c:3625 __dev_queue_xmit+0x8f0/0x3d60 net/core/dev.c:4329 dev_queue_xmit include/linux/netdevice.h:3082 [inline] packet_xmit+0x257/0x380 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3087 [inline] packet_sendmsg+0x24c7/0x5570 net/packet/af_packet.c:3119 sock_sendmsg_nosec net/socket.c:727 [inline] sock_sendmsg+0xd9/0x180 net/socket.c:750 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2496 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2550 __sys_sendmsg+0x117/0x1e0 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7ff27cdb34d9 Fixes: 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Xin Long Cc: "Michael S. Tsirkin" Cc: Jason Wang Reviewed-by: Willem de Bruijn Reviewed-by: Marcelo Ricardo Leitner Reviewed-by: Xuan Zhuo Link: https://lore.kernel.org/r/20230816142158.1779798-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index bdf8de2cdd93..7b4dd69555e4 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -155,6 +155,10 @@ retry: if (gso_type & SKB_GSO_UDP) nh_off -= thlen; + /* Kernel has a special handling for GSO_BY_FRAGS. */ + if (gso_size == GSO_BY_FRAGS) + return -EINVAL; + /* Too small packets are not really GSO ones. */ if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; -- cgit v1.2.3