From 8e742aa79780b13cd300a42198c1a4cea9c89905 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 10 Jun 2020 13:48:51 +0200 Subject: syscalls: Fix offset type of ksys_ftruncate() After the commit below, truncate() on x86 32bit uses ksys_ftruncate(). But ksys_ftruncate() truncates the offset to unsigned long. Switch the type of offset to loff_t which is what do_sys_ftruncate() expects. Fixes: 121b32a58a3a (x86/entry/32: Use IA32-specific wrappers for syscalls taking 64-bit arguments) Signed-off-by: Jiri Slaby Signed-off-by: Thomas Gleixner Reviewed-by: Brian Gerst Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200610114851.28549-1-jslaby@suse.cz --- include/linux/syscalls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7c354c2955f5..b951a87da987 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1360,7 +1360,7 @@ static inline long ksys_lchown(const char __user *filename, uid_t user, extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small); -static inline long ksys_ftruncate(unsigned int fd, unsigned long length) +static inline long ksys_ftruncate(unsigned int fd, loff_t length) { return do_sys_ftruncate(fd, length, 1); } -- cgit v1.2.3 From e79302ae8c8cceb51cf642d5ace9da02668cb7b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2020 17:04:03 +0200 Subject: kcsan: Remove __no_kcsan_or_inline There are no more user of this function attribute, also, with us now actively supporting '__no_kcsan inline' it doesn't make sense to have in any case. Signed-off-by: Peter Zijlstra (Intel) --- Documentation/dev-tools/kcsan.rst | 6 ------ include/linux/compiler_types.h | 5 +---- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'include') diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index ce4bbd918648..b38379f06194 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -114,12 +114,6 @@ the below options are available: To dynamically limit for which functions to generate reports, see the `DebugFS interface`_ blacklist/whitelist feature. - For ``__always_inline`` functions, replace ``__always_inline`` with - ``__no_kcsan_or_inline`` (which implies ``__always_inline``):: - - static __no_kcsan_or_inline void foo(void) { - ... - * To disable data race detection for a particular compilation unit, add to the ``Makefile``:: diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 21aed0981edf..938249809511 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -193,10 +193,7 @@ struct ftrace_likely_data { #define __no_kcsan __no_sanitize_thread #ifdef __SANITIZE_THREAD__ -# define __no_kcsan_or_inline __no_kcsan notrace __maybe_unused -# define __no_sanitize_or_inline __no_kcsan_or_inline -#else -# define __no_kcsan_or_inline __always_inline +# define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused #endif #ifndef __no_sanitize_or_inline -- cgit v1.2.3 From 5ddbc4082e1072eeeae52ff561a88620a05be08f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2020 18:47:11 +0200 Subject: x86, kcsan: Add __no_kcsan to noinstr The 'noinstr' function attribute means no-instrumentation, this should very much include *SAN. Because lots of that is broken at present, only include KCSAN for now, as that is limited to clang11, which has sane function attribute behaviour. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/compiler_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 938249809511..a8b4266084a1 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -118,10 +118,6 @@ struct ftrace_likely_data { #define notrace __attribute__((__no_instrument_function__)) #endif -/* Section for code which can't be instrumented at all */ -#define noinstr \ - noinline notrace __attribute((__section__(".noinstr.text"))) - /* * it doesn't make sense on ARM (currently the only user of __naked) * to trace naked functions because then mcount is called without @@ -200,6 +196,10 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* Section for code which can't be instrumented at all */ +#define noinstr \ + noinline notrace __attribute((__section__(".noinstr.text"))) __no_kcsan + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From 5144f8a8dfd7b3681f0a2b5bf599a210b2315018 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 4 Jun 2020 07:58:11 +0200 Subject: compiler_types.h: Add __no_sanitize_{address,undefined} to noinstr Adds the portable definitions for __no_sanitize_address, and __no_sanitize_undefined, and subsequently changes noinstr to use the attributes to disable instrumentation via KASAN or UBSAN. Reported-by: syzbot+dc1fa714cb070b184db5@syzkaller.appspotmail.com Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Miguel Ojeda Link: https://lore.kernel.org/lkml/000000000000d2474c05a6c938fe@google.com/ --- include/linux/compiler-clang.h | 8 ++++++++ include/linux/compiler-gcc.h | 6 ++++++ include/linux/compiler_types.h | 3 ++- 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index ee37256ec8bd..5e55302e3bf6 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -33,6 +33,14 @@ #define __no_sanitize_thread #endif +#if __has_feature(undefined_behavior_sanitizer) +/* GCC does not have __SANITIZE_UNDEFINED__ */ +#define __no_sanitize_undefined \ + __attribute__((no_sanitize("undefined"))) +#else +#define __no_sanitize_undefined +#endif + /* * Not all versions of clang implement the the type-generic versions * of the builtin overflow checkers. Fortunately, clang implements diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7dd4e0349ef3..1c74464c80c6 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -150,6 +150,12 @@ #define __no_sanitize_thread #endif +#if __has_attribute(__no_sanitize_undefined__) +#define __no_sanitize_undefined __attribute__((no_sanitize_undefined)) +#else +#define __no_sanitize_undefined +#endif + #if GCC_VERSION >= 50100 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index a8b4266084a1..85b8d2370c24 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -198,7 +198,8 @@ struct ftrace_likely_data { /* Section for code which can't be instrumented at all */ #define noinstr \ - noinline notrace __attribute((__section__(".noinstr.text"))) __no_kcsan + noinline notrace __attribute((__section__(".noinstr.text"))) \ + __no_kcsan __no_sanitize_address __no_sanitize_undefined #endif /* __KERNEL__ */ -- cgit v1.2.3 From 6b643a07a7e41f9e11cfbb9bba4c5c9791ac2997 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2020 20:09:06 +0200 Subject: x86/entry, ubsan, objtool: Whitelist __ubsan_handle_*() The UBSAN instrumentation only inserts external CALLs when things go 'BAD', much like WARN(). So treat them similar to WARN()s for noinstr, that is: allow them, at the risk of taking the machine down, to get their message out. Suggested-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver --- include/linux/compiler_types.h | 2 +- tools/objtool/check.c | 28 +++++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 85b8d2370c24..14513e88b7e0 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -199,7 +199,7 @@ struct ftrace_likely_data { /* Section for code which can't be instrumented at all */ #define noinstr \ noinline notrace __attribute((__section__(".noinstr.text"))) \ - __no_kcsan __no_sanitize_address __no_sanitize_undefined + __no_kcsan __no_sanitize_address #endif /* __KERNEL__ */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 5fbb90a80d23..3e214f879ada 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2190,10 +2190,36 @@ static inline const char *call_dest_name(struct instruction *insn) return "{dynamic}"; } +static inline bool noinstr_call_dest(struct symbol *func) +{ + /* + * We can't deal with indirect function calls at present; + * assume they're instrumented. + */ + if (!func) + return false; + + /* + * If the symbol is from a noinstr section; we good. + */ + if (func->sec->noinstr) + return true; + + /* + * The __ubsan_handle_*() calls are like WARN(), they only happen when + * something 'BAD' happened. At the risk of taking the machine down, + * let them proceed to get the message out. + */ + if (!strncmp(func->name, "__ubsan_handle_", 15)) + return true; + + return false; +} + static int validate_call(struct instruction *insn, struct insn_state *state) { if (state->noinstr && state->instr <= 0 && - (!insn->call_dest || !insn->call_dest->sec->noinstr)) { + !noinstr_call_dest(insn->call_dest)) { WARN_FUNC("call to %s() leaves .noinstr.text section", insn->sec, insn->offset, call_dest_name(insn)); return 1; -- cgit v1.2.3 From 7dfc06a0f25b593a9f51992f540c0f80a57f3629 Mon Sep 17 00:00:00 2001 From: Fabian Vogt Date: Mon, 15 Jun 2020 09:16:36 +0200 Subject: efi/tpm: Verify event log header before parsing It is possible that the first event in the event log is not actually a log header at all, but rather a normal event. This leads to the cast in __calc_tpm2_event_size being an invalid conversion, which means that the values read are effectively garbage. Depending on the first event's contents, this leads either to apparently normal behaviour, a crash or a freeze. While this behaviour of the firmware is not in accordance with the TCG Client EFI Specification, this happens on a Dell Precision 5510 with the TPM enabled but hidden from the OS ("TPM On" disabled, state otherwise untouched). The EFI firmware claims that the TPM is present and active and that it supports the TCG 2.0 event log format. Fortunately, this can be worked around by simply checking the header of the first event and the event log header signature itself. Commit b4f1874c6216 ("tpm: check event log version before reading final events") addressed a similar issue also found on Dell models. Fixes: 6b0326190205 ("efi: Attempt to get the TCG2 event log in the boot stub") Signed-off-by: Fabian Vogt Link: https://lore.kernel.org/r/1927248.evlx2EsYKh@linux-e202.suse.de Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1165773 Signed-off-by: Ard Biesheuvel --- include/linux/tpm_eventlog.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 4f8c90c93c29..64356b199e94 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -81,6 +81,8 @@ struct tcg_efi_specid_event_algs { u16 digest_size; } __packed; +#define TCG_SPECID_SIG "Spec ID Event03" + struct tcg_efi_specid_event_head { u8 signature[16]; u32 platform_class; @@ -171,6 +173,7 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, int i; int j; u32 count, event_type; + const u8 zero_digest[sizeof(event_header->digest)] = {0}; marker = event; marker_start = marker; @@ -198,10 +201,19 @@ static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, count = READ_ONCE(event->count); event_type = READ_ONCE(event->event_type); + /* Verify that it's the log header */ + if (event_header->pcr_idx != 0 || + event_header->event_type != NO_ACTION || + memcmp(event_header->digest, zero_digest, sizeof(zero_digest))) { + size = 0; + goto out; + } + efispecid = (struct tcg_efi_specid_event_head *)event_header->event; /* Check if event is malformed. */ - if (count > efispecid->num_algs) { + if (memcmp(efispecid->signature, TCG_SPECID_SIG, + sizeof(TCG_SPECID_SIG)) || count > efispecid->num_algs) { size = 0; goto out; } -- cgit v1.2.3 From 2963795122f50b36ed16e3ba880c3ed2de1bda6e Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 27 May 2020 12:14:25 -0500 Subject: efi: Replace zero-length array and use struct_size() helper The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. Lastly, make use of the sizeof_field() helper instead of an open-coded version. This issue was found with the help of Coccinelle and audited _manually_. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20200527171425.GA4053@embeddedor Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 3 ++- include/linux/efi.h | 7 ++----- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 7f1657b6c30d..edc5d36caf54 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -622,7 +622,8 @@ int __init efi_config_parse_tables(const efi_config_table_t *config_tables, rsv = (void *)(p + prsv % PAGE_SIZE); /* reserve the entry itself */ - memblock_reserve(prsv, EFI_MEMRESERVE_SIZE(rsv->size)); + memblock_reserve(prsv, + struct_size(rsv, entry, rsv->size)); for (i = 0; i < atomic_read(&rsv->count); i++) { memblock_reserve(rsv->entry[i].base, diff --git a/include/linux/efi.h b/include/linux/efi.h index 2c6495f72f79..c3449c9699d0 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1236,14 +1236,11 @@ struct linux_efi_memreserve { struct { phys_addr_t base; phys_addr_t size; - } entry[0]; + } entry[]; }; -#define EFI_MEMRESERVE_SIZE(count) (sizeof(struct linux_efi_memreserve) + \ - (count) * sizeof(((struct linux_efi_memreserve *)0)->entry[0])) - #define EFI_MEMRESERVE_COUNT(size) (((size) - sizeof(struct linux_efi_memreserve)) \ - / sizeof(((struct linux_efi_memreserve *)0)->entry[0])) + / sizeof_field(struct linux_efi_memreserve, entry[0])) void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size); -- cgit v1.2.3 From 33aea07f30c261eff7ba229f19fd1b161e0fb851 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 16 Jun 2020 01:15:29 +0200 Subject: compiler_attributes.h: Support no_sanitize_undefined check with GCC 4 UBSAN is supported since GCC 4.9, which unfortunately did not yet have __has_attribute(). To work around, the __GCC4_has_attribute workaround requires defining which compiler version supports the given attribute. In the case of no_sanitize_undefined, it is the first version that supports UBSAN, which is GCC 4.9. Reported-by: kernel test robot Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miguel Ojeda Link: https://lkml.kernel.org/r/20200615231529.GA119644@google.com --- include/linux/compiler_attributes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index cdf016596659..c8f03d2969df 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -40,6 +40,7 @@ # define __GCC4_has_attribute___noclone__ 1 # define __GCC4_has_attribute___nonstring__ 0 # define __GCC4_has_attribute___no_sanitize_address__ (__GNUC_MINOR__ >= 8) +# define __GCC4_has_attribute___no_sanitize_undefined__ (__GNUC_MINOR__ >= 9) # define __GCC4_has_attribute___fallthrough__ 0 #endif -- cgit v1.2.3 From 2fd2bc7f49324e7b439e3c5584762abd0a9a13bc Mon Sep 17 00:00:00 2001 From: Colton Lewis Date: Sun, 14 Jun 2020 04:08:04 +0000 Subject: gpu: host1x: Correct trivial kernel-doc inconsistencies Silence documentation build warnings by adding kernel-doc fields. ./include/linux/host1x.h:69: warning: Function parameter or member 'parent' not described in 'host1x_client' ./include/linux/host1x.h:69: warning: Function parameter or member 'usecount' not described in 'host1x_client' ./include/linux/host1x.h:69: warning: Function parameter or member 'lock' not described in 'host1x_client' Signed-off-by: Colton Lewis Signed-off-by: Thierry Reding --- include/linux/host1x.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/host1x.h b/include/linux/host1x.h index c230b4e70d75..a3a568bf9686 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -48,6 +48,9 @@ struct host1x_client_ops { * @channel: host1x channel associated with this client * @syncpts: array of syncpoints requested for this client * @num_syncpts: number of syncpoints requested for this client + * @parent: pointer to parent structure + * @usecount: reference count for this structure + * @lock: mutex for mutually exclusive concurrency */ struct host1x_client { struct list_head list; -- cgit v1.2.3 From 26749b3201ab05e288fbf78fbc8585dfa2da3218 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 15 Jun 2020 08:52:31 +0200 Subject: dma-direct: mark __dma_direct_alloc_pages static Signed-off-by: Christoph Hellwig --- include/linux/dma-direct.h | 2 -- kernel/dma/direct.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 136f984df0d9..cdfa400f89b3 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -77,8 +77,6 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs); -struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - gfp_t gfp, unsigned long attrs); int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 9ec6a5c3fc57..30c41b57acd9 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -109,7 +109,7 @@ static inline bool dma_should_free_from_pool(struct device *dev, return false; } -struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, +static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, unsigned long attrs) { size_t alloc_size = PAGE_ALIGN(size); -- cgit v1.2.3 From 2a55280a3675203496d302463b941834228b9875 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sun, 7 Jun 2020 15:41:35 +0200 Subject: efi/libstub: arm: Print CPU boot mode and MMU state at boot On 32-bit ARM, we may boot at HYP mode, or with the MMU and caches off (or both), even though the EFI spec does not actually support this. While booting at HYP mode is something we might tolerate, fiddling with the caches is a more serious issue, as disabling the caches is tricky to do safely from C code, and running without the Dcache makes it impossible to support unaligned memory accesses, which is another explicit requirement imposed by the EFI spec. So take note of the CPU mode and MMU state in the EFI stub diagnostic output so that we can easily diagnose any issues that may arise from this. E.g., EFI stub: Entering in SVC mode with MMU enabled Also, capture the CPSR and SCTLR system register values at EFI stub entry, and after ExitBootServices() returns, and check whether the MMU and Dcache were disabled at any point. If this is the case, a diagnostic message like the following will be emitted: efi: [Firmware Bug]: EFI stub was entered with MMU and Dcache disabled, please fix your firmware! efi: CPSR at EFI stub entry : 0x600001d3 efi: SCTLR at EFI stub entry : 0x00c51838 efi: CPSR after ExitBootServices() : 0x600001d3 efi: SCTLR after ExitBootServices(): 0x00c50838 Signed-off-by: Ard Biesheuvel Reviewed-by: Leif Lindholm --- arch/arm/include/asm/efi.h | 7 ++++ drivers/firmware/efi/arm-init.c | 34 +++++++++++++++++-- drivers/firmware/efi/libstub/arm32-stub.c | 54 ++++++++++++++++++++++++++++++- drivers/firmware/efi/libstub/efi-stub.c | 3 ++ drivers/firmware/efi/libstub/efistub.h | 2 ++ include/linux/efi.h | 1 + 6 files changed, 98 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 84dc0ba822f5..5dcf3c6011b7 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -87,4 +87,11 @@ static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base, return dram_base + SZ_512M; } +struct efi_arm_entry_state { + u32 cpsr_before_ebs; + u32 sctlr_before_ebs; + u32 cpsr_after_ebs; + u32 sctlr_after_ebs; +}; + #endif /* _ASM_ARM_EFI_H */ diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index 6f4baf70db16..71c445d20258 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -52,9 +52,11 @@ static phys_addr_t __init efi_to_phys(unsigned long addr) } static __initdata unsigned long screen_info_table = EFI_INVALID_TABLE_ADDR; +static __initdata unsigned long cpu_state_table = EFI_INVALID_TABLE_ADDR; static const efi_config_table_type_t arch_tables[] __initconst = { {LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID, &screen_info_table}, + {LINUX_EFI_ARM_CPU_STATE_TABLE_GUID, &cpu_state_table}, {} }; @@ -240,9 +242,37 @@ void __init efi_init(void) init_screen_info(); +#ifdef CONFIG_ARM /* ARM does not permit early mappings to persist across paging_init() */ - if (IS_ENABLED(CONFIG_ARM)) - efi_memmap_unmap(); + efi_memmap_unmap(); + + if (cpu_state_table != EFI_INVALID_TABLE_ADDR) { + struct efi_arm_entry_state *state; + bool dump_state = true; + + state = early_memremap_ro(cpu_state_table, + sizeof(struct efi_arm_entry_state)); + if (state == NULL) { + pr_warn("Unable to map CPU entry state table.\n"); + return; + } + + if ((state->sctlr_before_ebs & 1) == 0) + pr_warn(FW_BUG "EFI stub was entered with MMU and Dcache disabled, please fix your firmware!\n"); + else if ((state->sctlr_after_ebs & 1) == 0) + pr_warn(FW_BUG "ExitBootServices() returned with MMU and Dcache disabled, please fix your firmware!\n"); + else + dump_state = false; + + if (dump_state || efi_enabled(EFI_DBG)) { + pr_info("CPSR at EFI stub entry : 0x%08x\n", state->cpsr_before_ebs); + pr_info("SCTLR at EFI stub entry : 0x%08x\n", state->sctlr_before_ebs); + pr_info("CPSR after ExitBootServices() : 0x%08x\n", state->cpsr_after_ebs); + pr_info("SCTLR after ExitBootServices(): 0x%08x\n", state->sctlr_after_ebs); + } + early_memunmap(state, sizeof(struct efi_arm_entry_state)); + } +#endif } static bool efifb_overlaps_pci_range(const struct of_pci_range *range) diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c index 40243f524556..d08e5d55838c 100644 --- a/drivers/firmware/efi/libstub/arm32-stub.c +++ b/drivers/firmware/efi/libstub/arm32-stub.c @@ -7,10 +7,49 @@ #include "efistub.h" +static efi_guid_t cpu_state_guid = LINUX_EFI_ARM_CPU_STATE_TABLE_GUID; + +struct efi_arm_entry_state *efi_entry_state; + +static void get_cpu_state(u32 *cpsr, u32 *sctlr) +{ + asm("mrs %0, cpsr" : "=r"(*cpsr)); + if ((*cpsr & MODE_MASK) == HYP_MODE) + asm("mrc p15, 4, %0, c1, c0, 0" : "=r"(*sctlr)); + else + asm("mrc p15, 0, %0, c1, c0, 0" : "=r"(*sctlr)); +} + efi_status_t check_platform_features(void) { + efi_status_t status; + u32 cpsr, sctlr; int block; + get_cpu_state(&cpsr, &sctlr); + + efi_info("Entering in %s mode with MMU %sabled\n", + ((cpsr & MODE_MASK) == HYP_MODE) ? "HYP" : "SVC", + (sctlr & 1) ? "en" : "dis"); + + status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, + sizeof(*efi_entry_state), + (void **)&efi_entry_state); + if (status != EFI_SUCCESS) { + efi_err("allocate_pool() failed\n"); + return status; + } + + efi_entry_state->cpsr_before_ebs = cpsr; + efi_entry_state->sctlr_before_ebs = sctlr; + + status = efi_bs_call(install_configuration_table, &cpu_state_guid, + efi_entry_state); + if (status != EFI_SUCCESS) { + efi_err("install_configuration_table() failed\n"); + goto free_state; + } + /* non-LPAE kernels can run anywhere */ if (!IS_ENABLED(CONFIG_ARM_LPAE)) return EFI_SUCCESS; @@ -19,9 +58,22 @@ efi_status_t check_platform_features(void) block = cpuid_feature_extract(CPUID_EXT_MMFR0, 0); if (block < 5) { efi_err("This LPAE kernel is not supported by your CPU\n"); - return EFI_UNSUPPORTED; + status = EFI_UNSUPPORTED; + goto drop_table; } return EFI_SUCCESS; + +drop_table: + efi_bs_call(install_configuration_table, &cpu_state_guid, NULL); +free_state: + efi_bs_call(free_pool, efi_entry_state); + return status; +} + +void efi_handle_post_ebs_state(void) +{ + get_cpu_state(&efi_entry_state->cpsr_after_ebs, + &efi_entry_state->sctlr_after_ebs); } static efi_guid_t screen_info_guid = LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID; diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index e97370bdfdb0..3318ec3f8e5b 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -329,6 +329,9 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, if (status != EFI_SUCCESS) goto fail_free_initrd; + if (IS_ENABLED(CONFIG_ARM)) + efi_handle_post_ebs_state(); + efi_enter_kernel(image_addr, fdt_addr, fdt_totalsize((void *)fdt_addr)); /* not reached */ diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index ac756f1fdb1a..2c9d42264c29 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -777,4 +777,6 @@ efi_status_t efi_load_initrd(efi_loaded_image_t *image, unsigned long soft_limit, unsigned long hard_limit); +void efi_handle_post_ebs_state(void); + #endif diff --git a/include/linux/efi.h b/include/linux/efi.h index c3449c9699d0..bb35f3305e55 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -350,6 +350,7 @@ void efi_native_runtime_setup(void); * associated with ConOut */ #define LINUX_EFI_ARM_SCREEN_INFO_TABLE_GUID EFI_GUID(0xe03fc20a, 0x85dc, 0x406e, 0xb9, 0x0e, 0x4a, 0xb5, 0x02, 0x37, 0x1d, 0x95) +#define LINUX_EFI_ARM_CPU_STATE_TABLE_GUID EFI_GUID(0xef79e4aa, 0x3c3d, 0x4989, 0xb9, 0x02, 0x07, 0xa9, 0x43, 0xe5, 0x50, 0xd2) #define LINUX_EFI_LOADER_ENTRY_GUID EFI_GUID(0x4a67b082, 0x0a4c, 0x41cf, 0xb6, 0xc7, 0x44, 0x0b, 0x29, 0xbb, 0x8c, 0x4f) #define LINUX_EFI_RANDOM_SEED_TABLE_GUID EFI_GUID(0x1ce1e5bc, 0x7ceb, 0x42f2, 0x81, 0xe5, 0x8a, 0xad, 0xf1, 0x80, 0xf5, 0x7b) #define LINUX_EFI_TPM_EVENT_LOG_GUID EFI_GUID(0xb7799cb0, 0xeca2, 0x4943, 0x96, 0x67, 0x1f, 0xae, 0x07, 0xb7, 0x47, 0xfa) -- cgit v1.2.3 From 34c86f4c4a7be3b3e35aa48bd18299d4c756064d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 8 Jun 2020 16:48:43 +1000 Subject: crypto: af_alg - fix use-after-free in af_alg_accept() due to bh_lock_sock() The locking in af_alg_release_parent is broken as the BH socket lock can only be taken if there is a code-path to handle the case where the lock is owned by process-context. Instead of adding such handling, we can fix this by changing the ref counts to atomic_t. This patch also modifies the main refcnt to include both normal and nokey sockets. This way we don't have to fudge the nokey ref count when a socket changes from nokey to normal. Credits go to Mauricio Faria de Oliveira who diagnosed this bug and sent a patch for it: https://lore.kernel.org/linux-crypto/20200605161657.535043-1-mfo@canonical.com/ Reported-by: Brian Moyles Reported-by: Mauricio Faria de Oliveira Fixes: 37f96694cf73 ("crypto: af_alg - Use bh_lock_sock in...") Cc: Signed-off-by: Herbert Xu --- crypto/af_alg.c | 26 +++++++++++--------------- crypto/algif_aead.c | 9 +++------ crypto/algif_hash.c | 9 +++------ crypto/algif_skcipher.c | 9 +++------ include/crypto/if_alg.h | 4 ++-- 5 files changed, 22 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index b1cd3535c525..28fc323e3fe3 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -128,21 +128,15 @@ EXPORT_SYMBOL_GPL(af_alg_release); void af_alg_release_parent(struct sock *sk) { struct alg_sock *ask = alg_sk(sk); - unsigned int nokey = ask->nokey_refcnt; - bool last = nokey && !ask->refcnt; + unsigned int nokey = atomic_read(&ask->nokey_refcnt); sk = ask->parent; ask = alg_sk(sk); - local_bh_disable(); - bh_lock_sock(sk); - ask->nokey_refcnt -= nokey; - if (!last) - last = !--ask->refcnt; - bh_unlock_sock(sk); - local_bh_enable(); + if (nokey) + atomic_dec(&ask->nokey_refcnt); - if (last) + if (atomic_dec_and_test(&ask->refcnt)) sock_put(sk); } EXPORT_SYMBOL_GPL(af_alg_release_parent); @@ -187,7 +181,7 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = -EBUSY; lock_sock(sk); - if (ask->refcnt | ask->nokey_refcnt) + if (atomic_read(&ask->refcnt)) goto unlock; swap(ask->type, type); @@ -236,7 +230,7 @@ static int alg_setsockopt(struct socket *sock, int level, int optname, int err = -EBUSY; lock_sock(sk); - if (ask->refcnt) + if (atomic_read(&ask->refcnt) != atomic_read(&ask->nokey_refcnt)) goto unlock; type = ask->type; @@ -301,12 +295,14 @@ int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern) if (err) goto unlock; - if (nokey || !ask->refcnt++) + if (atomic_inc_return_relaxed(&ask->refcnt) == 1) sock_hold(sk); - ask->nokey_refcnt += nokey; + if (nokey) { + atomic_inc(&ask->nokey_refcnt); + atomic_set(&alg_sk(sk2)->nokey_refcnt, 1); + } alg_sk(sk2)->parent = sk; alg_sk(sk2)->type = type; - alg_sk(sk2)->nokey_refcnt = nokey; newsock->ops = type->ops; newsock->state = SS_CONNECTED; diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c index eb1910b6d434..0ae000a61c7f 100644 --- a/crypto/algif_aead.c +++ b/crypto/algif_aead.c @@ -384,7 +384,7 @@ static int aead_check_key(struct socket *sock) struct alg_sock *ask = alg_sk(sk); lock_sock(sk); - if (ask->refcnt) + if (!atomic_read(&ask->nokey_refcnt)) goto unlock_child; psk = ask->parent; @@ -396,11 +396,8 @@ static int aead_check_key(struct socket *sock) if (crypto_aead_get_flags(tfm->aead) & CRYPTO_TFM_NEED_KEY) goto unlock; - if (!pask->refcnt++) - sock_hold(psk); - - ask->refcnt = 1; - sock_put(psk); + atomic_dec(&pask->nokey_refcnt); + atomic_set(&ask->nokey_refcnt, 0); err = 0; diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index da1ffa4f7f8d..e71727c25a7d 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -301,7 +301,7 @@ static int hash_check_key(struct socket *sock) struct alg_sock *ask = alg_sk(sk); lock_sock(sk); - if (ask->refcnt) + if (!atomic_read(&ask->nokey_refcnt)) goto unlock_child; psk = ask->parent; @@ -313,11 +313,8 @@ static int hash_check_key(struct socket *sock) if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) goto unlock; - if (!pask->refcnt++) - sock_hold(psk); - - ask->refcnt = 1; - sock_put(psk); + atomic_dec(&pask->nokey_refcnt); + atomic_set(&ask->nokey_refcnt, 0); err = 0; diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 4c3bdffe0c3a..ec5567c87a6d 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -211,7 +211,7 @@ static int skcipher_check_key(struct socket *sock) struct alg_sock *ask = alg_sk(sk); lock_sock(sk); - if (ask->refcnt) + if (!atomic_read(&ask->nokey_refcnt)) goto unlock_child; psk = ask->parent; @@ -223,11 +223,8 @@ static int skcipher_check_key(struct socket *sock) if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) goto unlock; - if (!pask->refcnt++) - sock_hold(psk); - - ask->refcnt = 1; - sock_put(psk); + atomic_dec(&pask->nokey_refcnt); + atomic_set(&ask->nokey_refcnt, 0); err = 0; diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 56527c85d122..088c1ded2714 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -29,8 +29,8 @@ struct alg_sock { struct sock *parent; - unsigned int refcnt; - unsigned int nokey_refcnt; + atomic_t refcnt; + atomic_t nokey_refcnt; const struct af_alg_type *type; void *private; -- cgit v1.2.3 From f097eb38f71391ff2cf078788bad5a00eb3bd96a Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 9 Jun 2020 10:17:26 +0200 Subject: timekeeping: Fix kerneldoc system_device_crosststamp & al Make kernel doc comments actually work and fix the syncronized typo. [ tglx: Added the missing /** and fixed up formatting ] Signed-off-by: Kurt Kanzenbach Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200609081726.5657-1-kurt@linutronix.de --- include/linux/timekeeping.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index b27e2ffa96c1..d5471d6fa778 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -222,9 +222,9 @@ extern bool timekeeping_rtc_skipresume(void); extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta); -/* +/** * struct system_time_snapshot - simultaneous raw/real time capture with - * counter value + * counter value * @cycles: Clocksource counter value to produce the system times * @real: Realtime system time * @raw: Monotonic raw system time @@ -239,9 +239,9 @@ struct system_time_snapshot { u8 cs_was_changed_seq; }; -/* +/** * struct system_device_crosststamp - system/device cross-timestamp - * (syncronized capture) + * (synchronized capture) * @device: Device time * @sys_realtime: Realtime simultaneous with device time * @sys_monoraw: Monotonic raw simultaneous with device time @@ -252,12 +252,12 @@ struct system_device_crosststamp { ktime_t sys_monoraw; }; -/* +/** * struct system_counterval_t - system counter value with the pointer to the - * corresponding clocksource + * corresponding clocksource * @cycles: System counter value * @cs: Clocksource corresponding to system counter value. Used by - * timekeeping code to verify comparibility of two cycle values + * timekeeping code to verify comparibility of two cycle values */ struct system_counterval_t { u64 cycles; -- cgit v1.2.3 From f751820bc319a30d31a80fe511d88642aaefcdee Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 18 Jun 2020 13:07:13 -0600 Subject: vfio/type1: Fix migration info capability ID ID 1 is already used by the IOVA range capability, use ID 2. Reported-by: Liu Yi L Fixes: ad721705d09c ("vfio iommu: Add migration capability to report supported features") Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index eca6692667a3..920470502329 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1030,7 +1030,7 @@ struct vfio_iommu_type1_info_cap_iova_range { * size in bytes that can be used by user applications when getting the dirty * bitmap. */ -#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 1 +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION 2 struct vfio_iommu_type1_info_cap_migration { struct vfio_info_cap_header header; -- cgit v1.2.3 From 4bc799dcb67066e0531004d5bdbe755bb02b5488 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 15 Jun 2020 11:12:32 -0700 Subject: security: fix the key_permission LSM hook function type Commit 8c0637e950d6 ("keys: Make the KEY_NEED_* perms an enum rather than a mask") changed the type of the key_permission callback functions, but didn't change the type of the hook, which trips indirect call checking with Control-Flow Integrity (CFI). This change fixes the issue by changing the hook type to match the functions. Fixes: 8c0637e950d6 ("keys: Make the KEY_NEED_* perms an enum rather than a mask") Signed-off-by: Sami Tolvanen Acked-by: Kees Cook Signed-off-by: James Morris --- include/linux/lsm_hook_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 6791813cd439..24f6683f1cfc 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -360,7 +360,7 @@ LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred, unsigned long flags) LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key) LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred, - unsigned perm) + enum key_need_perm need_perm) LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **_buffer) #endif /* CONFIG_KEYS */ -- cgit v1.2.3 From 16ecf10e815d70d11d2300243f4a3b4c7c5acac7 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 23 Jun 2020 07:13:41 +0800 Subject: iommu/vt-d: Set U/S bit in first level page table by default When using first-level translation for IOVA, currently the U/S bit in the page table is cleared which implies DMA requests with user privilege are blocked. As the result, following error messages might be observed when passing through a device to user level: DMAR: DRHD: handling fault status reg 3 DMAR: [DMA Read] Request device [41:00.0] PASID 1 fault addr 7ecdcd000 [fault reason 129] SM: U/S set 0 for first-level translation with user privilege This fixes it by setting U/S bit in the first level page table and makes IOVA over first level compatible with previous second-level translation. Fixes: b802d070a52a1 ("iommu/vt-d: Use iova over first level") Reported-by: Xin Zeng Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20200622231345.29722-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 5 ++--- include/linux/intel-iommu.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9129663a7406..0fa394f7bbf9 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -921,7 +921,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE); pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE; if (domain_use_first_level(domain)) - pteval |= DMA_FL_PTE_XD; + pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US; if (cmpxchg64(&pte->val, 0ULL, pteval)) /* Someone else set it while we were thinking; use theirs. */ free_pgtable_page(tmp_page); @@ -1951,7 +1951,6 @@ static inline void context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid) { context->hi |= pasid & ((1 << 20) - 1); - context->hi |= (1 << 20); } /* @@ -2243,7 +2242,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); if (domain_use_first_level(domain)) - attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD; + attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US; if (!sg) { sg_res = nr_pages; diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4100bd224f5c..3e8fa1c7a1e6 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -41,6 +41,7 @@ #define DMA_PTE_SNP BIT_ULL(11) #define DMA_FL_PTE_PRESENT BIT_ULL(0) +#define DMA_FL_PTE_US BIT_ULL(2) #define DMA_FL_PTE_XD BIT_ULL(63) #define ADDR_WIDTH_5LEVEL (57) -- cgit v1.2.3 From 23e390cdbe6f85827a43d38f9288dcd3066fa376 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Mon, 22 Jun 2020 00:21:35 +0200 Subject: security: Fix hook iteration and default value for inode_copy_up_xattr inode_copy_up_xattr returns 0 to indicate the acceptance of the xattr and 1 to reject it. If the LSM does not know about the xattr, it's expected to return -EOPNOTSUPP, which is the correct default value for this hook. BPF LSM, currently, uses 0 as the default value and thereby falsely allows all overlay fs xattributes to be copied up. The iteration logic is also updated from the "bail-on-fail" call_int_hook to continue on the non-decisive -EOPNOTSUPP and bail out on other values. Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks") Signed-off-by: KP Singh Signed-off-by: James Morris --- include/linux/lsm_hook_defs.h | 2 +- security/security.c | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 24f6683f1cfc..af998f93d256 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -150,7 +150,7 @@ LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer, size_t buffer_size) LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid) LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new) -LSM_HOOK(int, 0, inode_copy_up_xattr, const char *name) +LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, const char *name) LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir, struct kernfs_node *kn) LSM_HOOK(int, 0, file_permission, struct file *file, int mask) diff --git a/security/security.c b/security/security.c index 0ce3e73edd42..70a7ad357bc6 100644 --- a/security/security.c +++ b/security/security.c @@ -1414,7 +1414,22 @@ EXPORT_SYMBOL(security_inode_copy_up); int security_inode_copy_up_xattr(const char *name) { - return call_int_hook(inode_copy_up_xattr, -EOPNOTSUPP, name); + struct security_hook_list *hp; + int rc; + + /* + * The implementation can return 0 (accept the xattr), 1 (discard the + * xattr), -EOPNOTSUPP if it does not know anything about the xattr or + * any other error code incase of an error. + */ + hlist_for_each_entry(hp, + &security_hook_heads.inode_copy_up_xattr, list) { + rc = hp->hook.inode_copy_up_xattr(name); + if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr)) + return rc; + } + + return LSM_RET_DEFAULT(inode_copy_up_xattr); } EXPORT_SYMBOL(security_inode_copy_up_xattr); -- cgit v1.2.3 From bcc7f554cfa7e0ac77c7adc4027c16f4a2f99c6f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 23 Jun 2020 16:39:35 +0100 Subject: bpf: Fix formatting in documentation for BPF helpers When producing the bpf-helpers.7 man page from the documentation from the BPF user space header file, rst2man complains: :2636: (ERROR/3) Unexpected indentation. :2640: (WARNING/2) Block quote ends without a blank line; unexpected unindent. Let's fix formatting for the relevant chunk (item list in bpf_ringbuf_query()'s description), and for a couple other functions. Signed-off-by: Quentin Monnet Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200623153935.6215-1-quentin@isovalent.com --- include/uapi/linux/bpf.h | 41 +++++++++++++++++++++-------------------- tools/include/uapi/linux/bpf.h | 41 +++++++++++++++++++++-------------------- 2 files changed, 42 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 974a71342aea..8bd33050b7bb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3171,13 +3171,12 @@ union bpf_attr { * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return - * 0, on success; - * < 0, on error. + * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description @@ -3189,20 +3188,20 @@ union bpf_attr { * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * @@ -3210,16 +3209,18 @@ union bpf_attr { * Description * Query various characteristics of provided ring buffer. What * exactly is queries is determined by *flags*: - * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; - * - BPF_RB_RING_SIZE - the size of ring buffer; - * - BPF_RB_CONS_POS - consumer position (can wrap around); - * - BPF_RB_PROD_POS - producer(s) position (can wrap around); - * Data returned is just a momentary snapshots of actual values + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. * Return - * Requested value, or 0, if flags are not recognized. + * Requested value, or 0, if *flags* are not recognized. * * int bpf_csum_level(struct sk_buff *skb, u64 level) * Description diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974a71342aea..8bd33050b7bb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3171,13 +3171,12 @@ union bpf_attr { * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return - * 0, on success; - * < 0, on error. + * 0 on success, or a negative error in case of failure. * * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) * Description @@ -3189,20 +3188,20 @@ union bpf_attr { * void bpf_ringbuf_submit(void *data, u64 flags) * Description * Submit reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * * void bpf_ringbuf_discard(void *data, u64 flags) * Description * Discard reserved ring buffer sample, pointed to by *data*. - * If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of - * new data availability is sent. - * IF BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of - * new data availability is sent unconditionally. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. * Return * Nothing. Always succeeds. * @@ -3210,16 +3209,18 @@ union bpf_attr { * Description * Query various characteristics of provided ring buffer. What * exactly is queries is determined by *flags*: - * - BPF_RB_AVAIL_DATA - amount of data not yet consumed; - * - BPF_RB_RING_SIZE - the size of ring buffer; - * - BPF_RB_CONS_POS - consumer position (can wrap around); - * - BPF_RB_PROD_POS - producer(s) position (can wrap around); - * Data returned is just a momentary snapshots of actual values + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values * and could be inaccurate, so this facility should be used to * power heuristics and for reporting, not to make 100% correct * calculation. * Return - * Requested value, or 0, if flags are not recognized. + * Requested value, or 0, if *flags* are not recognized. * * int bpf_csum_level(struct sk_buff *skb, u64 level) * Description -- cgit v1.2.3 From aad4b4d15f30de087c5972cfb767fadb5dbc3c52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 Jun 2020 09:13:02 +0200 Subject: scsi: libata: Fix the ata_scsi_dma_need_drain stub We not only need the stub when libata is disabled, but also if it is modular and there are built-in SAS drivers (which can happen when SCSI_SAS_ATA is disabled). Link: https://lore.kernel.org/r/20200620071302.462974-2-hch@lst.de Fixes: b8f1d1e05817 ("scsi: Wire up ata_scsi_dma_need_drain for SAS HBA drivers") Signed-off-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- include/linux/libata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 042e584daca7..c57bf6749681 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1092,7 +1092,7 @@ extern int ata_scsi_ioctl(struct scsi_device *dev, unsigned int cmd, #define ATA_SCSI_COMPAT_IOCTL /* empty */ #endif extern int ata_scsi_queuecmd(struct Scsi_Host *h, struct scsi_cmnd *cmd); -#if IS_ENABLED(CONFIG_ATA) +#if IS_REACHABLE(CONFIG_ATA) bool ata_scsi_dma_need_drain(struct request *rq); #else #define ata_scsi_dma_need_drain NULL -- cgit v1.2.3 From dc5bdb68b5b369d5bc7d1de96fa64cc1737a6320 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 24 Jun 2020 11:29:10 +0200 Subject: drm/fb-helper: Fix vt restore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the past we had a pile of hacks to orchestrate access between fbdev emulation and native kms clients. We've tried to streamline this, by always preferring the kms side above fbdev calls when a drm master exists, because drm master controls access to the display resources. Unfortunately this breaks existing userspace, specifically Xorg. When exiting Xorg first restores the console to text mode using the KDSET ioctl on the vt. This does nothing, because a drm master is still around. Then it drops the drm master status, which again does nothing, because logind is keeping additional drm fd open to be able to orchestrate vt switches. In the past this is the point where fbdev was restored, as part of the ->lastclose hook on the drm side. Now to fix this regression we don't want to go back to letting fbdev restore things whenever it feels like, or to the pile of hacks we've had before. Instead try and go with a minimal exception to make the KDSET case work again, and nothing else. This means that if userspace does a KDSET call when switching between graphical compositors, there will be some flickering with fbcon showing up for a bit. But a) that's not a regression and b) userspace can fix it by improving the vt switching dance - logind should have all the information it needs. While pondering all this I'm also wondering wheter we should have a SWITCH_MASTER ioctl to allow race-free master status handover. But that's for another day. v2: Somehow forgot to cc all the fbdev people. v3: Fix typo Alex spotted. Reviewed-by: Alex Deucher Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=208179 Cc: shlomo@fastmail.com Reported-and-Tested-by: shlomo@fastmail.com Cc: Michel Dänzer Fixes: 64914da24ea9 ("drm/fbdev-helper: don't force restores") Cc: Noralf Trønnes Cc: Thomas Zimmermann Cc: Daniel Vetter Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: David Airlie Cc: Daniel Vetter Cc: dri-devel@lists.freedesktop.org Cc: # v5.7+ Cc: Bartlomiej Zolnierkiewicz Cc: Geert Uytterhoeven Cc: Nathan Chancellor Cc: Qiujun Huang Cc: Peter Rosin Cc: linux-fbdev@vger.kernel.org Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20200624092910.3280448-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/drm_fb_helper.c | 63 +++++++++++++++++++++++++++++++--------- drivers/video/fbdev/core/fbcon.c | 3 +- include/uapi/linux/fb.h | 1 + 3 files changed, 52 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index 170aa7689110..5609e164805f 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -227,18 +227,9 @@ int drm_fb_helper_debug_leave(struct fb_info *info) } EXPORT_SYMBOL(drm_fb_helper_debug_leave); -/** - * drm_fb_helper_restore_fbdev_mode_unlocked - restore fbdev configuration - * @fb_helper: driver-allocated fbdev helper, can be NULL - * - * This should be called from driver's drm &drm_driver.lastclose callback - * when implementing an fbcon on top of kms using this helper. This ensures that - * the user isn't greeted with a black screen when e.g. X dies. - * - * RETURNS: - * Zero if everything went ok, negative error code otherwise. - */ -int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) +static int +__drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper, + bool force) { bool do_delayed; int ret; @@ -250,7 +241,16 @@ int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) return 0; mutex_lock(&fb_helper->lock); - ret = drm_client_modeset_commit(&fb_helper->client); + if (force) { + /* + * Yes this is the _locked version which expects the master lock + * to be held. But for forced restores we're intentionally + * racing here, see drm_fb_helper_set_par(). + */ + ret = drm_client_modeset_commit_locked(&fb_helper->client); + } else { + ret = drm_client_modeset_commit(&fb_helper->client); + } do_delayed = fb_helper->delayed_hotplug; if (do_delayed) @@ -262,6 +262,22 @@ int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) return ret; } + +/** + * drm_fb_helper_restore_fbdev_mode_unlocked - restore fbdev configuration + * @fb_helper: driver-allocated fbdev helper, can be NULL + * + * This should be called from driver's drm &drm_driver.lastclose callback + * when implementing an fbcon on top of kms using this helper. This ensures that + * the user isn't greeted with a black screen when e.g. X dies. + * + * RETURNS: + * Zero if everything went ok, negative error code otherwise. + */ +int drm_fb_helper_restore_fbdev_mode_unlocked(struct drm_fb_helper *fb_helper) +{ + return __drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper, false); +} EXPORT_SYMBOL(drm_fb_helper_restore_fbdev_mode_unlocked); #ifdef CONFIG_MAGIC_SYSRQ @@ -1318,6 +1334,7 @@ int drm_fb_helper_set_par(struct fb_info *info) { struct drm_fb_helper *fb_helper = info->par; struct fb_var_screeninfo *var = &info->var; + bool force; if (oops_in_progress) return -EBUSY; @@ -1327,7 +1344,25 @@ int drm_fb_helper_set_par(struct fb_info *info) return -EINVAL; } - drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper); + /* + * Normally we want to make sure that a kms master takes precedence over + * fbdev, to avoid fbdev flickering and occasionally stealing the + * display status. But Xorg first sets the vt back to text mode using + * the KDSET IOCTL with KD_TEXT, and only after that drops the master + * status when exiting. + * + * In the past this was caught by drm_fb_helper_lastclose(), but on + * modern systems where logind always keeps a drm fd open to orchestrate + * the vt switching, this doesn't work. + * + * To not break the userspace ABI we have this special case here, which + * is only used for the above case. Everything else uses the normal + * commit function, which ensures that we never steal the display from + * an active drm master. + */ + force = var->activate & FB_ACTIVATE_KD_TEXT; + + __drm_fb_helper_restore_fbdev_mode_unlocked(fb_helper, force); return 0; } diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 9d28a8e3328f..e2a490c5ae08 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2402,7 +2402,8 @@ static int fbcon_blank(struct vc_data *vc, int blank, int mode_switch) ops->graphics = 1; if (!blank) { - var.activate = FB_ACTIVATE_NOW | FB_ACTIVATE_FORCE; + var.activate = FB_ACTIVATE_NOW | FB_ACTIVATE_FORCE | + FB_ACTIVATE_KD_TEXT; fb_set_var(info, &var); ops->graphics = 0; ops->var = info->var; diff --git a/include/uapi/linux/fb.h b/include/uapi/linux/fb.h index b6aac7ee1f67..4c14e8be7267 100644 --- a/include/uapi/linux/fb.h +++ b/include/uapi/linux/fb.h @@ -205,6 +205,7 @@ struct fb_bitfield { #define FB_ACTIVATE_ALL 64 /* change all VCs on this fb */ #define FB_ACTIVATE_FORCE 128 /* force apply even when no change*/ #define FB_ACTIVATE_INV_MODE 256 /* invalidate videomode */ +#define FB_ACTIVATE_KD_TEXT 512 /* for KDSET vt ioctl */ #define FB_ACCELF_TEXT 1 /* (OBSOLETE) see fb_info.flags and vc_mode */ -- cgit v1.2.3 From 0c1a7f13c9ec1ceb18d97ef4b1dd20ec71ffba31 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Wed, 17 Jun 2020 17:01:32 +0530 Subject: ieee80211: Add missing and new AKM suite selector definitions Add the definitions for missing AKM selectors defined in IEEE P802.11-REVmd/D3.0, table 9-151. These definitions will be used by various drivers that support these new AKM suites. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20200617113132.13477-1-vjakkam@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index fe15f831841b..9f732499ea88 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -3333,13 +3333,17 @@ struct ieee80211_multiple_bssid_configuration { #define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) #define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) #define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) +#define WLAN_AKM_SUITE_AP_PEER_KEY SUITE(0x000FAC, 10) #define WLAN_AKM_SUITE_8021X_SUITE_B SUITE(0x000FAC, 11) #define WLAN_AKM_SUITE_8021X_SUITE_B_192 SUITE(0x000FAC, 12) +#define WLAN_AKM_SUITE_FT_8021X_SHA384 SUITE(0x000FAC, 13) #define WLAN_AKM_SUITE_FILS_SHA256 SUITE(0x000FAC, 14) #define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) #define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) #define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) #define WLAN_AKM_SUITE_OWE SUITE(0x000FAC, 18) +#define WLAN_AKM_SUITE_FT_PSK_SHA384 SUITE(0x000FAC, 19) +#define WLAN_AKM_SUITE_PSK_SHA384 SUITE(0x000FAC, 20) #define WLAN_MAX_KEY_LEN 32 -- cgit v1.2.3 From a9b59159d338d414acaa8e2f569d129d51c76452 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 24 Jun 2020 15:20:39 -0700 Subject: bpf: Do not allow btf_ctx_access with __int128 types To ensure btf_ctx_access() is safe the verifier checks that the BTF arg type is an int, enum, or pointer. When the function does the BTF arg lookup it uses the calculation 'arg = off / 8' using the fact that registers are 8B. This requires that the first arg is in the first reg, the second in the second, and so on. However, for __int128 the arg will consume two registers by default LLVM implementation. So this will cause the arg layout assumed by the 'arg = off / 8' calculation to be incorrect. Because __int128 is uncommon this patch applies the easiest fix and will force int types to be sizeof(u64) or smaller so that they will fit in a single register. v2: remove unneeded parens per Andrii's feedback Fixes: 9e15db66136a1 ("bpf: Implement accurate raw_tp context access via BTF") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159303723962.11287.13309537171132420717.stgit@john-Precision-5820-Tower --- include/linux/btf.h | 5 +++++ kernel/bpf/btf.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 5c1ea99b480f..8b81fbb4497c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -82,6 +82,11 @@ static inline bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static inline bool btf_type_is_small_int(const struct btf_type *t) +{ + return btf_type_is_int(t) && t->size <= sizeof(u64); +} + static inline bool btf_type_is_enum(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 58c9af1d4808..9a1a98dd9e97 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3746,7 +3746,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; t = btf_type_skip_modifiers(btf, t->type, NULL); - if (!btf_type_is_int(t)) { + if (!btf_type_is_small_int(t)) { bpf_log(log, "ret type %s not allowed for fmod_ret\n", btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -3768,7 +3768,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_int(t) || btf_type_is_enum(t)) + if (btf_type_is_small_int(t) || btf_type_is_enum(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { -- cgit v1.2.3 From 5faafd5685764e4d75376aceac91fdf75b3b16f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jun 2020 15:55:14 +0200 Subject: locking/atomics: Provide the arch_atomic_ interface to generic code Architectures with instrumented (KASAN/KCSAN) atomic operations natively provide arch_atomic_ variants that are not instrumented. It turns out that some generic code also requires arch_atomic_ in order to avoid instrumentation, so provide the arch_atomic_ interface as a direct map into the regular atomic_ interface for non-instrumented architectures. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- include/linux/atomic-fallback.h | 236 +++++++++++++++++++++++++++++++++- scripts/atomic/gen-atomic-fallback.sh | 31 +++++ 2 files changed, 266 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/atomic-fallback.h b/include/linux/atomic-fallback.h index 2c4927bf7b8d..fd525c71d676 100644 --- a/include/linux/atomic-fallback.h +++ b/include/linux/atomic-fallback.h @@ -77,6 +77,9 @@ #endif /* cmpxchg64_relaxed */ +#define arch_atomic_read atomic_read +#define arch_atomic_read_acquire atomic_read_acquire + #ifndef atomic_read_acquire static __always_inline int atomic_read_acquire(const atomic_t *v) @@ -86,6 +89,9 @@ atomic_read_acquire(const atomic_t *v) #define atomic_read_acquire atomic_read_acquire #endif +#define arch_atomic_set atomic_set +#define arch_atomic_set_release atomic_set_release + #ifndef atomic_set_release static __always_inline void atomic_set_release(atomic_t *v, int i) @@ -95,6 +101,13 @@ atomic_set_release(atomic_t *v, int i) #define atomic_set_release atomic_set_release #endif +#define arch_atomic_add atomic_add + +#define arch_atomic_add_return atomic_add_return +#define arch_atomic_add_return_acquire atomic_add_return_acquire +#define arch_atomic_add_return_release atomic_add_return_release +#define arch_atomic_add_return_relaxed atomic_add_return_relaxed + #ifndef atomic_add_return_relaxed #define atomic_add_return_acquire atomic_add_return #define atomic_add_return_release atomic_add_return @@ -137,6 +150,11 @@ atomic_add_return(int i, atomic_t *v) #endif /* atomic_add_return_relaxed */ +#define arch_atomic_fetch_add atomic_fetch_add +#define arch_atomic_fetch_add_acquire atomic_fetch_add_acquire +#define arch_atomic_fetch_add_release atomic_fetch_add_release +#define arch_atomic_fetch_add_relaxed atomic_fetch_add_relaxed + #ifndef atomic_fetch_add_relaxed #define atomic_fetch_add_acquire atomic_fetch_add #define atomic_fetch_add_release atomic_fetch_add @@ -179,6 +197,13 @@ atomic_fetch_add(int i, atomic_t *v) #endif /* atomic_fetch_add_relaxed */ +#define arch_atomic_sub atomic_sub + +#define arch_atomic_sub_return atomic_sub_return +#define arch_atomic_sub_return_acquire atomic_sub_return_acquire +#define arch_atomic_sub_return_release atomic_sub_return_release +#define arch_atomic_sub_return_relaxed atomic_sub_return_relaxed + #ifndef atomic_sub_return_relaxed #define atomic_sub_return_acquire atomic_sub_return #define atomic_sub_return_release atomic_sub_return @@ -221,6 +246,11 @@ atomic_sub_return(int i, atomic_t *v) #endif /* atomic_sub_return_relaxed */ +#define arch_atomic_fetch_sub atomic_fetch_sub +#define arch_atomic_fetch_sub_acquire atomic_fetch_sub_acquire +#define arch_atomic_fetch_sub_release atomic_fetch_sub_release +#define arch_atomic_fetch_sub_relaxed atomic_fetch_sub_relaxed + #ifndef atomic_fetch_sub_relaxed #define atomic_fetch_sub_acquire atomic_fetch_sub #define atomic_fetch_sub_release atomic_fetch_sub @@ -263,6 +293,8 @@ atomic_fetch_sub(int i, atomic_t *v) #endif /* atomic_fetch_sub_relaxed */ +#define arch_atomic_inc atomic_inc + #ifndef atomic_inc static __always_inline void atomic_inc(atomic_t *v) @@ -272,6 +304,11 @@ atomic_inc(atomic_t *v) #define atomic_inc atomic_inc #endif +#define arch_atomic_inc_return atomic_inc_return +#define arch_atomic_inc_return_acquire atomic_inc_return_acquire +#define arch_atomic_inc_return_release atomic_inc_return_release +#define arch_atomic_inc_return_relaxed atomic_inc_return_relaxed + #ifndef atomic_inc_return_relaxed #ifdef atomic_inc_return #define atomic_inc_return_acquire atomic_inc_return @@ -353,6 +390,11 @@ atomic_inc_return(atomic_t *v) #endif /* atomic_inc_return_relaxed */ +#define arch_atomic_fetch_inc atomic_fetch_inc +#define arch_atomic_fetch_inc_acquire atomic_fetch_inc_acquire +#define arch_atomic_fetch_inc_release atomic_fetch_inc_release +#define arch_atomic_fetch_inc_relaxed atomic_fetch_inc_relaxed + #ifndef atomic_fetch_inc_relaxed #ifdef atomic_fetch_inc #define atomic_fetch_inc_acquire atomic_fetch_inc @@ -434,6 +476,8 @@ atomic_fetch_inc(atomic_t *v) #endif /* atomic_fetch_inc_relaxed */ +#define arch_atomic_dec atomic_dec + #ifndef atomic_dec static __always_inline void atomic_dec(atomic_t *v) @@ -443,6 +487,11 @@ atomic_dec(atomic_t *v) #define atomic_dec atomic_dec #endif +#define arch_atomic_dec_return atomic_dec_return +#define arch_atomic_dec_return_acquire atomic_dec_return_acquire +#define arch_atomic_dec_return_release atomic_dec_return_release +#define arch_atomic_dec_return_relaxed atomic_dec_return_relaxed + #ifndef atomic_dec_return_relaxed #ifdef atomic_dec_return #define atomic_dec_return_acquire atomic_dec_return @@ -524,6 +573,11 @@ atomic_dec_return(atomic_t *v) #endif /* atomic_dec_return_relaxed */ +#define arch_atomic_fetch_dec atomic_fetch_dec +#define arch_atomic_fetch_dec_acquire atomic_fetch_dec_acquire +#define arch_atomic_fetch_dec_release atomic_fetch_dec_release +#define arch_atomic_fetch_dec_relaxed atomic_fetch_dec_relaxed + #ifndef atomic_fetch_dec_relaxed #ifdef atomic_fetch_dec #define atomic_fetch_dec_acquire atomic_fetch_dec @@ -605,6 +659,13 @@ atomic_fetch_dec(atomic_t *v) #endif /* atomic_fetch_dec_relaxed */ +#define arch_atomic_and atomic_and + +#define arch_atomic_fetch_and atomic_fetch_and +#define arch_atomic_fetch_and_acquire atomic_fetch_and_acquire +#define arch_atomic_fetch_and_release atomic_fetch_and_release +#define arch_atomic_fetch_and_relaxed atomic_fetch_and_relaxed + #ifndef atomic_fetch_and_relaxed #define atomic_fetch_and_acquire atomic_fetch_and #define atomic_fetch_and_release atomic_fetch_and @@ -647,6 +708,8 @@ atomic_fetch_and(int i, atomic_t *v) #endif /* atomic_fetch_and_relaxed */ +#define arch_atomic_andnot atomic_andnot + #ifndef atomic_andnot static __always_inline void atomic_andnot(int i, atomic_t *v) @@ -656,6 +719,11 @@ atomic_andnot(int i, atomic_t *v) #define atomic_andnot atomic_andnot #endif +#define arch_atomic_fetch_andnot atomic_fetch_andnot +#define arch_atomic_fetch_andnot_acquire atomic_fetch_andnot_acquire +#define arch_atomic_fetch_andnot_release atomic_fetch_andnot_release +#define arch_atomic_fetch_andnot_relaxed atomic_fetch_andnot_relaxed + #ifndef atomic_fetch_andnot_relaxed #ifdef atomic_fetch_andnot #define atomic_fetch_andnot_acquire atomic_fetch_andnot @@ -737,6 +805,13 @@ atomic_fetch_andnot(int i, atomic_t *v) #endif /* atomic_fetch_andnot_relaxed */ +#define arch_atomic_or atomic_or + +#define arch_atomic_fetch_or atomic_fetch_or +#define arch_atomic_fetch_or_acquire atomic_fetch_or_acquire +#define arch_atomic_fetch_or_release atomic_fetch_or_release +#define arch_atomic_fetch_or_relaxed atomic_fetch_or_relaxed + #ifndef atomic_fetch_or_relaxed #define atomic_fetch_or_acquire atomic_fetch_or #define atomic_fetch_or_release atomic_fetch_or @@ -779,6 +854,13 @@ atomic_fetch_or(int i, atomic_t *v) #endif /* atomic_fetch_or_relaxed */ +#define arch_atomic_xor atomic_xor + +#define arch_atomic_fetch_xor atomic_fetch_xor +#define arch_atomic_fetch_xor_acquire atomic_fetch_xor_acquire +#define arch_atomic_fetch_xor_release atomic_fetch_xor_release +#define arch_atomic_fetch_xor_relaxed atomic_fetch_xor_relaxed + #ifndef atomic_fetch_xor_relaxed #define atomic_fetch_xor_acquire atomic_fetch_xor #define atomic_fetch_xor_release atomic_fetch_xor @@ -821,6 +903,11 @@ atomic_fetch_xor(int i, atomic_t *v) #endif /* atomic_fetch_xor_relaxed */ +#define arch_atomic_xchg atomic_xchg +#define arch_atomic_xchg_acquire atomic_xchg_acquire +#define arch_atomic_xchg_release atomic_xchg_release +#define arch_atomic_xchg_relaxed atomic_xchg_relaxed + #ifndef atomic_xchg_relaxed #define atomic_xchg_acquire atomic_xchg #define atomic_xchg_release atomic_xchg @@ -863,6 +950,11 @@ atomic_xchg(atomic_t *v, int i) #endif /* atomic_xchg_relaxed */ +#define arch_atomic_cmpxchg atomic_cmpxchg +#define arch_atomic_cmpxchg_acquire atomic_cmpxchg_acquire +#define arch_atomic_cmpxchg_release atomic_cmpxchg_release +#define arch_atomic_cmpxchg_relaxed atomic_cmpxchg_relaxed + #ifndef atomic_cmpxchg_relaxed #define atomic_cmpxchg_acquire atomic_cmpxchg #define atomic_cmpxchg_release atomic_cmpxchg @@ -905,6 +997,11 @@ atomic_cmpxchg(atomic_t *v, int old, int new) #endif /* atomic_cmpxchg_relaxed */ +#define arch_atomic_try_cmpxchg atomic_try_cmpxchg +#define arch_atomic_try_cmpxchg_acquire atomic_try_cmpxchg_acquire +#define arch_atomic_try_cmpxchg_release atomic_try_cmpxchg_release +#define arch_atomic_try_cmpxchg_relaxed atomic_try_cmpxchg_relaxed + #ifndef atomic_try_cmpxchg_relaxed #ifdef atomic_try_cmpxchg #define atomic_try_cmpxchg_acquire atomic_try_cmpxchg @@ -1002,6 +1099,8 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) #endif /* atomic_try_cmpxchg_relaxed */ +#define arch_atomic_sub_and_test atomic_sub_and_test + #ifndef atomic_sub_and_test /** * atomic_sub_and_test - subtract value from variable and test result @@ -1020,6 +1119,8 @@ atomic_sub_and_test(int i, atomic_t *v) #define atomic_sub_and_test atomic_sub_and_test #endif +#define arch_atomic_dec_and_test atomic_dec_and_test + #ifndef atomic_dec_and_test /** * atomic_dec_and_test - decrement and test @@ -1037,6 +1138,8 @@ atomic_dec_and_test(atomic_t *v) #define atomic_dec_and_test atomic_dec_and_test #endif +#define arch_atomic_inc_and_test atomic_inc_and_test + #ifndef atomic_inc_and_test /** * atomic_inc_and_test - increment and test @@ -1054,6 +1157,8 @@ atomic_inc_and_test(atomic_t *v) #define atomic_inc_and_test atomic_inc_and_test #endif +#define arch_atomic_add_negative atomic_add_negative + #ifndef atomic_add_negative /** * atomic_add_negative - add and test if negative @@ -1072,6 +1177,8 @@ atomic_add_negative(int i, atomic_t *v) #define atomic_add_negative atomic_add_negative #endif +#define arch_atomic_fetch_add_unless atomic_fetch_add_unless + #ifndef atomic_fetch_add_unless /** * atomic_fetch_add_unless - add unless the number is already a given value @@ -1097,6 +1204,8 @@ atomic_fetch_add_unless(atomic_t *v, int a, int u) #define atomic_fetch_add_unless atomic_fetch_add_unless #endif +#define arch_atomic_add_unless atomic_add_unless + #ifndef atomic_add_unless /** * atomic_add_unless - add unless the number is already a given value @@ -1115,6 +1224,8 @@ atomic_add_unless(atomic_t *v, int a, int u) #define atomic_add_unless atomic_add_unless #endif +#define arch_atomic_inc_not_zero atomic_inc_not_zero + #ifndef atomic_inc_not_zero /** * atomic_inc_not_zero - increment unless the number is zero @@ -1131,6 +1242,8 @@ atomic_inc_not_zero(atomic_t *v) #define atomic_inc_not_zero atomic_inc_not_zero #endif +#define arch_atomic_inc_unless_negative atomic_inc_unless_negative + #ifndef atomic_inc_unless_negative static __always_inline bool atomic_inc_unless_negative(atomic_t *v) @@ -1147,6 +1260,8 @@ atomic_inc_unless_negative(atomic_t *v) #define atomic_inc_unless_negative atomic_inc_unless_negative #endif +#define arch_atomic_dec_unless_positive atomic_dec_unless_positive + #ifndef atomic_dec_unless_positive static __always_inline bool atomic_dec_unless_positive(atomic_t *v) @@ -1163,6 +1278,8 @@ atomic_dec_unless_positive(atomic_t *v) #define atomic_dec_unless_positive atomic_dec_unless_positive #endif +#define arch_atomic_dec_if_positive atomic_dec_if_positive + #ifndef atomic_dec_if_positive static __always_inline int atomic_dec_if_positive(atomic_t *v) @@ -1184,6 +1301,9 @@ atomic_dec_if_positive(atomic_t *v) #include #endif +#define arch_atomic64_read atomic64_read +#define arch_atomic64_read_acquire atomic64_read_acquire + #ifndef atomic64_read_acquire static __always_inline s64 atomic64_read_acquire(const atomic64_t *v) @@ -1193,6 +1313,9 @@ atomic64_read_acquire(const atomic64_t *v) #define atomic64_read_acquire atomic64_read_acquire #endif +#define arch_atomic64_set atomic64_set +#define arch_atomic64_set_release atomic64_set_release + #ifndef atomic64_set_release static __always_inline void atomic64_set_release(atomic64_t *v, s64 i) @@ -1202,6 +1325,13 @@ atomic64_set_release(atomic64_t *v, s64 i) #define atomic64_set_release atomic64_set_release #endif +#define arch_atomic64_add atomic64_add + +#define arch_atomic64_add_return atomic64_add_return +#define arch_atomic64_add_return_acquire atomic64_add_return_acquire +#define arch_atomic64_add_return_release atomic64_add_return_release +#define arch_atomic64_add_return_relaxed atomic64_add_return_relaxed + #ifndef atomic64_add_return_relaxed #define atomic64_add_return_acquire atomic64_add_return #define atomic64_add_return_release atomic64_add_return @@ -1244,6 +1374,11 @@ atomic64_add_return(s64 i, atomic64_t *v) #endif /* atomic64_add_return_relaxed */ +#define arch_atomic64_fetch_add atomic64_fetch_add +#define arch_atomic64_fetch_add_acquire atomic64_fetch_add_acquire +#define arch_atomic64_fetch_add_release atomic64_fetch_add_release +#define arch_atomic64_fetch_add_relaxed atomic64_fetch_add_relaxed + #ifndef atomic64_fetch_add_relaxed #define atomic64_fetch_add_acquire atomic64_fetch_add #define atomic64_fetch_add_release atomic64_fetch_add @@ -1286,6 +1421,13 @@ atomic64_fetch_add(s64 i, atomic64_t *v) #endif /* atomic64_fetch_add_relaxed */ +#define arch_atomic64_sub atomic64_sub + +#define arch_atomic64_sub_return atomic64_sub_return +#define arch_atomic64_sub_return_acquire atomic64_sub_return_acquire +#define arch_atomic64_sub_return_release atomic64_sub_return_release +#define arch_atomic64_sub_return_relaxed atomic64_sub_return_relaxed + #ifndef atomic64_sub_return_relaxed #define atomic64_sub_return_acquire atomic64_sub_return #define atomic64_sub_return_release atomic64_sub_return @@ -1328,6 +1470,11 @@ atomic64_sub_return(s64 i, atomic64_t *v) #endif /* atomic64_sub_return_relaxed */ +#define arch_atomic64_fetch_sub atomic64_fetch_sub +#define arch_atomic64_fetch_sub_acquire atomic64_fetch_sub_acquire +#define arch_atomic64_fetch_sub_release atomic64_fetch_sub_release +#define arch_atomic64_fetch_sub_relaxed atomic64_fetch_sub_relaxed + #ifndef atomic64_fetch_sub_relaxed #define atomic64_fetch_sub_acquire atomic64_fetch_sub #define atomic64_fetch_sub_release atomic64_fetch_sub @@ -1370,6 +1517,8 @@ atomic64_fetch_sub(s64 i, atomic64_t *v) #endif /* atomic64_fetch_sub_relaxed */ +#define arch_atomic64_inc atomic64_inc + #ifndef atomic64_inc static __always_inline void atomic64_inc(atomic64_t *v) @@ -1379,6 +1528,11 @@ atomic64_inc(atomic64_t *v) #define atomic64_inc atomic64_inc #endif +#define arch_atomic64_inc_return atomic64_inc_return +#define arch_atomic64_inc_return_acquire atomic64_inc_return_acquire +#define arch_atomic64_inc_return_release atomic64_inc_return_release +#define arch_atomic64_inc_return_relaxed atomic64_inc_return_relaxed + #ifndef atomic64_inc_return_relaxed #ifdef atomic64_inc_return #define atomic64_inc_return_acquire atomic64_inc_return @@ -1460,6 +1614,11 @@ atomic64_inc_return(atomic64_t *v) #endif /* atomic64_inc_return_relaxed */ +#define arch_atomic64_fetch_inc atomic64_fetch_inc +#define arch_atomic64_fetch_inc_acquire atomic64_fetch_inc_acquire +#define arch_atomic64_fetch_inc_release atomic64_fetch_inc_release +#define arch_atomic64_fetch_inc_relaxed atomic64_fetch_inc_relaxed + #ifndef atomic64_fetch_inc_relaxed #ifdef atomic64_fetch_inc #define atomic64_fetch_inc_acquire atomic64_fetch_inc @@ -1541,6 +1700,8 @@ atomic64_fetch_inc(atomic64_t *v) #endif /* atomic64_fetch_inc_relaxed */ +#define arch_atomic64_dec atomic64_dec + #ifndef atomic64_dec static __always_inline void atomic64_dec(atomic64_t *v) @@ -1550,6 +1711,11 @@ atomic64_dec(atomic64_t *v) #define atomic64_dec atomic64_dec #endif +#define arch_atomic64_dec_return atomic64_dec_return +#define arch_atomic64_dec_return_acquire atomic64_dec_return_acquire +#define arch_atomic64_dec_return_release atomic64_dec_return_release +#define arch_atomic64_dec_return_relaxed atomic64_dec_return_relaxed + #ifndef atomic64_dec_return_relaxed #ifdef atomic64_dec_return #define atomic64_dec_return_acquire atomic64_dec_return @@ -1631,6 +1797,11 @@ atomic64_dec_return(atomic64_t *v) #endif /* atomic64_dec_return_relaxed */ +#define arch_atomic64_fetch_dec atomic64_fetch_dec +#define arch_atomic64_fetch_dec_acquire atomic64_fetch_dec_acquire +#define arch_atomic64_fetch_dec_release atomic64_fetch_dec_release +#define arch_atomic64_fetch_dec_relaxed atomic64_fetch_dec_relaxed + #ifndef atomic64_fetch_dec_relaxed #ifdef atomic64_fetch_dec #define atomic64_fetch_dec_acquire atomic64_fetch_dec @@ -1712,6 +1883,13 @@ atomic64_fetch_dec(atomic64_t *v) #endif /* atomic64_fetch_dec_relaxed */ +#define arch_atomic64_and atomic64_and + +#define arch_atomic64_fetch_and atomic64_fetch_and +#define arch_atomic64_fetch_and_acquire atomic64_fetch_and_acquire +#define arch_atomic64_fetch_and_release atomic64_fetch_and_release +#define arch_atomic64_fetch_and_relaxed atomic64_fetch_and_relaxed + #ifndef atomic64_fetch_and_relaxed #define atomic64_fetch_and_acquire atomic64_fetch_and #define atomic64_fetch_and_release atomic64_fetch_and @@ -1754,6 +1932,8 @@ atomic64_fetch_and(s64 i, atomic64_t *v) #endif /* atomic64_fetch_and_relaxed */ +#define arch_atomic64_andnot atomic64_andnot + #ifndef atomic64_andnot static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) @@ -1763,6 +1943,11 @@ atomic64_andnot(s64 i, atomic64_t *v) #define atomic64_andnot atomic64_andnot #endif +#define arch_atomic64_fetch_andnot atomic64_fetch_andnot +#define arch_atomic64_fetch_andnot_acquire atomic64_fetch_andnot_acquire +#define arch_atomic64_fetch_andnot_release atomic64_fetch_andnot_release +#define arch_atomic64_fetch_andnot_relaxed atomic64_fetch_andnot_relaxed + #ifndef atomic64_fetch_andnot_relaxed #ifdef atomic64_fetch_andnot #define atomic64_fetch_andnot_acquire atomic64_fetch_andnot @@ -1844,6 +2029,13 @@ atomic64_fetch_andnot(s64 i, atomic64_t *v) #endif /* atomic64_fetch_andnot_relaxed */ +#define arch_atomic64_or atomic64_or + +#define arch_atomic64_fetch_or atomic64_fetch_or +#define arch_atomic64_fetch_or_acquire atomic64_fetch_or_acquire +#define arch_atomic64_fetch_or_release atomic64_fetch_or_release +#define arch_atomic64_fetch_or_relaxed atomic64_fetch_or_relaxed + #ifndef atomic64_fetch_or_relaxed #define atomic64_fetch_or_acquire atomic64_fetch_or #define atomic64_fetch_or_release atomic64_fetch_or @@ -1886,6 +2078,13 @@ atomic64_fetch_or(s64 i, atomic64_t *v) #endif /* atomic64_fetch_or_relaxed */ +#define arch_atomic64_xor atomic64_xor + +#define arch_atomic64_fetch_xor atomic64_fetch_xor +#define arch_atomic64_fetch_xor_acquire atomic64_fetch_xor_acquire +#define arch_atomic64_fetch_xor_release atomic64_fetch_xor_release +#define arch_atomic64_fetch_xor_relaxed atomic64_fetch_xor_relaxed + #ifndef atomic64_fetch_xor_relaxed #define atomic64_fetch_xor_acquire atomic64_fetch_xor #define atomic64_fetch_xor_release atomic64_fetch_xor @@ -1928,6 +2127,11 @@ atomic64_fetch_xor(s64 i, atomic64_t *v) #endif /* atomic64_fetch_xor_relaxed */ +#define arch_atomic64_xchg atomic64_xchg +#define arch_atomic64_xchg_acquire atomic64_xchg_acquire +#define arch_atomic64_xchg_release atomic64_xchg_release +#define arch_atomic64_xchg_relaxed atomic64_xchg_relaxed + #ifndef atomic64_xchg_relaxed #define atomic64_xchg_acquire atomic64_xchg #define atomic64_xchg_release atomic64_xchg @@ -1970,6 +2174,11 @@ atomic64_xchg(atomic64_t *v, s64 i) #endif /* atomic64_xchg_relaxed */ +#define arch_atomic64_cmpxchg atomic64_cmpxchg +#define arch_atomic64_cmpxchg_acquire atomic64_cmpxchg_acquire +#define arch_atomic64_cmpxchg_release atomic64_cmpxchg_release +#define arch_atomic64_cmpxchg_relaxed atomic64_cmpxchg_relaxed + #ifndef atomic64_cmpxchg_relaxed #define atomic64_cmpxchg_acquire atomic64_cmpxchg #define atomic64_cmpxchg_release atomic64_cmpxchg @@ -2012,6 +2221,11 @@ atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) #endif /* atomic64_cmpxchg_relaxed */ +#define arch_atomic64_try_cmpxchg atomic64_try_cmpxchg +#define arch_atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg_acquire +#define arch_atomic64_try_cmpxchg_release atomic64_try_cmpxchg_release +#define arch_atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg_relaxed + #ifndef atomic64_try_cmpxchg_relaxed #ifdef atomic64_try_cmpxchg #define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg @@ -2109,6 +2323,8 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) #endif /* atomic64_try_cmpxchg_relaxed */ +#define arch_atomic64_sub_and_test atomic64_sub_and_test + #ifndef atomic64_sub_and_test /** * atomic64_sub_and_test - subtract value from variable and test result @@ -2127,6 +2343,8 @@ atomic64_sub_and_test(s64 i, atomic64_t *v) #define atomic64_sub_and_test atomic64_sub_and_test #endif +#define arch_atomic64_dec_and_test atomic64_dec_and_test + #ifndef atomic64_dec_and_test /** * atomic64_dec_and_test - decrement and test @@ -2144,6 +2362,8 @@ atomic64_dec_and_test(atomic64_t *v) #define atomic64_dec_and_test atomic64_dec_and_test #endif +#define arch_atomic64_inc_and_test atomic64_inc_and_test + #ifndef atomic64_inc_and_test /** * atomic64_inc_and_test - increment and test @@ -2161,6 +2381,8 @@ atomic64_inc_and_test(atomic64_t *v) #define atomic64_inc_and_test atomic64_inc_and_test #endif +#define arch_atomic64_add_negative atomic64_add_negative + #ifndef atomic64_add_negative /** * atomic64_add_negative - add and test if negative @@ -2179,6 +2401,8 @@ atomic64_add_negative(s64 i, atomic64_t *v) #define atomic64_add_negative atomic64_add_negative #endif +#define arch_atomic64_fetch_add_unless atomic64_fetch_add_unless + #ifndef atomic64_fetch_add_unless /** * atomic64_fetch_add_unless - add unless the number is already a given value @@ -2204,6 +2428,8 @@ atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) #define atomic64_fetch_add_unless atomic64_fetch_add_unless #endif +#define arch_atomic64_add_unless atomic64_add_unless + #ifndef atomic64_add_unless /** * atomic64_add_unless - add unless the number is already a given value @@ -2222,6 +2448,8 @@ atomic64_add_unless(atomic64_t *v, s64 a, s64 u) #define atomic64_add_unless atomic64_add_unless #endif +#define arch_atomic64_inc_not_zero atomic64_inc_not_zero + #ifndef atomic64_inc_not_zero /** * atomic64_inc_not_zero - increment unless the number is zero @@ -2238,6 +2466,8 @@ atomic64_inc_not_zero(atomic64_t *v) #define atomic64_inc_not_zero atomic64_inc_not_zero #endif +#define arch_atomic64_inc_unless_negative atomic64_inc_unless_negative + #ifndef atomic64_inc_unless_negative static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) @@ -2254,6 +2484,8 @@ atomic64_inc_unless_negative(atomic64_t *v) #define atomic64_inc_unless_negative atomic64_inc_unless_negative #endif +#define arch_atomic64_dec_unless_positive atomic64_dec_unless_positive + #ifndef atomic64_dec_unless_positive static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) @@ -2270,6 +2502,8 @@ atomic64_dec_unless_positive(atomic64_t *v) #define atomic64_dec_unless_positive atomic64_dec_unless_positive #endif +#define arch_atomic64_dec_if_positive atomic64_dec_if_positive + #ifndef atomic64_dec_if_positive static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) @@ -2288,4 +2522,4 @@ atomic64_dec_if_positive(atomic64_t *v) #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 1fac0941c79bf0ae100723cc2ac9b94061f0b67a +// 9d95b56f98d82a2a26c7b79ccdd0c47572d50a6f diff --git a/scripts/atomic/gen-atomic-fallback.sh b/scripts/atomic/gen-atomic-fallback.sh index 0fd1cf0c2b94..693dfa1de430 100755 --- a/scripts/atomic/gen-atomic-fallback.sh +++ b/scripts/atomic/gen-atomic-fallback.sh @@ -58,6 +58,21 @@ cat << EOF EOF } +gen_proto_order_variant() +{ + local meta="$1"; shift + local pfx="$1"; shift + local name="$1"; shift + local sfx="$1"; shift + local order="$1"; shift + local arch="$1" + local atomic="$2" + + local basename="${arch}${atomic}_${pfx}${name}${sfx}" + + printf "#define arch_${basename}${order} ${basename}${order}\n" +} + #gen_proto_order_variants(meta, pfx, name, sfx, arch, atomic, int, args...) gen_proto_order_variants() { @@ -72,6 +87,22 @@ gen_proto_order_variants() local template="$(find_fallback_template "${pfx}" "${name}" "${sfx}" "${order}")" + if [ -z "$arch" ]; then + gen_proto_order_variant "${meta}" "${pfx}" "${name}" "${sfx}" "" "$@" + + if meta_has_acquire "${meta}"; then + gen_proto_order_variant "${meta}" "${pfx}" "${name}" "${sfx}" "_acquire" "$@" + fi + if meta_has_release "${meta}"; then + gen_proto_order_variant "${meta}" "${pfx}" "${name}" "${sfx}" "_release" "$@" + fi + if meta_has_relaxed "${meta}"; then + gen_proto_order_variant "${meta}" "${pfx}" "${name}" "${sfx}" "_relaxed" "$@" + fi + + echo "" + fi + # If we don't have relaxed atomics, then we don't bother with ordering fallbacks # read_acquire and set_release need to be templated, though if ! meta_has_relaxed "${meta}"; then -- cgit v1.2.3 From b58e733fd774f3f4b49d9e7640d172a57e35200e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2020 18:24:27 +0200 Subject: rcu: Fixup noinstr warnings A KCSAN build revealed we have explicit annoations through atomic_*() usage, switch to arch_atomic_*() for the respective functions. vmlinux.o: warning: objtool: rcu_nmi_exit()+0x4d: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: rcu_dynticks_eqs_enter()+0x25: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: rcu_nmi_enter()+0x4f: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: rcu_dynticks_eqs_exit()+0x2a: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: __rcu_is_watching()+0x25: call to __kcsan_check_access() leaves .noinstr.text section Additionally, without the NOP in instrumentation_begin(), objtool would not detect the lack of the 'else instrumentation_begin();' branch in rcu_nmi_enter(). Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- include/linux/compiler.h | 2 +- kernel/rcu/tree.c | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 30827f82ad62..204e76856435 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -123,7 +123,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #ifdef CONFIG_DEBUG_ENTRY /* Begin/end of an instrumentation safe region */ #define instrumentation_begin() ({ \ - asm volatile("%c0:\n\t" \ + asm volatile("%c0: nop\n\t" \ ".pushsection .discard.instr_begin\n\t" \ ".long %c0b - .\n\t" \ ".popsection\n\t" : : "i" (__COUNTER__)); \ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c716eadc7617..6c6569e0586c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -250,7 +250,7 @@ static noinstr void rcu_dynticks_eqs_enter(void) * next idle sojourn. */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICK_CTRL_CTR)); @@ -274,13 +274,13 @@ static noinstr void rcu_dynticks_eqs_exit(void) * and we also must force ordering with the next RCU read-side * critical section. */ - seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); + seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICK_CTRL_CTR)); if (seq & RCU_DYNTICK_CTRL_MASK) { - atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); + arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); smp_mb__after_atomic(); /* _exit after clearing mask. */ } } @@ -313,7 +313,7 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); + return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -633,6 +633,10 @@ static noinstr void rcu_eqs_enter(bool user) do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + instrumentation_end(); WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ // RCU is watching here ... @@ -692,6 +696,7 @@ noinstr void rcu_nmi_exit(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + instrumentation_begin(); /* * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. * (We are exiting an NMI handler, so RCU better be paying attention @@ -705,7 +710,6 @@ noinstr void rcu_nmi_exit(void) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - instrumentation_begin(); trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ @@ -714,13 +718,15 @@ noinstr void rcu_nmi_exit(void) return; } - instrumentation_begin(); /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (!in_nmi()) rcu_prepare_for_idle(); + + // instrumentation for the noinstr rcu_dynticks_eqs_enter() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); instrumentation_end(); // RCU is watching here ... @@ -838,6 +844,10 @@ static void noinstr rcu_eqs_exit(bool user) rcu_dynticks_eqs_exit(); // ... but is watching here. instrumentation_begin(); + + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); @@ -983,13 +993,21 @@ noinstr void rcu_nmi_enter(void) if (!in_nmi()) rcu_cleanup_after_idle(); + instrumentation_begin(); + // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() + instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); + // instrumentation for the noinstr rcu_dynticks_eqs_exit() + instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); + incby = 1; } else if (!in_nmi()) { instrumentation_begin(); rcu_irq_enter_check_tick(); instrumentation_end(); + } else { + instrumentation_begin(); } - instrumentation_begin(); + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); -- cgit v1.2.3 From 8dbdd5049cfa8f8848711ba83c9bbf67e08f5b2d Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 25 Jun 2020 20:30:07 -0700 Subject: make asm-generic/cacheflush.h more standalone Some s390 builds get these warnings: include/asm-generic/cacheflush.h:16:42: warning: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:22:46: warning: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:28:45: warning: 'struct vm_area_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:36:44: warning: 'struct vm_area_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:44:45: warning: 'struct page' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:52:50: warning: 'struct address_space' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:58:52: warning: 'struct address_space' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:75:17: warning: 'struct page' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:74:45: warning: 'struct vm_area_struct' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:82:16: warning: 'struct page' declared inside parameter list will not be visible outside of this definition or declaration include/asm-generic/cacheflush.h:81:50: warning: 'struct vm_area_struct' declared inside parameter list will not be visible outside of this definition or declaration Forward declare the named structs to get rid of these. Link: http://lkml.kernel.org/r/20200623135714.4dae4b8a@canb.auug.org.au Fixes: e0cf615d725c ("asm-generic: don't include in cacheflush.h") Signed-off-by: Stephen Rothwell Reviewed-by: Christoph Hellwig Acked-by: Arnd Bergmann Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/cacheflush.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 907fa5d16494..4a674db4e1fa 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,6 +2,11 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H +struct mm_struct; +struct vm_area_struct; +struct page; +struct address_space; + /* * The cache doesn't need to be flushed when TLB entries change when * the cache is mapped to physical memory, not virtual memory -- cgit v1.2.3 From 31d8fcac00fcf4007f3921edc69ab4dcb3abcd4d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Jun 2020 20:30:31 -0700 Subject: mm: workingset: age nonresident information alongside anonymous pages Patch series "fix for "mm: balance LRU lists based on relative thrashing" patchset" This patchset fixes some problems of the patchset, "mm: balance LRU lists based on relative thrashing", which is now merged on the mainline. Patch "mm: workingset: let cache workingset challenge anon fix" is the result of discussion with Johannes. See following link. http://lkml.kernel.org/r/20200520232525.798933-6-hannes@cmpxchg.org And, the other two are minor things which are found when I try to rebase my patchset. This patch (of 3): After ("mm: workingset: let cache workingset challenge anon fix"), we compare refault distances to active_file + anon. But age of the non-resident information is only driven by the file LRU. As a result, we may overestimate the recency of any incoming refaults and activate them too eagerly, causing unnecessary LRU churn in certain situations. Make anon aging drive nonresident age as well to address that. Link: http://lkml.kernel.org/r/1592288204-27734-1-git-send-email-iamjoonsoo.kim@lge.com Link: http://lkml.kernel.org/r/1592288204-27734-2-git-send-email-iamjoonsoo.kim@lge.com Fixes: 34e58cac6d8f2a ("mm: workingset: let cache workingset challenge anon") Reported-by: Joonsoo Kim Signed-off-by: Johannes Weiner Signed-off-by: Joonsoo Kim Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- include/linux/swap.h | 1 + mm/vmscan.c | 3 +++ mm/workingset.c | 46 +++++++++++++++++++++++++++------------------- 4 files changed, 33 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c4c37fd12104..f6f884970511 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -257,8 +257,8 @@ struct lruvec { */ unsigned long anon_cost; unsigned long file_cost; - /* Evictions & activations on the inactive file list */ - atomic_long_t inactive_age; + /* Non-resident age, driven by LRU movement */ + atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults; /* Various lruvec state flags (enum lruvec_flags) */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 4c5974bb9ba9..5b3216ba39a9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -313,6 +313,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); diff --git a/mm/vmscan.c b/mm/vmscan.c index b6d84326bdf2..749d239c62b2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -904,6 +904,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); + workingset_eviction(page, target_memcg); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -1884,6 +1885,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, &pages_to_free); } else { nr_moved += nr_pages; + if (PageActive(page)) + workingset_age_nonresident(lruvec, nr_pages); } } diff --git a/mm/workingset.c b/mm/workingset.c index d481ea452eeb..50b7937bab32 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -156,8 +156,8 @@ * * Implementation * - * For each node's file LRU lists, a counter for inactive evictions - * and activations is maintained (node->inactive_age). + * For each node's LRU lists, a counter for inactive evictions and + * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache @@ -213,7 +213,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *workingsetp = workingset; } -static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) +/** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @memcg: the lruvec that was aged + * @nr_pages: the number of pages to count + * + * As in-memory pages are aged, non-resident pages need to be aged as + * well, in order for the refault distances later on to be comparable + * to the in-memory dimensions. This function allows reclaim and LRU + * operations to drive the non-resident aging along in parallel. + */ +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a @@ -227,11 +237,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) * the root cgroup's, age as well. */ do { - struct lruvec *lruvec; - - lruvec = mem_cgroup_lruvec(memcg, pgdat); - atomic_long_inc(&lruvec->inactive_age); - } while (memcg && (memcg = parent_mem_cgroup(memcg))); + atomic_long_add(nr_pages, &lruvec->nonresident_age); + } while ((lruvec = parent_lruvec(lruvec))); } /** @@ -254,12 +261,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - advance_inactive_age(page_memcg(page), pgdat); - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->inactive_age); + eviction = atomic_long_read(&lruvec->nonresident_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -309,20 +315,20 @@ void workingset_refault(struct page *page, void *shadow) if (!mem_cgroup_disabled() && !eviction_memcg) goto out; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); - refault = atomic_long_read(&eviction_lruvec->inactive_age); + refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. There is a + * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the - * inactive_age to lap a shadow entry in the field, which can - * then result in a false small refault distance, leading to a - * false activation should this old entry actually refault - * again. However, earlier kernels used to deactivate + * nonresident_age to lap a shadow entry in the field, which + * can then result in a false small refault distance, leading + * to a false activation should this old entry actually + * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. @@ -359,7 +365,7 @@ void workingset_refault(struct page *page, void *shadow) goto out; SetPageActive(page); - advance_inactive_age(memcg, pgdat); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); /* Page was active prior to eviction */ @@ -382,6 +388,7 @@ out: void workingset_activation(struct page *page) { struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); /* @@ -394,7 +401,8 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - advance_inactive_age(memcg, page_pgdat(page)); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); out: rcu_read_unlock(); } -- cgit v1.2.3 From 7a0e27b2a0ce2735e27e21ebc8b777550fe0ed81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Jun 2020 20:30:47 -0700 Subject: mm: remove vmalloc_exec Merge vmalloc_exec into its only caller. Note that for !CONFIG_MMU __vmalloc_node_range maps to __vmalloc, which directly clears the __GFP_HIGHMEM added by the vmalloc_exec stub anyway. Link: http://lkml.kernel.org/r/20200618064307.32739-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Dexuan Cui Cc: Jessica Yu Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 - kernel/module.c | 4 +++- mm/nommu.c | 17 ----------------- mm/vmalloc.c | 20 -------------------- 4 files changed, 3 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 48bb681e6c2a..0221f852a7e1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -106,7 +106,6 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); diff --git a/kernel/module.c b/kernel/module.c index e8a198588f26..0c6573b98c36 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2783,7 +2783,9 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return vmalloc_exec(size); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __func__); } bool __weak module_init_section(const char *name) diff --git a/mm/nommu.c b/mm/nommu.c index cdcad5d61dd1..f32a69095d50 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -290,23 +290,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - */ - -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); -} - /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 957a0be77270..5a2b55c8dd9a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2695,26 +2695,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) -- cgit v1.2.3 From 5946d1f5b309381805bad3ddc3054c04f4ae9c24 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Thu, 4 Jun 2020 15:31:19 +0530 Subject: kdb: Switch to use safer dbg_io_ops over console APIs In kgdb context, calling console handlers aren't safe due to locks used in those handlers which could in turn lead to a deadlock. Although, using oops_in_progress increases the chance to bypass locks in most console handlers but it might not be sufficient enough in case a console uses more locks (VT/TTY is good example). Currently when a driver provides both polling I/O and a console then kdb will output using the console. We can increase robustness by using the currently active polling I/O driver (which should be lockless) instead of the corresponding console. For several common cases (e.g. an embedded system with a single serial port that is used both for console output and debugger I/O) this will result in no console handler being used. In order to achieve this we need to reverse the order of preference to use dbg_io_ops (uses polling I/O mode) over console APIs. So we just store "struct console" that represents debugger I/O in dbg_io_ops and while emitting kdb messages, skip console that matches dbg_io_ops console in order to avoid duplicate messages. After this change, "is_console" param becomes redundant and hence removed. Suggested-by: Daniel Thompson Signed-off-by: Sumit Garg Link: https://lore.kernel.org/r/1591264879-25920-5-git-send-email-sumit.garg@linaro.org Reviewed-by: Douglas Anderson Reviewed-by: Petr Mladek Acked-by: Greg Kroah-Hartman Signed-off-by: Daniel Thompson --- drivers/tty/serial/kgdb_nmi.c | 2 +- drivers/tty/serial/kgdboc.c | 32 ++++++++++++++++---------------- drivers/usb/early/ehci-dbgp.c | 3 ++- include/linux/kgdb.h | 5 ++--- kernel/debug/kdb/kdb_io.c | 4 +++- 5 files changed, 24 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/tty/serial/kgdb_nmi.c b/drivers/tty/serial/kgdb_nmi.c index 5022447afa23..6004c0c1d173 100644 --- a/drivers/tty/serial/kgdb_nmi.c +++ b/drivers/tty/serial/kgdb_nmi.c @@ -50,7 +50,7 @@ static int kgdb_nmi_console_setup(struct console *co, char *options) * I/O utilities that messages sent to the console will automatically * be displayed on the dbg_io. */ - dbg_io_ops->is_console = true; + dbg_io_ops->cons = co; return 0; } diff --git a/drivers/tty/serial/kgdboc.c b/drivers/tty/serial/kgdboc.c index 41396982e9e0..84ffede27f23 100644 --- a/drivers/tty/serial/kgdboc.c +++ b/drivers/tty/serial/kgdboc.c @@ -45,7 +45,6 @@ static struct platform_device *kgdboc_pdev; #if IS_BUILTIN(CONFIG_KGDB_SERIAL_CONSOLE) static struct kgdb_io kgdboc_earlycon_io_ops; -static struct console *earlycon; static int (*earlycon_orig_exit)(struct console *con); #endif /* IS_BUILTIN(CONFIG_KGDB_SERIAL_CONSOLE) */ @@ -145,7 +144,7 @@ static void kgdboc_unregister_kbd(void) #if IS_BUILTIN(CONFIG_KGDB_SERIAL_CONSOLE) static void cleanup_earlycon(void) { - if (earlycon) + if (kgdboc_earlycon_io_ops.cons) kgdb_unregister_io_module(&kgdboc_earlycon_io_ops); } #else /* !IS_BUILTIN(CONFIG_KGDB_SERIAL_CONSOLE) */ @@ -178,7 +177,7 @@ static int configure_kgdboc(void) goto noconfig; } - kgdboc_io_ops.is_console = 0; + kgdboc_io_ops.cons = NULL; kgdb_tty_driver = NULL; kgdboc_use_kms = 0; @@ -198,7 +197,7 @@ static int configure_kgdboc(void) int idx; if (cons->device && cons->device(cons, &idx) == p && idx == tty_line) { - kgdboc_io_ops.is_console = 1; + kgdboc_io_ops.cons = cons; break; } } @@ -433,7 +432,8 @@ static int kgdboc_earlycon_get_char(void) { char c; - if (!earlycon->read(earlycon, &c, 1)) + if (!kgdboc_earlycon_io_ops.cons->read(kgdboc_earlycon_io_ops.cons, + &c, 1)) return NO_POLL_CHAR; return c; @@ -441,7 +441,8 @@ static int kgdboc_earlycon_get_char(void) static void kgdboc_earlycon_put_char(u8 chr) { - earlycon->write(earlycon, &chr, 1); + kgdboc_earlycon_io_ops.cons->write(kgdboc_earlycon_io_ops.cons, &chr, + 1); } static void kgdboc_earlycon_pre_exp_handler(void) @@ -461,7 +462,7 @@ static void kgdboc_earlycon_pre_exp_handler(void) * boot if we detect this case. */ for_each_console(con) - if (con == earlycon) + if (con == kgdboc_earlycon_io_ops.cons) return; already_warned = true; @@ -484,25 +485,25 @@ static int kgdboc_earlycon_deferred_exit(struct console *con) static void kgdboc_earlycon_deinit(void) { - if (!earlycon) + if (!kgdboc_earlycon_io_ops.cons) return; - if (earlycon->exit == kgdboc_earlycon_deferred_exit) + if (kgdboc_earlycon_io_ops.cons->exit == kgdboc_earlycon_deferred_exit) /* * kgdboc_earlycon is exiting but original boot console exit * was never called (AKA kgdboc_earlycon_deferred_exit() * didn't ever run). Undo our trap. */ - earlycon->exit = earlycon_orig_exit; - else if (earlycon->exit) + kgdboc_earlycon_io_ops.cons->exit = earlycon_orig_exit; + else if (kgdboc_earlycon_io_ops.cons->exit) /* * We skipped calling the exit() routine so we could try to * keep using the boot console even after it went away. We're * finally done so call the function now. */ - earlycon->exit(earlycon); + kgdboc_earlycon_io_ops.cons->exit(kgdboc_earlycon_io_ops.cons); - earlycon = NULL; + kgdboc_earlycon_io_ops.cons = NULL; } static struct kgdb_io kgdboc_earlycon_io_ops = { @@ -511,7 +512,6 @@ static struct kgdb_io kgdboc_earlycon_io_ops = { .write_char = kgdboc_earlycon_put_char, .pre_exception = kgdboc_earlycon_pre_exp_handler, .deinit = kgdboc_earlycon_deinit, - .is_console = true, }; #define MAX_CONSOLE_NAME_LEN (sizeof((struct console *) 0)->name) @@ -557,10 +557,10 @@ static int __init kgdboc_earlycon_init(char *opt) goto unlock; } - earlycon = con; + kgdboc_earlycon_io_ops.cons = con; pr_info("Going to register kgdb with earlycon '%s'\n", con->name); if (kgdb_register_io_module(&kgdboc_earlycon_io_ops) != 0) { - earlycon = NULL; + kgdboc_earlycon_io_ops.cons = NULL; pr_info("Failed to register kgdb with earlycon\n"); } else { /* Trap exit so we can keep earlycon longer if needed. */ diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c index ea0d531c63e2..775cf70cfb3e 100644 --- a/drivers/usb/early/ehci-dbgp.c +++ b/drivers/usb/early/ehci-dbgp.c @@ -1058,7 +1058,8 @@ static int __init kgdbdbgp_parse_config(char *str) kgdbdbgp_wait_time = simple_strtoul(ptr, &ptr, 10); } kgdb_register_io_module(&kgdbdbgp_io_ops); - kgdbdbgp_io_ops.is_console = early_dbgp_console.index != -1; + if (early_dbgp_console.index != -1) + kgdbdbgp_io_ops.cons = &early_dbgp_console; return 0; } diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index c62d76478adc..529116b0cabe 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -276,8 +276,7 @@ struct kgdb_arch { * the I/O driver. * @post_exception: Pointer to a function that will do any cleanup work * for the I/O driver. - * @is_console: 1 if the end device is a console 0 if the I/O device is - * not a console + * @cons: valid if the I/O device is a console; else NULL. */ struct kgdb_io { const char *name; @@ -288,7 +287,7 @@ struct kgdb_io { void (*deinit) (void); void (*pre_exception) (void); void (*post_exception) (void); - int is_console; + struct console *cons; }; extern const struct kgdb_arch arch_kgdb_ops; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0e4f2eda96d8..683a799618ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -549,7 +549,7 @@ static void kdb_msg_write(const char *msg, int msg_len) if (msg_len == 0) return; - if (dbg_io_ops && !dbg_io_ops->is_console) { + if (dbg_io_ops) { const char *cp = msg; int len = msg_len; @@ -562,6 +562,8 @@ static void kdb_msg_write(const char *msg, int msg_len) for_each_console(c) { if (!(c->flags & CON_ENABLED)) continue; + if (c == dbg_io_ops->cons) + continue; /* * Set oops_in_progress to encourage the console drivers to * disregard their internal spin locks: in the current calling -- cgit v1.2.3 From 4f311afc2035f7b33d97e69ffaa2e6cd67fca16d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2020 12:14:09 +0200 Subject: sched/core: Fix CONFIG_GCC_PLUGIN_RANDSTRUCT build fail As a temporary build fix, the proper cleanup needs more work. Reported-by: Guenter Roeck Reported-by: Eric Biggers Suggested-by: Eric Biggers Suggested-by: Kees Cook Fixes: a148866489fb ("sched: Replace rq::wake_list") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b62e6aaf28f0..224b5de568e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -654,8 +654,10 @@ struct task_struct { unsigned int ptrace; #ifdef CONFIG_SMP - struct llist_node wake_entry; - unsigned int wake_entry_type; + struct { + struct llist_node wake_entry; + unsigned int wake_entry_type; + }; int on_cpu; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ -- cgit v1.2.3 From 8c4890d1c3358fb8023d46e1e554c41d54f02878 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Jun 2020 12:01:25 +0200 Subject: smp, irq_work: Continue smp_call_function*() and irq_work*() integration Instead of relying on BUG_ON() to ensure the various data structures line up, use a bunch of horrible unions to make it all automatic. Much of the union magic is to ensure irq_work and smp_call_function do not (yet) see the members of their respective data structures change name. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/20200622100825.844455025@infradead.org --- include/linux/irq_work.h | 26 ++++++------------- include/linux/sched.h | 5 +--- include/linux/smp.h | 23 ++++++----------- include/linux/smp_types.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 6 ++--- kernel/smp.c | 18 ------------- 6 files changed, 86 insertions(+), 58 deletions(-) create mode 100644 include/linux/smp_types.h (limited to 'include') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 2735da5f839e..30823780c192 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -2,7 +2,7 @@ #ifndef _LINUX_IRQ_WORK_H #define _LINUX_IRQ_WORK_H -#include +#include /* * An entry can be in one of four states: @@ -13,24 +13,14 @@ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed */ -/* flags share CSD_FLAG_ space */ - -#define IRQ_WORK_PENDING BIT(0) -#define IRQ_WORK_BUSY BIT(1) - -/* Doesn't want IPI, wait for tick: */ -#define IRQ_WORK_LAZY BIT(2) -/* Run hard IRQ context, even on RT */ -#define IRQ_WORK_HARD_IRQ BIT(3) - -#define IRQ_WORK_CLAIMED (IRQ_WORK_PENDING | IRQ_WORK_BUSY) - -/* - * structure shares layout with single_call_data_t. - */ struct irq_work { - struct llist_node llnode; - atomic_t flags; + union { + struct __call_single_node node; + struct { + struct llist_node llnode; + atomic_t flags; + }; + }; void (*func)(struct irq_work *); }; diff --git a/include/linux/sched.h b/include/linux/sched.h index 224b5de568e7..692e327d7455 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -654,11 +654,8 @@ struct task_struct { unsigned int ptrace; #ifdef CONFIG_SMP - struct { - struct llist_node wake_entry; - unsigned int wake_entry_type; - }; int on_cpu; + struct __call_single_node wake_entry; #ifdef CONFIG_THREAD_INFO_IN_TASK /* Current CPU: */ unsigned int cpu; diff --git a/include/linux/smp.h b/include/linux/smp.h index 7ee202ad21a6..80d557ef8a11 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -12,29 +12,22 @@ #include #include #include -#include +#include typedef void (*smp_call_func_t)(void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info); -enum { - CSD_FLAG_LOCK = 0x01, - - /* IRQ_WORK_flags */ - - CSD_TYPE_ASYNC = 0x00, - CSD_TYPE_SYNC = 0x10, - CSD_TYPE_IRQ_WORK = 0x20, - CSD_TYPE_TTWU = 0x30, - CSD_FLAG_TYPE_MASK = 0xF0, -}; - /* * structure shares (partial) layout with struct irq_work */ struct __call_single_data { - struct llist_node llist; - unsigned int flags; + union { + struct __call_single_node node; + struct { + struct llist_node llist; + unsigned int flags; + }; + }; smp_call_func_t func; void *info; }; diff --git a/include/linux/smp_types.h b/include/linux/smp_types.h new file mode 100644 index 000000000000..364b3ae3e41d --- /dev/null +++ b/include/linux/smp_types.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_SMP_TYPES_H +#define __LINUX_SMP_TYPES_H + +#include + +enum { + CSD_FLAG_LOCK = 0x01, + + IRQ_WORK_PENDING = 0x01, + IRQ_WORK_BUSY = 0x02, + IRQ_WORK_LAZY = 0x04, /* No IPI, wait for tick */ + IRQ_WORK_HARD_IRQ = 0x08, /* IRQ context on PREEMPT_RT */ + + IRQ_WORK_CLAIMED = (IRQ_WORK_PENDING | IRQ_WORK_BUSY), + + CSD_TYPE_ASYNC = 0x00, + CSD_TYPE_SYNC = 0x10, + CSD_TYPE_IRQ_WORK = 0x20, + CSD_TYPE_TTWU = 0x30, + + CSD_FLAG_TYPE_MASK = 0xF0, +}; + +/* + * struct __call_single_node is the primary type on + * smp.c:call_single_queue. + * + * flush_smp_call_function_queue() only reads the type from + * __call_single_node::u_flags as a regular load, the above + * (anonymous) enum defines all the bits of this word. + * + * Other bits are not modified until the type is known. + * + * CSD_TYPE_SYNC/ASYNC: + * struct { + * struct llist_node node; + * unsigned int flags; + * smp_call_func_t func; + * void *info; + * }; + * + * CSD_TYPE_IRQ_WORK: + * struct { + * struct llist_node node; + * atomic_t flags; + * void (*func)(struct irq_work *); + * }; + * + * CSD_TYPE_TTWU: + * struct { + * struct llist_node node; + * unsigned int flags; + * }; + * + */ + +struct __call_single_node { + struct llist_node llist; + union { + unsigned int u_flags; + atomic_t a_flags; + }; +}; + +#endif /* __LINUX_SMP_TYPES_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f778067de277..ca5db40392d4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2293,7 +2293,7 @@ void sched_ttwu_pending(void *arg) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - llist_for_each_entry_safe(p, t, llist, wake_entry) { + llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { if (WARN_ON_ONCE(p->on_cpu)) smp_cond_load_acquire(&p->on_cpu, !VAL); @@ -2329,7 +2329,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); - __smp_call_single_queue(cpu, &p->wake_entry); + __smp_call_single_queue(cpu, &p->wake_entry.llist); } void wake_up_if_idle(int cpu) @@ -2786,7 +2786,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP - p->wake_entry_type = CSD_TYPE_TTWU; + p->wake_entry.u_flags = CSD_TYPE_TTWU; #endif } diff --git a/kernel/smp.c b/kernel/smp.c index 472c2b274c65..aa17eedff5be 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -669,24 +669,6 @@ void __init smp_init(void) { int num_nodes, num_cpus; - /* - * Ensure struct irq_work layout matches so that - * flush_smp_call_function_queue() can do horrible things. - */ - BUILD_BUG_ON(offsetof(struct irq_work, llnode) != - offsetof(struct __call_single_data, llist)); - BUILD_BUG_ON(offsetof(struct irq_work, func) != - offsetof(struct __call_single_data, func)); - BUILD_BUG_ON(offsetof(struct irq_work, flags) != - offsetof(struct __call_single_data, flags)); - - /* - * Assert the CSD_TYPE_TTWU layout is similar enough - * for task_struct to be on the @call_single_queue. - */ - BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) != - offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist)); - idle_threads_init(); cpuhp_threads_init(); -- cgit v1.2.3 From bfe373f608cf81b7626dfeb904001b0e867c5110 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 28 Apr 2020 09:54:56 +0800 Subject: blk-mq-debugfs: update blk_queue_flag_name[] accordingly for new flags Else there may be magic numbers in /sys/kernel/debug/block/*/state. Signed-off-by: Hou Tao Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 3 +++ include/linux/blkdev.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 15df3a36e9fa..e0b2bc131bf5 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -125,6 +125,9 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), + QUEUE_FLAG_NAME(PCI_P2PDMA), + QUEUE_FLAG_NAME(ZONE_RESETALL), + QUEUE_FLAG_NAME(RQ_ALLOC_TIME), }; #undef QUEUE_FLAG_NAME diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8fd900998b4e..57241417ff2f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -590,6 +590,7 @@ struct request_queue { u64 write_hints[BLK_MAX_WRITE_HINTS]; }; +/* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ -- cgit v1.2.3 From bf64ff4c2aac65d680dc639a511c781cf6b6ec08 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 27 Jun 2020 00:12:24 -0700 Subject: genetlink: get rid of family->attrbuf genl_family_rcv_msg_attrs_parse() reuses the global family->attrbuf when family->parallel_ops is false. However, family->attrbuf is not protected by any lock on the genl_family_rcv_msg_doit() code path. This leads to several different consequences, one of them is UAF, like the following: genl_family_rcv_msg_doit(): genl_start(): genl_family_rcv_msg_attrs_parse() attrbuf = family->attrbuf __nlmsg_parse(attrbuf); genl_family_rcv_msg_attrs_parse() attrbuf = family->attrbuf __nlmsg_parse(attrbuf); info->attrs = attrs; cb->data = info; netlink_unicast_kernel(): consume_skb() genl_lock_dumpit(): genl_dumpit_info(cb)->attrs Note family->attrbuf is an array of pointers to the skb data, once the skb is freed, any dereference of family->attrbuf will be a UAF. Maybe we could serialize the family->attrbuf with genl_mutex too, but that would make the locking more complicated. Instead, we can just get rid of family->attrbuf and always allocate attrbuf from heap like the family->parallel_ops==true code path. This may add some performance overhead but comparing with taking the global genl_mutex, it still looks better. Fixes: 75cdbdd08900 ("net: ieee802154: have genetlink code to parse the attrs during dumpit") Fixes: 057af7071344 ("net: tipc: have genetlink code to parse the attrs during dumpit") Reported-and-tested-by: syzbot+3039ddf6d7b13daf3787@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+80cad1e3cb4c41cde6ff@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+736bcbcb11b60d0c0792@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+520f8704db2b68091d44@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+c96e4dfb32f8987fdeed@syzkaller.appspotmail.com Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/genetlink.h | 2 -- net/netlink/genetlink.c | 48 +++++++++++++----------------------------------- 2 files changed, 13 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 74950663bb00..ad71ed4f55ff 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -41,7 +41,6 @@ struct genl_info; * Note that unbind() will not be called symmetrically if the * generic netlink family is removed while there are still open * sockets. - * @attrbuf: buffer to store parsed attributes (private) * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family @@ -66,7 +65,6 @@ struct genl_family { struct genl_info *info); int (*mcast_bind)(struct net *net, int group); void (*mcast_unbind)(struct net *net, int group); - struct nlattr ** attrbuf; /* private */ const struct genl_ops * ops; const struct genl_multicast_group *mcgrps; unsigned int n_ops; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 55ee680e9db1..a914b9365a46 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -351,22 +351,11 @@ int genl_register_family(struct genl_family *family) start = end = GENL_ID_VFS_DQUOT; } - if (family->maxattr && !family->parallel_ops) { - family->attrbuf = kmalloc_array(family->maxattr + 1, - sizeof(struct nlattr *), - GFP_KERNEL); - if (family->attrbuf == NULL) { - err = -ENOMEM; - goto errout_locked; - } - } else - family->attrbuf = NULL; - family->id = idr_alloc_cyclic(&genl_fam_idr, family, start, end + 1, GFP_KERNEL); if (family->id < 0) { err = family->id; - goto errout_free; + goto errout_locked; } err = genl_validate_assign_mc_groups(family); @@ -385,8 +374,6 @@ int genl_register_family(struct genl_family *family) errout_remove: idr_remove(&genl_fam_idr, family->id); -errout_free: - kfree(family->attrbuf); errout_locked: genl_unlock_all(); return err; @@ -419,8 +406,6 @@ int genl_unregister_family(const struct genl_family *family) atomic_read(&genl_sk_destructing_cnt) == 0); genl_unlock(); - kfree(family->attrbuf); - genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); return 0; @@ -485,30 +470,23 @@ genl_family_rcv_msg_attrs_parse(const struct genl_family *family, if (!family->maxattr) return NULL; - if (family->parallel_ops) { - attrbuf = kmalloc_array(family->maxattr + 1, - sizeof(struct nlattr *), GFP_KERNEL); - if (!attrbuf) - return ERR_PTR(-ENOMEM); - } else { - attrbuf = family->attrbuf; - } + attrbuf = kmalloc_array(family->maxattr + 1, + sizeof(struct nlattr *), GFP_KERNEL); + if (!attrbuf) + return ERR_PTR(-ENOMEM); err = __nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, family->policy, validate, extack); if (err) { - if (family->parallel_ops) - kfree(attrbuf); + kfree(attrbuf); return ERR_PTR(err); } return attrbuf; } -static void genl_family_rcv_msg_attrs_free(const struct genl_family *family, - struct nlattr **attrbuf) +static void genl_family_rcv_msg_attrs_free(struct nlattr **attrbuf) { - if (family->parallel_ops) - kfree(attrbuf); + kfree(attrbuf); } struct genl_start_context { @@ -542,7 +520,7 @@ static int genl_start(struct netlink_callback *cb) no_attrs: info = genl_dumpit_info_alloc(); if (!info) { - genl_family_rcv_msg_attrs_free(ctx->family, attrs); + genl_family_rcv_msg_attrs_free(attrs); return -ENOMEM; } info->family = ctx->family; @@ -559,7 +537,7 @@ no_attrs: } if (rc) { - genl_family_rcv_msg_attrs_free(info->family, info->attrs); + genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); cb->data = NULL; } @@ -588,7 +566,7 @@ static int genl_lock_done(struct netlink_callback *cb) rc = ops->done(cb); genl_unlock(); } - genl_family_rcv_msg_attrs_free(info->family, info->attrs); + genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); return rc; } @@ -601,7 +579,7 @@ static int genl_parallel_done(struct netlink_callback *cb) if (ops->done) rc = ops->done(cb); - genl_family_rcv_msg_attrs_free(info->family, info->attrs); + genl_family_rcv_msg_attrs_free(info->attrs); genl_dumpit_info_free(info); return rc; } @@ -694,7 +672,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, family->post_doit(ops, skb, &info); out: - genl_family_rcv_msg_attrs_free(family, attrbuf); + genl_family_rcv_msg_attrs_free(attrbuf); return err; } -- cgit v1.2.3 From 3aa91625007807bfca4155df1867a5c924a08662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:56 +0200 Subject: dma-mapping: Add a new dma_need_sync API Add a new API to check if calls to dma_sync_single_for_{device,cpu} are required for a given DMA streaming mapping. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-2-hch@lst.de --- Documentation/core-api/dma-api.rst | 8 ++++++++ include/linux/dma-direct.h | 1 + include/linux/dma-mapping.h | 5 +++++ kernel/dma/direct.c | 6 ++++++ kernel/dma/mapping.c | 10 ++++++++++ 5 files changed, 30 insertions(+) (limited to 'include') diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 2d8d2fed7317..f41620439ef3 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -204,6 +204,14 @@ Returns the maximum size of a mapping for the device. The size parameter of the mapping functions like dma_map_single(), dma_map_page() and others should not be larger than the returned value. +:: + + bool + dma_need_sync(struct device *dev, dma_addr_t dma_addr); + +Returns %true if dma_sync_single_for_{device,cpu} calls are required to +transfer memory ownership. Returns %false if those calls can be skipped. + :: unsigned long diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 136f984df0d9..8b006730687b 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -87,4 +87,5 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int dma_direct_supported(struct device *dev, u64 mask); +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 78f677cf45ab..a33ed3954ed4 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -461,6 +461,7 @@ int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, @@ -571,6 +572,10 @@ static inline size_t dma_max_mapping_size(struct device *dev) { return 0; } +static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return false; +} static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 0a4881e59aa7..ecb922a0bfa0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -530,3 +530,9 @@ size_t dma_direct_max_mapping_size(struct device *dev) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } + +bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + return !dev_is_dma_coherent(dev) || + is_swiotlb_buffer(dma_to_phys(dev, dma_addr)); +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 98e3d873792e..a8c18c9a796f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -397,6 +397,16 @@ size_t dma_max_mapping_size(struct device *dev) } EXPORT_SYMBOL_GPL(dma_max_mapping_size); +bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_is_direct(ops)) + return dma_direct_need_sync(dev, dma_addr); + return ops->sync_single_for_cpu || ops->sync_single_for_device; +} +EXPORT_SYMBOL_GPL(dma_need_sync); + unsigned long dma_get_merge_boundary(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); -- cgit v1.2.3 From 91d5b70273267bbae6f5d1fb4cf3510bd31ef9ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 15:03:57 +0200 Subject: xsk: Replace the cheap_dma flag with a dma_need_sync flag Invert the polarity and better name the flag so that the use case is properly documented. Signed-off-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200629130359.2690853-3-hch@lst.de --- include/net/xsk_buff_pool.h | 6 +++--- net/xdp/xsk_buff_pool.c | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index a4ff226505c9..6842990e2712 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -40,7 +40,7 @@ struct xsk_buff_pool { u32 headroom; u32 chunk_size; u32 frame_len; - bool cheap_dma; + bool dma_need_sync; bool unaligned; void *addrs; struct device *dev; @@ -80,7 +80,7 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - if (xskb->pool->cheap_dma) + if (!xskb->pool->dma_need_sync) return; xp_dma_sync_for_cpu_slow(xskb); @@ -91,7 +91,7 @@ void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - if (pool->cheap_dma) + if (!pool->dma_need_sync) return; xp_dma_sync_for_device_slow(pool, dma, size); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 540ed75e4482..9fe84c797a70 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -55,7 +55,6 @@ struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, pool->free_heads_cnt = chunks; pool->headroom = headroom; pool->chunk_size = chunk_size; - pool->cheap_dma = true; pool->unaligned = unaligned; pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; INIT_LIST_HEAD(&pool->free_list); @@ -195,7 +194,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, xp_check_dma_contiguity(pool); pool->dev = dev; - pool->cheap_dma = xp_check_cheap_dma(pool); + pool->dma_need_sync = !xp_check_cheap_dma(pool); return 0; } EXPORT_SYMBOL(xp_dma_map); @@ -280,7 +279,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; xskb->xdp.data_meta = xskb->xdp.data; - if (!pool->cheap_dma) { + if (pool->dma_need_sync) { dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, pool->frame_len, DMA_BIDIRECTIONAL); -- cgit v1.2.3 From 3b7016996c4c44db5d499d98759b82fb714bb912 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:54 +0200 Subject: flow_dissector: Pull BPF program assignment up to bpf-netns Prepare for using bpf_prog_array to store attached programs by moving out code that updates the attached program out of flow dissector. Managing bpf_prog_array is more involved than updating a single bpf_prog pointer. This will let us do it all from one place, bpf/net_namespace.c, in the subsequent patch. No functional change intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-2-jakub@cloudflare.com --- include/net/flow_dissector.h | 3 ++- kernel/bpf/net_namespace.c | 20 ++++++++++++++++++-- net/core/flow_dissector.c | 13 ++----------- 3 files changed, 22 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index a7eba43fe4e4..4b6e36288ddd 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -372,7 +372,8 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control, } #ifdef CONFIG_BPF_SYSCALL -int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog); +int flow_dissector_bpf_prog_attach_check(struct net *net, + struct bpf_prog *prog); #endif /* CONFIG_BPF_SYSCALL */ #endif diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 78cf061f8179..b951dab2687f 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -189,6 +189,7 @@ int netns_bpf_prog_query(const union bpf_attr *attr, int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { enum netns_bpf_attach_type type; + struct bpf_prog *attached; struct net *net; int ret; @@ -207,12 +208,26 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - ret = flow_dissector_bpf_prog_attach(net, prog); + ret = flow_dissector_bpf_prog_attach_check(net, prog); break; default: ret = -EINVAL; break; } + if (ret) + goto out_unlock; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (attached == prog) { + /* The same program cannot be attached twice */ + ret = -EINVAL; + goto out_unlock; + } + rcu_assign_pointer(net->bpf.progs[type], prog); + if (attached) + bpf_prog_put(attached); + out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -277,7 +292,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, switch (type) { case NETNS_BPF_FLOW_DISSECTOR: - err = flow_dissector_bpf_prog_attach(net, link->prog); + err = flow_dissector_bpf_prog_attach_check(net, link->prog); break; default: err = -EINVAL; @@ -286,6 +301,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; + rcu_assign_pointer(net->bpf.progs[type], link->prog); net->bpf.links[type] = link; out_unlock: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d02df0b6d0d9..b57fb1359395 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -70,10 +70,10 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, EXPORT_SYMBOL(skb_flow_dissector_init); #ifdef CONFIG_BPF_SYSCALL -int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) +int flow_dissector_bpf_prog_attach_check(struct net *net, + struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; - struct bpf_prog *attached; if (net == &init_net) { /* BPF flow dissector in the root namespace overrides @@ -97,15 +97,6 @@ int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) return -EEXIST; } - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (attached == prog) - /* The same program cannot be attached twice */ - return -EINVAL; - - rcu_assign_pointer(net->bpf.progs[type], prog); - if (attached) - bpf_prog_put(attached); return 0; } #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 695c12147a40181fe9221d321c3f2de33c9574ed Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:55 +0200 Subject: bpf, netns: Keep attached programs in bpf_prog_array Prepare for having multi-prog attachments for new netns attach types by storing programs to run in a bpf_prog_array, which is well suited for iterating over programs and running them in sequence. After this change bpf(PROG_QUERY) may block to allocate memory in bpf_prog_array_copy_to_user() for collected program IDs. This forces a change in how we protect access to the attached program in the query callback. Because bpf_prog_array_copy_to_user() can sleep, we switch from an RCU read lock to holding a mutex that serializes updaters. Because we allow only one BPF flow_dissector program to be attached to netns at all times, the bpf_prog_array pointed by net->bpf.run_array is always either detached (null) or one element long. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200625141357.910330-3-jakub@cloudflare.com --- include/net/netns/bpf.h | 5 +- kernel/bpf/net_namespace.c | 120 +++++++++++++++++++++++++++++++-------------- net/core/flow_dissector.c | 19 +++---- 3 files changed, 96 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a8dce2a380c8..a5015bda9979 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -9,9 +9,12 @@ #include struct bpf_prog; +struct bpf_prog_array; struct netns_bpf { - struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; + /* Array of programs to run compiled from progs or links */ + struct bpf_prog_array __rcu *run_array[MAX_NETNS_BPF_ATTACH_TYPE]; + struct bpf_prog *progs[MAX_NETNS_BPF_ATTACH_TYPE]; struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; }; diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index b951dab2687f..0dba97202357 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -33,6 +33,17 @@ static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) net_link->net = NULL; } +/* Must be called with netns_bpf_mutex held. */ +static void netns_bpf_run_array_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog_array *run_array; + + run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL, + lockdep_is_held(&netns_bpf_mutex)); + bpf_prog_array_free(run_array); +} + static void bpf_netns_link_release(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -54,8 +65,8 @@ static void bpf_netns_link_release(struct bpf_link *link) if (!net) goto out_unlock; + netns_bpf_run_array_detach(net, type); net->bpf.links[type] = NULL; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -76,6 +87,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, struct bpf_netns_link *net_link = container_of(link, struct bpf_netns_link, link); enum netns_bpf_attach_type type = net_link->netns_type; + struct bpf_prog_array *run_array; struct net *net; int ret = 0; @@ -93,8 +105,11 @@ static int bpf_netns_link_update_prog(struct bpf_link *link, goto out_unlock; } + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + WRITE_ONCE(run_array->items[0].prog, new_prog); + old_prog = xchg(&link->prog, new_prog); - rcu_assign_pointer(net->bpf.progs[type], new_prog); bpf_prog_put(old_prog); out_unlock: @@ -142,14 +157,38 @@ static const struct bpf_link_ops bpf_netns_link_ops = { .show_fdinfo = bpf_netns_link_show_fdinfo, }; +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr, + struct net *net, + enum netns_bpf_attach_type type) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + struct bpf_prog_array *run_array; + u32 prog_cnt = 0, flags = 0; + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) + prog_cnt = bpf_prog_array_length(run_array); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + return bpf_prog_array_copy_to_user(run_array, prog_ids, + attr->query.prog_cnt); +} + int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; enum netns_bpf_attach_type type; - struct bpf_prog *attached; struct net *net; + int ret; if (attr->query.query_flags) return -EINVAL; @@ -162,32 +201,17 @@ int netns_bpf_prog_query(const union bpf_attr *attr, if (IS_ERR(net)) return PTR_ERR(net); - rcu_read_lock(); - attached = rcu_dereference(net->bpf.progs[type]); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_query(attr, uattr, net, type); + mutex_unlock(&netns_bpf_mutex); put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; + return ret; } int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + struct bpf_prog_array *run_array; enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; @@ -217,14 +241,28 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (ret) goto out_unlock; - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); + attached = net->bpf.progs[type]; if (attached == prog) { /* The same program cannot be attached twice */ ret = -EINVAL; goto out_unlock; } - rcu_assign_pointer(net->bpf.progs[type], prog); + + run_array = rcu_dereference_protected(net->bpf.run_array[type], + lockdep_is_held(&netns_bpf_mutex)); + if (run_array) { + WRITE_ONCE(run_array->items[0].prog, prog); + } else { + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + ret = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + } + + net->bpf.progs[type] = prog; if (attached) bpf_prog_put(attached); @@ -244,11 +282,11 @@ static int __netns_bpf_prog_detach(struct net *net, if (net->bpf.links[type]) return -EINVAL; - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); + attached = net->bpf.progs[type]; if (!attached) return -ENOENT; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); + netns_bpf_run_array_detach(net, type); + net->bpf.progs[type] = NULL; bpf_prog_put(attached); return 0; } @@ -272,7 +310,7 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { - struct bpf_prog *prog; + struct bpf_prog_array *run_array; int err; mutex_lock(&netns_bpf_mutex); @@ -283,9 +321,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, goto out_unlock; } /* Links are not compatible with attaching prog directly */ - prog = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (prog) { + if (net->bpf.progs[type]) { err = -EEXIST; goto out_unlock; } @@ -301,7 +337,14 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, if (err) goto out_unlock; - rcu_assign_pointer(net->bpf.progs[type], link->prog); + run_array = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!run_array) { + err = -ENOMEM; + goto out_unlock; + } + run_array->items[0].prog = link->prog; + rcu_assign_pointer(net->bpf.run_array[type], run_array); + net->bpf.links[type] = link; out_unlock: @@ -368,11 +411,12 @@ static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { + netns_bpf_run_array_detach(net, type); link = net->bpf.links[type]; if (link) bpf_netns_link_auto_detach(link); - else - __netns_bpf_prog_detach(net, type); + else if (net->bpf.progs[type]) + bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b57fb1359395..142a8824f0a8 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -86,14 +86,14 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->bpf.progs[type])) + if (rcu_access_pointer(ns->bpf.run_array[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.bpf.progs[type])) + if (rcu_access_pointer(init_net.bpf.run_array[type])) return -EEXIST; } @@ -894,7 +894,6 @@ bool __skb_flow_dissect(const struct net *net, struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; - struct bpf_prog *attached = NULL; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; bool mpls_el = false; @@ -951,14 +950,14 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + struct bpf_prog_array *run_array; rcu_read_lock(); - attached = rcu_dereference(init_net.bpf.progs[type]); + run_array = rcu_dereference(init_net.bpf.run_array[type]); + if (!run_array) + run_array = rcu_dereference(net->bpf.run_array[type]); - if (!attached) - attached = rcu_dereference(net->bpf.progs[type]); - - if (attached) { + if (run_array) { struct bpf_flow_keys flow_keys; struct bpf_flow_dissector ctx = { .flow_keys = &flow_keys, @@ -966,6 +965,7 @@ bool __skb_flow_dissect(const struct net *net, .data_end = data + hlen, }; __be16 n_proto = proto; + struct bpf_prog *prog; if (skb) { ctx.skb = skb; @@ -976,7 +976,8 @@ bool __skb_flow_dissect(const struct net *net, n_proto = skb->protocol; } - ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff, + prog = READ_ONCE(run_array->items[0].prog); + ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); -- cgit v1.2.3 From ab53cad90eb10c9991f501ba08904680a074ef3d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 25 Jun 2020 16:13:56 +0200 Subject: bpf, netns: Keep a list of attached bpf_link's To support multi-prog link-based attachments for new netns attach types, we need to keep track of more than one bpf_link per attach type. Hence, convert net->bpf.links into a list, that currently can be either empty or have just one item. Instead of reusing bpf_prog_list from bpf-cgroup, we link together bpf_netns_link's themselves. This makes list management simpler as we don't have to allocate, initialize, and later release list elements. We can do this because multi-prog attachment will be available only for bpf_link, and we don't need to build a list of programs attached directly and indirectly via links. No functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200625141357.910330-4-jakub@cloudflare.com --- include/net/netns/bpf.h | 2 +- kernel/bpf/net_namespace.c | 42 +++++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h index a5015bda9979..0ca6a1b87185 100644 --- a/include/net/netns/bpf.h +++ b/include/net/netns/bpf.h @@ -15,7 +15,7 @@ struct netns_bpf { /* Array of programs to run compiled from progs or links */ struct bpf_prog_array __rcu *run_array[MAX_NETNS_BPF_ATTACH_TYPE]; struct bpf_prog *progs[MAX_NETNS_BPF_ATTACH_TYPE]; - struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE]; + struct list_head links[MAX_NETNS_BPF_ATTACH_TYPE]; }; #endif /* __NETNS_BPF_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 0dba97202357..7a34a8caf954 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -19,20 +19,12 @@ struct bpf_netns_link { * with netns_bpf_mutex held. */ struct net *net; + struct list_head node; /* node in list of links attached to net */ }; /* Protects updates to netns_bpf */ DEFINE_MUTEX(netns_bpf_mutex); -/* Must be called with netns_bpf_mutex held. */ -static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link) -{ - struct bpf_netns_link *net_link = - container_of(link, struct bpf_netns_link, link); - - net_link->net = NULL; -} - /* Must be called with netns_bpf_mutex held. */ static void netns_bpf_run_array_detach(struct net *net, enum netns_bpf_attach_type type) @@ -66,7 +58,7 @@ static void bpf_netns_link_release(struct bpf_link *link) goto out_unlock; netns_bpf_run_array_detach(net, type); - net->bpf.links[type] = NULL; + list_del(&net_link->node); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -225,7 +217,7 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) mutex_lock(&netns_bpf_mutex); /* Attaching prog directly is not compatible with links */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { ret = -EEXIST; goto out_unlock; } @@ -279,7 +271,7 @@ static int __netns_bpf_prog_detach(struct net *net, struct bpf_prog *attached; /* Progs attached via links cannot be detached */ - if (net->bpf.links[type]) + if (!list_empty(&net->bpf.links[type])) return -EINVAL; attached = net->bpf.progs[type]; @@ -310,13 +302,15 @@ int netns_bpf_prog_detach(const union bpf_attr *attr) static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, enum netns_bpf_attach_type type) { + struct bpf_netns_link *net_link = + container_of(link, struct bpf_netns_link, link); struct bpf_prog_array *run_array; int err; mutex_lock(&netns_bpf_mutex); /* Allow attaching only one prog or link for now */ - if (net->bpf.links[type]) { + if (!list_empty(&net->bpf.links[type])) { err = -E2BIG; goto out_unlock; } @@ -345,7 +339,7 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link, run_array->items[0].prog = link->prog; rcu_assign_pointer(net->bpf.run_array[type], run_array); - net->bpf.links[type] = link; + list_add_tail(&net_link->node, &net->bpf.links[type]); out_unlock: mutex_unlock(&netns_bpf_mutex); @@ -404,24 +398,34 @@ out_put_net: return err; } +static int __net_init netns_bpf_pernet_init(struct net *net) +{ + int type; + + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + INIT_LIST_HEAD(&net->bpf.links[type]); + + return 0; +} + static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { enum netns_bpf_attach_type type; - struct bpf_link *link; + struct bpf_netns_link *net_link; mutex_lock(&netns_bpf_mutex); for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) { netns_bpf_run_array_detach(net, type); - link = net->bpf.links[type]; - if (link) - bpf_netns_link_auto_detach(link); - else if (net->bpf.progs[type]) + list_for_each_entry(net_link, &net->bpf.links[type], node) + net_link->net = NULL; /* auto-detach link */ + if (net->bpf.progs[type]) bpf_prog_put(net->bpf.progs[type]); } mutex_unlock(&netns_bpf_mutex); } static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .init = netns_bpf_pernet_init, .pre_exit = netns_bpf_pernet_pre_exit, }; -- cgit v1.2.3 From 4ac2add65974e4efafb8d4ccd8fc5660417ea312 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:26 +0100 Subject: bpf: flow_dissector: Check value of unused flags to BPF_PROG_DETACH Using BPF_PROG_DETACH on a flow dissector program supports neither attach_flags nor attach_bpf_fd. Yet no value is enforced for them. Enforce that attach_flags are zero, and require the current program to be passed via attach_bpf_fd. This allows us to remove the check for CAP_SYS_ADMIN, since userspace can now no longer remove arbitrary flow dissector programs. Fixes: b27f7bb590ba ("flow_dissector: Move out netns_bpf prog callbacks") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-3-lmb@cloudflare.com --- include/linux/bpf-netns.h | 5 +++-- kernel/bpf/net_namespace.c | 19 +++++++++++++++---- kernel/bpf/syscall.c | 4 +--- 3 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h index 4052d649f36d..47d5b0c708c9 100644 --- a/include/linux/bpf-netns.h +++ b/include/linux/bpf-netns.h @@ -33,7 +33,7 @@ int netns_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); -int netns_bpf_prog_detach(const union bpf_attr *attr); +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog); #else @@ -49,7 +49,8 @@ static inline int netns_bpf_prog_attach(const union bpf_attr *attr, return -EOPNOTSUPP; } -static inline int netns_bpf_prog_detach(const union bpf_attr *attr) +static inline int netns_bpf_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) { return -EOPNOTSUPP; } diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 03045f45afec..3dbc29b6f51d 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -269,7 +269,8 @@ out_unlock: /* Must be called with netns_bpf_mutex held. */ static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) + enum netns_bpf_attach_type type, + struct bpf_prog *old) { struct bpf_prog *attached; @@ -278,7 +279,7 @@ static int __netns_bpf_prog_detach(struct net *net, return -EINVAL; attached = net->bpf.progs[type]; - if (!attached) + if (!attached || attached != old) return -ENOENT; netns_bpf_run_array_detach(net, type); net->bpf.progs[type] = NULL; @@ -286,19 +287,29 @@ static int __netns_bpf_prog_detach(struct net *net, return 0; } -int netns_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) { enum netns_bpf_attach_type type; + struct bpf_prog *prog; int ret; + if (attr->target_fd) + return -EINVAL; + type = to_netns_bpf_attach_type(attr->attach_type); if (type < 0) return -EINVAL; + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog)) + return PTR_ERR(prog); + mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type, prog); mutex_unlock(&netns_bpf_mutex); + bpf_prog_put(prog); + return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7d946435587d..28c6ef759037 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2897,9 +2897,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - return netns_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr, ptype); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: -- cgit v1.2.3 From bb0de3131f4c60a9bf976681e0fe4d1e55c7a821 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 29 Jun 2020 10:56:28 +0100 Subject: bpf: sockmap: Require attach_bpf_fd when detaching a program The sockmap code currently ignores the value of attach_bpf_fd when detaching a program. This is contrary to the usual behaviour of checking that attach_bpf_fd represents the currently attached program. Ensure that attach_bpf_fd is indeed the currently attached program. It turns out that all sockmap selftests already do this, which indicates that this is unlikely to cause breakage. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200629095630.7933-5-lmb@cloudflare.com --- include/linux/bpf.h | 13 +++++++++++-- include/linux/skmsg.h | 13 +++++++++++++ kernel/bpf/syscall.c | 2 +- net/core/sock_map.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 70 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 07052d44bca1..9750a1902ee5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1543,13 +1543,16 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_BPF_STREAM_PARSER) -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which); +int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else static inline int sock_map_prog_update(struct bpf_map *map, - struct bpf_prog *prog, u32 which) + struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { return -EOPNOTSUPP; } @@ -1559,6 +1562,12 @@ static inline int sock_map_get_from_fd(const union bpf_attr *attr, { return -EINVAL; } + +static inline int sock_map_prog_detach(const union bpf_attr *attr, + enum bpf_prog_type ptype) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 08674cd14d5a..1e9ed840b9fc 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -430,6 +430,19 @@ static inline void psock_set_prog(struct bpf_prog **pprog, bpf_prog_put(prog); } +static inline int psock_replace_prog(struct bpf_prog **pprog, + struct bpf_prog *prog, + struct bpf_prog *old) +{ + if (cmpxchg(pprog, old, prog) != old) + return -ENOENT; + + if (old) + bpf_prog_put(old); + + return 0; +} + static inline void psock_progs_drop(struct sk_psock_progs *progs) { psock_set_prog(&progs->msg_parser, NULL); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 28c6ef759037..a74fce8ce043 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2893,7 +2893,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) switch (ptype) { case BPF_PROG_TYPE_SK_MSG: case BPF_PROG_TYPE_SK_SKB: - return sock_map_get_from_fd(attr, NULL); + return sock_map_prog_detach(attr, ptype); case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 58016a5c63ff..0971f17e8e54 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -77,7 +77,42 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - ret = sock_map_prog_update(map, prog, attr->attach_type); + ret = sock_map_prog_update(map, prog, NULL, attr->attach_type); + fdput(f); + return ret; +} + +int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +{ + u32 ufd = attr->target_fd; + struct bpf_prog *prog; + struct bpf_map *map; + struct fd f; + int ret; + + if (attr->attach_flags || attr->replace_bpf_fd) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + prog = bpf_prog_get(attr->attach_bpf_fd); + if (IS_ERR(prog)) { + ret = PTR_ERR(prog); + goto put_map; + } + + if (prog->type != ptype) { + ret = -EINVAL; + goto put_prog; + } + + ret = sock_map_prog_update(map, NULL, prog, attr->attach_type); +put_prog: + bpf_prog_put(prog); +put_map: fdput(f); return ret; } @@ -1206,27 +1241,32 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) } int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - u32 which) + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog **pprog; if (!progs) return -EOPNOTSUPP; switch (which) { case BPF_SK_MSG_VERDICT: - psock_set_prog(&progs->msg_parser, prog); + pprog = &progs->msg_parser; break; case BPF_SK_SKB_STREAM_PARSER: - psock_set_prog(&progs->skb_parser, prog); + pprog = &progs->skb_parser; break; case BPF_SK_SKB_STREAM_VERDICT: - psock_set_prog(&progs->skb_verdict, prog); + pprog = &progs->skb_verdict; break; default: return -EOPNOTSUPP; } + if (old) + return psock_replace_prog(pprog, prog, old); + + psock_set_prog(pprog, prog); return 0; } -- cgit v1.2.3 From e91b48162332480f5840902268108bb7fb7a44c7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 30 Jun 2020 17:32:54 +0200 Subject: task_work: teach task_work_add() to do signal_wake_up() So that the target task will exit the wait_event_interruptible-like loop and call task_work_run() asap. The patch turns "bool notify" into 0,TWA_RESUME,TWA_SIGNAL enum, the new TWA_SIGNAL flag implies signal_wake_up(). However, it needs to avoid the race with recalc_sigpending(), so the patch also adds the new JOBCTL_TASK_WORK bit included in JOBCTL_PENDING_MASK. TODO: once this patch is merged we need to change all current users of task_work_add(notify = true) to use TWA_RESUME. Cc: stable@vger.kernel.org # v5.7 Acked-by: Peter Zijlstra (Intel) Signed-off-by: Oleg Nesterov Signed-off-by: Jens Axboe --- include/linux/sched/jobctl.h | 4 +++- include/linux/task_work.h | 5 ++++- kernel/signal.c | 10 +++++++--- kernel/task_work.c | 16 ++++++++++++++-- 4 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index fa067de9f1a9..d2b4204ba4d3 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -19,6 +19,7 @@ struct task_struct; #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ +#define JOBCTL_TASK_WORK_BIT 24 /* set by TWA_SIGNAL */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -28,9 +29,10 @@ struct task_struct; #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) +#define JOBCTL_TASK_WORK (1UL << JOBCTL_TASK_WORK_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) -#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) +#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK | JOBCTL_TASK_WORK) extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); diff --git a/include/linux/task_work.h b/include/linux/task_work.h index bd9a6a91c097..0fb93aafa478 100644 --- a/include/linux/task_work.h +++ b/include/linux/task_work.h @@ -13,7 +13,10 @@ init_task_work(struct callback_head *twork, task_work_func_t func) twork->func = func; } -int task_work_add(struct task_struct *task, struct callback_head *twork, bool); +#define TWA_RESUME 1 +#define TWA_SIGNAL 2 +int task_work_add(struct task_struct *task, struct callback_head *twork, int); + struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t); void task_work_run(void); diff --git a/kernel/signal.c b/kernel/signal.c index 5ca48cc5da76..ee22ec78fd6d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2529,9 +2529,6 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; - if (unlikely(current->task_works)) - task_work_run(); - if (unlikely(uprobe_deny_signal())) return false; @@ -2544,6 +2541,13 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); + current->jobctl &= ~JOBCTL_TASK_WORK; + if (unlikely(current->task_works)) { + spin_unlock_irq(&sighand->siglock); + task_work_run(); + goto relock; + } + /* * Every stopped thread goes here after wakeup. Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes diff --git a/kernel/task_work.c b/kernel/task_work.c index 825f28259a19..5c0848ca1287 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -25,9 +25,10 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ * 0 if succeeds or -ESRCH. */ int -task_work_add(struct task_struct *task, struct callback_head *work, bool notify) +task_work_add(struct task_struct *task, struct callback_head *work, int notify) { struct callback_head *head; + unsigned long flags; do { head = READ_ONCE(task->task_works); @@ -36,8 +37,19 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) work->next = head; } while (cmpxchg(&task->task_works, head, work) != head); - if (notify) + switch (notify) { + case TWA_RESUME: set_notify_resume(task); + break; + case TWA_SIGNAL: + if (lock_task_sighand(task, &flags)) { + task->jobctl |= JOBCTL_TASK_WORK; + signal_wake_up(task, 0); + unlock_task_sighand(task, &flags); + } + break; + } + return 0; } -- cgit v1.2.3 From 2606aff916854b61234bf85001be9777bab2d5f8 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Jun 2020 19:06:18 -0600 Subject: net: ip_tunnel: add header_ops for layer 3 devices Some devices that take straight up layer 3 packets benefit from having a shared header_ops so that AF_PACKET sockets can inject packets that are recognized. This shared infrastructure will be used by other drivers that currently can't inject packets using AF_PACKET. It also exposes the parser function, as it is useful in standalone form too. Signed-off-by: Jason A. Donenfeld Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 +++ net/ipv4/ip_tunnel_core.c | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 076e5d7db7d3..36025dea7612 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -290,6 +290,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); +extern const struct header_ops ip_tunnel_header_ops; +__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); + struct ip_tunnel_encap_ops { size_t (*encap_hlen)(struct ip_tunnel_encap *e); int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 181b7a2a0247..f8b419e2475c 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -844,3 +844,21 @@ void ip_tunnel_unneed_metadata(void) static_branch_dec(&ip_tunnel_metadata_cnt); } EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); + +/* Returns either the correct skb->protocol value, or 0 if invalid. */ +__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb) +{ + if (skb_network_header(skb) >= skb->head && + (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) && + ip_hdr(skb)->version == 4) + return htons(ETH_P_IP); + if (skb_network_header(skb) >= skb->head && + (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) && + ipv6_hdr(skb)->version == 6) + return htons(ETH_P_IPV6); + return 0; +} +EXPORT_SYMBOL(ip_tunnel_parse_protocol); + +const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol }; +EXPORT_SYMBOL(ip_tunnel_header_ops); -- cgit v1.2.3 From 5396956cc7c6874180c9bfc1ceceb02b739a6a87 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 22 Jun 2020 19:12:48 +0300 Subject: PCI: Make pcie_find_root_port() work for Root Ports Commit 6ae72bfa656e ("PCI: Unify pcie_find_root_port() and pci_find_pcie_root_port()") broke acpi_pci_bridge_d3() because calling pcie_find_root_port() on a Root Port returned NULL when it should return the Root Port, which in turn broke power management of PCIe hierarchies. Rework pcie_find_root_port() so it returns its argument when it is already a Root Port. [bhelgaas: test device only once, test for PCIe] Fixes: 6ae72bfa656e ("PCI: Unify pcie_find_root_port() and pci_find_pcie_root_port()") Link: https://lore.kernel.org/r/20200622161248.51099-1-mika.westerberg@linux.intel.com Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index c79d83304e52..34c1c4f45288 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2169,12 +2169,11 @@ static inline int pci_pcie_type(const struct pci_dev *dev) */ static inline struct pci_dev *pcie_find_root_port(struct pci_dev *dev) { - struct pci_dev *bridge = pci_upstream_bridge(dev); - - while (bridge) { - if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) - return bridge; - bridge = pci_upstream_bridge(bridge); + while (dev) { + if (pci_is_pcie(dev) && + pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT) + return dev; + dev = pci_upstream_bridge(dev); } return NULL; -- cgit v1.2.3 From 1e82a62fec613844da9e558f3493540a5b7a7b67 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Tue, 30 Jun 2020 11:50:17 -0600 Subject: genetlink: remove genl_bind A potential deadlock can occur during registering or unregistering a new generic netlink family between the main nl_table_lock and the cb_lock where each thread wants the lock held by the other, as demonstrated below. 1) Thread 1 is performing a netlink_bind() operation on a socket. As part of this call, it will call netlink_lock_table(), incrementing the nl_table_users count to 1. 2) Thread 2 is registering (or unregistering) a genl_family via the genl_(un)register_family() API. The cb_lock semaphore will be taken for writing. 3) Thread 1 will call genl_bind() as part of the bind operation to handle subscribing to GENL multicast groups at the request of the user. It will attempt to take the cb_lock semaphore for reading, but it will fail and be scheduled away, waiting for Thread 2 to finish the write. 4) Thread 2 will call netlink_table_grab() during the (un)registration call. However, as Thread 1 has incremented nl_table_users, it will not be able to proceed, and both threads will be stuck waiting for the other. genl_bind() is a noop, unless a genl_family implements the mcast_bind() function to handle setting up family-specific multicast operations. Since no one in-tree uses this functionality as Cong pointed out, simply removing the genl_bind() function will remove the possibility for deadlock, as there is no attempt by Thread 1 above to take the cb_lock semaphore. Fixes: c380d9a7afff ("genetlink: pass multicast bind/unbind to families") Suggested-by: Cong Wang Acked-by: Johannes Berg Reported-by: kernel test robot Signed-off-by: Sean Tranchetti Signed-off-by: David S. Miller --- include/net/genetlink.h | 8 -------- net/netlink/genetlink.c | 49 ------------------------------------------------- 2 files changed, 57 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index ad71ed4f55ff..6e5f1e1aa822 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -35,12 +35,6 @@ struct genl_info; * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks - * @mcast_bind: a socket bound to the given multicast group (which - * is given as the offset into the groups array) - * @mcast_unbind: a socket was unbound from the given multicast group. - * Note that unbind() will not be called symmetrically if the - * generic netlink family is removed while there are still open - * sockets. * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family @@ -63,8 +57,6 @@ struct genl_family { void (*post_doit)(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info); - int (*mcast_bind)(struct net *net, int group); - void (*mcast_unbind)(struct net *net, int group); const struct genl_ops * ops; const struct genl_multicast_group *mcgrps; unsigned int n_ops; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index a914b9365a46..9395ee8a868d 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1144,60 +1144,11 @@ static struct genl_family genl_ctrl __ro_after_init = { .netnsok = true, }; -static int genl_bind(struct net *net, int group) -{ - struct genl_family *f; - int err = -ENOENT; - unsigned int id; - - down_read(&cb_lock); - - idr_for_each_entry(&genl_fam_idr, f, id) { - if (group >= f->mcgrp_offset && - group < f->mcgrp_offset + f->n_mcgrps) { - int fam_grp = group - f->mcgrp_offset; - - if (!f->netnsok && net != &init_net) - err = -ENOENT; - else if (f->mcast_bind) - err = f->mcast_bind(net, fam_grp); - else - err = 0; - break; - } - } - up_read(&cb_lock); - - return err; -} - -static void genl_unbind(struct net *net, int group) -{ - struct genl_family *f; - unsigned int id; - - down_read(&cb_lock); - - idr_for_each_entry(&genl_fam_idr, f, id) { - if (group >= f->mcgrp_offset && - group < f->mcgrp_offset + f->n_mcgrps) { - int fam_grp = group - f->mcgrp_offset; - - if (f->mcast_unbind) - f->mcast_unbind(net, fam_grp); - break; - } - } - up_read(&cb_lock); -} - static int __net_init genl_pernet_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = genl_rcv, .flags = NL_CFG_F_NONROOT_RECV, - .bind = genl_bind, - .unbind = genl_unbind, }; /* we'll bump the group number right afterwards */ -- cgit v1.2.3 From d7bf2ebebc2bd61ab95e2a8e33541ef282f303d4 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Fri, 3 Jul 2020 22:26:43 +0200 Subject: sched: consistently handle layer3 header accesses in the presence of VLANs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a couple of places in net/sched/ that check skb->protocol and act on the value there. However, in the presence of VLAN tags, the value stored in skb->protocol can be inconsistent based on whether VLAN acceleration is enabled. The commit quoted in the Fixes tag below fixed the users of skb->protocol to use a helper that will always see the VLAN ethertype. However, most of the callers don't actually handle the VLAN ethertype, but expect to find the IP header type in the protocol field. This means that things like changing the ECN field, or parsing diffserv values, stops working if there's a VLAN tag, or if there are multiple nested VLAN tags (QinQ). To fix this, change the helper to take an argument that indicates whether the caller wants to skip the VLAN tags or not. When skipping VLAN tags, we make sure to skip all of them, so behaviour is consistent even in QinQ mode. To make the helper usable from the ECN code, move it to if_vlan.h instead of pkt_sched.h. v3: - Remove empty lines - Move vlan variable definitions inside loop in skb_protocol() - Also use skb_protocol() helper in IP{,6}_ECN_decapsulate() and bpf_skb_ecn_set_ce() v2: - Use eth_type_vlan() helper in skb_protocol() - Also fix code that reads skb->protocol directly - Change a couple of 'if/else if' statements to switch constructs to avoid calling the helper twice Reported-by: Ilya Ponetayev Fixes: d8b9605d2697 ("net: sched: fix skb->protocol use in case of accelerated vlan path") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 28 ++++++++++++++++++++++++++++ include/net/inet_ecn.h | 25 +++++++++++++++++-------- include/net/pkt_sched.h | 11 ----------- net/core/filter.c | 10 +++++++--- net/sched/act_connmark.c | 9 ++++++--- net/sched/act_csum.c | 2 +- net/sched/act_ct.c | 9 ++++----- net/sched/act_ctinfo.c | 9 ++++++--- net/sched/act_mpls.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/cls_api.c | 2 +- net/sched/cls_flow.c | 8 ++++---- net/sched/cls_flower.c | 2 +- net/sched/em_ipset.c | 2 +- net/sched/em_ipt.c | 2 +- net/sched/em_meta.c | 2 +- net/sched/sch_cake.c | 4 ++-- net/sched/sch_dsmark.c | 6 +++--- net/sched/sch_teql.c | 2 +- 19 files changed, 86 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index b05e855f1ddd..427a5b8597c2 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -308,6 +308,34 @@ static inline bool eth_type_vlan(__be16 ethertype) } } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); + __be16 proto = skb->protocol; + + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; + + while (eth_type_vlan(proto)) { + struct vlan_hdr vhdr, *vh; + + vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); + if (!vh) + break; + + proto = vh->h_vlan_encapsulated_proto; + offset += sizeof(vhdr); + } + + return proto; +} + static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index 0f0d1efe06dd..e1eaf1780288 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -172,7 +173,7 @@ static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) static inline int INET_ECN_set_ce(struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) @@ -191,7 +192,7 @@ static inline int INET_ECN_set_ce(struct sk_buff *skb) static inline int INET_ECN_set_ect1(struct sk_buff *skb) { - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) @@ -272,12 +273,16 @@ static inline int IP_ECN_decapsulate(const struct iphdr *oiph, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, oiph->tos, inner); } @@ -287,12 +292,16 @@ static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, { __u8 inner; - if (skb->protocol == htons(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) + break; + case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else + break; + default: return 0; + } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9092e697059e..ac8c890a2657 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -136,17 +136,6 @@ static inline void qdisc_run(struct Qdisc *q) } } -static inline __be16 tc_skb_protocol(const struct sk_buff *skb) -{ - /* We need to take extra care in case the skb came via - * vlan accelerated path. In that case, use skb->vlan_proto - * as the original vlan header was already stripped. - */ - if (skb_vlan_tag_present(skb)) - return skb->vlan_proto; - return skb->protocol; -} - /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ diff --git a/net/core/filter.c b/net/core/filter.c index 73395384afe2..82e1b5b06167 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5853,12 +5853,16 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) { unsigned int iphdr_len; - if (skb->protocol == cpu_to_be16(ETH_P_IP)) + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): iphdr_len = sizeof(struct iphdr); - else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + break; + case cpu_to_be16(ETH_P_IPV6): iphdr_len = sizeof(struct ipv6hdr); - else + break; + default: return 0; + } if (skb_headlen(skb) < iphdr_len) return 0; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 43a243081e7d..f901421b0634 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -43,17 +43,20 @@ static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, tcf_lastuse_update(&ca->tcf_tm); bstats_update(&ca->tcf_bstats, skb); - if (skb->protocol == htons(ETH_P_IP)) { + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): if (skb->len < sizeof(struct iphdr)) goto out; proto = NFPROTO_IPV4; - } else if (skb->protocol == htons(ETH_P_IPV6)) { + break; + case htons(ETH_P_IPV6): if (skb->len < sizeof(struct ipv6hdr)) goto out; proto = NFPROTO_IPV6; - } else { + break; + default: goto out; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index cb8608f0a77a..c60674cf25c4 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -587,7 +587,7 @@ static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, goto drop; update_flags = params->update_flags; - protocol = tc_skb_protocol(skb); + protocol = skb_protocol(skb, false); again: switch (protocol) { case cpu_to_be16(ETH_P_IP): diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index e9f3576cbf71..86ed02487467 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -624,7 +624,7 @@ static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) { u8 family = NFPROTO_UNSPEC; - switch (skb->protocol) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): family = NFPROTO_IPV4; break; @@ -748,6 +748,7 @@ static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { + __be16 proto = skb_protocol(skb, true); int hooknum, err = NF_ACCEPT; /* See HOOK2MANIP(). */ @@ -759,14 +760,13 @@ static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: - if (skb->protocol == htons(ETH_P_IP) && + if (proto == htons(ETH_P_IP) && ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, hooknum)) err = NF_DROP; goto out; - } else if (IS_ENABLED(CONFIG_IPV6) && - skb->protocol == htons(ETH_P_IPV6)) { + } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { __be16 frag_off; u8 nexthdr = ipv6_hdr(skb)->nexthdr; int hdrlen = ipv6_skip_exthdr(skb, @@ -1550,4 +1550,3 @@ MODULE_AUTHOR("Yossi Kuperman "); MODULE_AUTHOR("Marcelo Ricardo Leitner "); MODULE_DESCRIPTION("Connection tracking action"); MODULE_LICENSE("GPL v2"); - diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 19649623493b..b5042f3ea079 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -96,19 +96,22 @@ static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, action = READ_ONCE(ca->tcf_action); wlen = skb_network_offset(skb); - if (tc_skb_protocol(skb) == htons(ETH_P_IP)) { + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV4; - } else if (tc_skb_protocol(skb) == htons(ETH_P_IPV6)) { + break; + case htons(ETH_P_IPV6): wlen += sizeof(struct ipv6hdr); if (!pskb_may_pull(skb, wlen)) goto out; proto = NFPROTO_IPV6; - } else { + break; + default: goto out; } diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index be3f215cd027..8118e2640979 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -82,7 +82,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, goto drop; break; case TCA_MPLS_ACT_PUSH: - new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb->protocol)); + new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb_protocol(skb, true))); if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len, skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index b125b2be4467..b2b3faa57294 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -41,7 +41,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, if (params->flags & SKBEDIT_F_INHERITDSFIELD) { int wlen = skb_network_offset(skb); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen)) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index faa78b7dd962..e62beec0d844 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1538,7 +1538,7 @@ static inline int __tcf_classify(struct sk_buff *skb, reclassify: #endif for (; tp; tp = rcu_dereference_bh(tp->next)) { - __be16 protocol = tc_skb_protocol(skb); + __be16 protocol = skb_protocol(skb, false); int err; if (tp->protocol != protocol && diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 80ae7b9fa90a..ab53a93b2f2b 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -80,7 +80,7 @@ static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) if (dst) return ntohl(dst); - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_proto(const struct sk_buff *skb, @@ -104,7 +104,7 @@ static u32 flow_get_proto_dst(const struct sk_buff *skb, if (flow->ports.ports) return ntohs(flow->ports.dst); - return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); + return addr_fold(skb_dst(skb)) ^ (__force u16)skb_protocol(skb, true); } static u32 flow_get_iif(const struct sk_buff *skb) @@ -151,7 +151,7 @@ static u32 flow_get_nfct(const struct sk_buff *skb) static u32 flow_get_nfct_src(const struct sk_buff *skb, const struct flow_keys *flow) { - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, src.u3.ip)); case htons(ETH_P_IPV6): @@ -164,7 +164,7 @@ fallback: static u32 flow_get_nfct_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return ntohl(CTTUPLE(skb, dst.u3.ip)); case htons(ETH_P_IPV6): diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b2da37286082..e30bd969fc48 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -313,7 +313,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* skb_flow_dissect() does not set n_proto in case an unknown * protocol, so do it rather here. */ - skb_key.basic.n_proto = skb->protocol; + skb_key.basic.n_proto = skb_protocol(skb, false); skb_flow_dissect_tunnel_info(skb, &mask->dissector, &skb_key); skb_flow_dissect_ct(skb, &mask->dissector, &skb_key, fl_ct_info_to_flower_map, diff --git a/net/sched/em_ipset.c b/net/sched/em_ipset.c index df00566d327d..c95cf86fb431 100644 --- a/net/sched/em_ipset.c +++ b/net/sched/em_ipset.c @@ -59,7 +59,7 @@ static int em_ipset_match(struct sk_buff *skb, struct tcf_ematch *em, }; int ret, network_offset; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): state.pf = NFPROTO_IPV4; if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index 18755d29fd15..3650117da47f 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -212,7 +212,7 @@ static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, struct nf_hook_state state; int ret; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) return 0; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index d99966a55c84..46254968d390 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -195,7 +195,7 @@ META_COLLECTOR(int_priority) META_COLLECTOR(int_protocol) { /* Let userspace take care of the byte ordering */ - dst->value = tc_skb_protocol(skb); + dst->value = skb_protocol(skb, false); } META_COLLECTOR(int_pkttype) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index ca813697728e..ebaeec1e5c82 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -592,7 +592,7 @@ static bool cake_update_flowkeys(struct flow_keys *keys, bool rev = !skb->_nfct, upd = false; __be32 ip; - if (tc_skb_protocol(skb) != htons(ETH_P_IP)) + if (skb_protocol(skb, true) != htons(ETH_P_IP)) return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) @@ -1557,7 +1557,7 @@ static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) u16 *buf, buf_; u8 dscp; - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); if (unlikely(!buf)) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 05605b30bef3..2b88710994d7 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -210,7 +210,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (p->set_tc_index) { int wlen = skb_network_offset(skb); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): wlen += sizeof(struct iphdr); if (!pskb_may_pull(skb, wlen) || @@ -303,7 +303,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) index = skb->tc_index & (p->indices - 1); pr_debug("index %d->%d\n", skb->tc_index, index); - switch (tc_skb_protocol(skb)) { + switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, p->mv[index].value); @@ -320,7 +320,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) */ if (p->mv[index].mask != 0xff || p->mv[index].value) pr_warn("%s: unsupported protocol %d\n", - __func__, ntohs(tc_skb_protocol(skb))); + __func__, ntohs(skb_protocol(skb, true))); break; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 689ef6f3ded8..2f1f0a378408 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -239,7 +239,7 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, n, dev); - err = dev_hard_header(skb, dev, ntohs(tc_skb_protocol(skb)), + err = dev_hard_header(skb, dev, ntohs(skb_protocol(skb, false)), haddr, NULL, skb->len); if (err < 0) -- cgit v1.2.3 From 68d237056e007c88031d80900cdba0945121a287 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 30 Jun 2020 10:16:02 +0200 Subject: scatterlist: protect parameters of the sg_table related macros Add brackets to protect parameters of the recently added sg_table related macros from side-effects. Fixes: 709d6d73c756 ("scatterlist: add generic wrappers for iterating over sgtable objects") Signed-off-by: Marek Szyprowski Signed-off-by: Christoph Hellwig --- include/linux/scatterlist.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 4f922afb607a..45cf7b69d852 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -155,7 +155,7 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, * Loop over each sg element in the given sg_table object. */ #define for_each_sgtable_sg(sgt, sg, i) \ - for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) + for_each_sg((sgt)->sgl, sg, (sgt)->orig_nents, i) /* * Loop over each sg element in the given *DMA mapped* sg_table object. @@ -163,7 +163,7 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, * of the each element. */ #define for_each_sgtable_dma_sg(sgt, sg, i) \ - for_each_sg(sgt->sgl, sg, sgt->nents, i) + for_each_sg((sgt)->sgl, sg, (sgt)->nents, i) /** * sg_chain - Chain two sglists together @@ -451,7 +451,7 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) * See also for_each_sg_page(). In each loop it operates on PAGE_SIZE unit. */ #define for_each_sgtable_page(sgt, piter, pgoffset) \ - for_each_sg_page(sgt->sgl, piter, sgt->orig_nents, pgoffset) + for_each_sg_page((sgt)->sgl, piter, (sgt)->orig_nents, pgoffset) /** * for_each_sgtable_dma_page - iterate over the DMA mapped sg_table object @@ -465,7 +465,7 @@ sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) * unit. */ #define for_each_sgtable_dma_page(sgt, dma_iter, pgoffset) \ - for_each_sg_dma_page(sgt->sgl, dma_iter, sgt->nents, pgoffset) + for_each_sg_dma_page((sgt)->sgl, dma_iter, (sgt)->nents, pgoffset) /* -- cgit v1.2.3 From f79a732a8325dfbd570d87f1435019d7e5501c6d Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Mon, 29 Jun 2020 19:17:37 +0530 Subject: ALSA: compress: fix partial_drain completion state On partial_drain completion we should be in SNDRV_PCM_STATE_RUNNING state, so set that for partially draining streams in snd_compr_drain_notify() and use a flag for partially draining streams While at it, add locks for stream state change in snd_compr_drain_notify() as well. Fixes: f44f2a5417b2 ("ALSA: compress: fix drain calls blocking other compress functions (v6)") Reviewed-by: Srinivas Kandagatla Tested-by: Srinivas Kandagatla Reviewed-by: Charles Keepax Tested-by: Charles Keepax Signed-off-by: Vinod Koul Link: https://lore.kernel.org/r/20200629134737.105993-4-vkoul@kernel.org Signed-off-by: Takashi Iwai --- include/sound/compress_driver.h | 10 +++++++++- sound/core/compress_offload.c | 4 ++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/compress_driver.h b/include/sound/compress_driver.h index 6ce8effa0b12..70cbc5095e72 100644 --- a/include/sound/compress_driver.h +++ b/include/sound/compress_driver.h @@ -66,6 +66,7 @@ struct snd_compr_runtime { * @direction: stream direction, playback/recording * @metadata_set: metadata set flag, true when set * @next_track: has userspace signal next track transition, true when set + * @partial_drain: undergoing partial_drain for stream, true when set * @private_data: pointer to DSP private data * @dma_buffer: allocated buffer if any */ @@ -78,6 +79,7 @@ struct snd_compr_stream { enum snd_compr_direction direction; bool metadata_set; bool next_track; + bool partial_drain; void *private_data; struct snd_dma_buffer dma_buffer; }; @@ -182,7 +184,13 @@ static inline void snd_compr_drain_notify(struct snd_compr_stream *stream) if (snd_BUG_ON(!stream)) return; - stream->runtime->state = SNDRV_PCM_STATE_SETUP; + /* for partial_drain case we are back to running state on success */ + if (stream->partial_drain) { + stream->runtime->state = SNDRV_PCM_STATE_RUNNING; + stream->partial_drain = false; /* clear this flag as well */ + } else { + stream->runtime->state = SNDRV_PCM_STATE_SETUP; + } wake_up(&stream->runtime->sleep); } diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index 509290f2efa8..0e53f6f31916 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -764,6 +764,9 @@ static int snd_compr_stop(struct snd_compr_stream *stream) retval = stream->ops->trigger(stream, SNDRV_PCM_TRIGGER_STOP); if (!retval) { + /* clear flags and stop any drain wait */ + stream->partial_drain = false; + stream->metadata_set = false; snd_compr_drain_notify(stream); stream->runtime->total_bytes_available = 0; stream->runtime->total_bytes_transferred = 0; @@ -921,6 +924,7 @@ static int snd_compr_partial_drain(struct snd_compr_stream *stream) if (stream->next_track == false) return -EPERM; + stream->partial_drain = true; retval = stream->ops->trigger(stream, SND_COMPR_TRIGGER_PARTIAL_DRAIN); if (retval) { pr_debug("Partial drain returned failure\n"); -- cgit v1.2.3 From ad0f75e5f57ccbceec13274e1e242f2b5a6397ed Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Jul 2020 11:52:56 -0700 Subject: cgroup: fix cgroup_sk_alloc() for sk_clone_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we clone a socket in sk_clone_lock(), its sk_cgrp_data is copied, so the cgroup refcnt must be taken too. And, unlike the sk_alloc() path, sock_update_netprioidx() is not called here. Therefore, it is safe and necessary to grab the cgroup refcnt even when cgroup_sk_alloc is disabled. sk_clone_lock() is in BH context anyway, the in_interrupt() would terminate this function if called there. And for sk_alloc() skcd->val is always zero. So it's safe to factor out the code to make it more readable. The global variable 'cgroup_sk_alloc_disabled' is used to determine whether to take these reference counts. It is impossible to make the reference counting correct unless we save this bit of information in skcd->val. So, add a new bit there to record whether the socket has already taken the reference counts. This obviously relies on kmalloc() to align cgroup pointers to at least 4 bytes, ARCH_KMALLOC_MINALIGN is certainly larger than that. This bug seems to be introduced since the beginning, commit d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") tried to fix it but not compeletely. It seems not easy to trigger until the recent commit 090e28b229af ("netprio_cgroup: Fix unlimited memory leak of v2 cgroups") was merged. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Reported-by: Cameron Berkenpas Reported-by: Peter Geis Reported-by: Lu Fengqi Reported-by: Daniël Sonck Reported-by: Zhang Qiang Tested-by: Cameron Berkenpas Tested-by: Peter Geis Tested-by: Thomas Lamprecht Cc: Daniel Borkmann Cc: Zefan Li Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 6 ++++-- include/linux/cgroup.h | 4 +++- kernel/cgroup/cgroup.c | 31 +++++++++++++++++++------------ net/core/sock.c | 2 +- 4 files changed, 27 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 52661155f85f..4f1cd0edc57d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -790,7 +790,8 @@ struct sock_cgroup_data { union { #ifdef __LITTLE_ENDIAN struct { - u8 is_data; + u8 is_data : 1; + u8 no_refcnt : 1; u8 padding; u16 prioidx; u32 classid; @@ -800,7 +801,8 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; - u8 is_data; + u8 no_refcnt : 1; + u8 is_data : 1; } __packed; #endif u64 val; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4598e4da6b1b..618838c48313 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -822,6 +822,7 @@ extern spinlock_t cgroup_sk_update_lock; void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) @@ -835,7 +836,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) */ v = READ_ONCE(skcd->val); - if (v & 1) + if (v & 3) return &cgrp_dfl_root.cgrp; return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; @@ -847,6 +848,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1ea181a58465..dd247747ec14 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6439,18 +6439,8 @@ void cgroup_sk_alloc_disable(void) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) - return; - - /* Socket clone path */ - if (skcd->val) { - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); + if (cgroup_sk_alloc_disabled) { + skcd->no_refcnt = 1; return; } @@ -6475,10 +6465,27 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) rcu_read_unlock(); } +void cgroup_sk_clone(struct sock_cgroup_data *skcd) +{ + if (skcd->val) { + if (skcd->no_refcnt) + return; + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); + } +} + void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); + if (skcd->no_refcnt) + return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/net/core/sock.c b/net/core/sock.c index d832c650287c..2e5b7870e5d3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1926,7 +1926,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; - cgroup_sk_alloc(&newsk->sk_cgrp_data); + cgroup_sk_clone(&newsk->sk_cgrp_data); rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); -- cgit v1.2.3 From 41da51bce36f44eefc1e3d0f47d18841cbd065ba Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 21 Nov 2019 23:25:07 +0000 Subject: fs: Add IOCB_NOIO flag for generic_file_read_iter Add an IOCB_NOIO flag that indicates to generic_file_read_iter that it shouldn't trigger any filesystem I/O for the actual request or for readahead. This allows to do tentative reads out of the page cache as some filesystems allow, and to take the appropriate locks and retry the reads only if the requested pages are not cached. Signed-off-by: Andreas Gruenbacher --- include/linux/fs.h | 1 + mm/filemap.c | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f881a892ea7..4b7cb76e5837 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,7 @@ enum rw_hint { #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) +#define IOCB_NOIO (1 << 9) struct kiocb { struct file *ki_filp; diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb..385759c4ce4b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2028,7 +2028,7 @@ find_page: page = find_get_page(mapping, index); if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto would_block; page_cache_sync_readahead(mapping, ra, filp, @@ -2038,6 +2038,10 @@ find_page: goto no_cached_page; } if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_NOIO) { + put_page(page); + goto out; + } page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); @@ -2160,6 +2164,11 @@ page_not_up_to_date_locked: } readpage: + if (iocb->ki_flags & IOCB_NOIO) { + unlock_page(page); + put_page(page); + goto would_block; + } /* * A previous I/O error may have been due to temporary * failures, eg. multipath errors. @@ -2249,9 +2258,19 @@ EXPORT_SYMBOL_GPL(generic_file_buffered_read); * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall + * be returned when no data can be read without waiting for I/O requests + * to complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O + * requests shall be made for the read or for readahead. When no data + * can be read, -EAGAIN shall be returned. When readahead would be + * triggered, a partial, possibly empty read shall be returned. + * * Return: * * number of bytes copied, even for partial reads - * * negative error code if nothing was read + * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) -- cgit v1.2.3 From 394de110a73395de2ca4516b0de435e91b11b604 Mon Sep 17 00:00:00 2001 From: Martin Varghese Date: Sun, 5 Jul 2020 14:23:49 +0530 Subject: net: Added pointer check for dst->ops->neigh_lookup in dst_neigh_lookup_skb The packets from tunnel devices (eg bareudp) may have only metadata in the dst pointer of skb. Hence a pointer check of neigh_lookup is needed in dst_neigh_lookup_skb Kernel crashes when packets from bareudp device is processed in the kernel neighbour subsytem. [ 133.384484] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 133.385240] #PF: supervisor instruction fetch in kernel mode [ 133.385828] #PF: error_code(0x0010) - not-present page [ 133.386603] PGD 0 P4D 0 [ 133.386875] Oops: 0010 [#1] SMP PTI [ 133.387275] CPU: 0 PID: 5045 Comm: ping Tainted: G W 5.8.0-rc2+ #15 [ 133.388052] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 133.391076] RIP: 0010:0x0 [ 133.392401] Code: Bad RIP value. [ 133.394029] RSP: 0018:ffffb79980003d50 EFLAGS: 00010246 [ 133.396656] RAX: 0000000080000102 RBX: ffff9de2fe0d6600 RCX: ffff9de2fe5e9d00 [ 133.399018] RDX: 0000000000000000 RSI: ffff9de2fe5e9d00 RDI: ffff9de2fc21b400 [ 133.399685] RBP: ffff9de2fe5e9d00 R08: 0000000000000000 R09: 0000000000000000 [ 133.400350] R10: ffff9de2fbc6be22 R11: ffff9de2fe0d6600 R12: ffff9de2fc21b400 [ 133.401010] R13: ffff9de2fe0d6628 R14: 0000000000000001 R15: 0000000000000003 [ 133.401667] FS: 00007fe014918740(0000) GS:ffff9de2fec00000(0000) knlGS:0000000000000000 [ 133.402412] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 133.402948] CR2: ffffffffffffffd6 CR3: 000000003bb72000 CR4: 00000000000006f0 [ 133.403611] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 133.404270] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 133.404933] Call Trace: [ 133.405169] [ 133.405367] __neigh_update+0x5a4/0x8f0 [ 133.405734] arp_process+0x294/0x820 [ 133.406076] ? __netif_receive_skb_core+0x866/0xe70 [ 133.406557] arp_rcv+0x129/0x1c0 [ 133.406882] __netif_receive_skb_one_core+0x95/0xb0 [ 133.407340] process_backlog+0xa7/0x150 [ 133.407705] net_rx_action+0x2af/0x420 [ 133.408457] __do_softirq+0xda/0x2a8 [ 133.408813] asm_call_on_stack+0x12/0x20 [ 133.409290] [ 133.409519] do_softirq_own_stack+0x39/0x50 [ 133.410036] do_softirq+0x50/0x60 [ 133.410401] __local_bh_enable_ip+0x50/0x60 [ 133.410871] ip_finish_output2+0x195/0x530 [ 133.411288] ip_output+0x72/0xf0 [ 133.411673] ? __ip_finish_output+0x1f0/0x1f0 [ 133.412122] ip_send_skb+0x15/0x40 [ 133.412471] raw_sendmsg+0x853/0xab0 [ 133.412855] ? insert_pfn+0xfe/0x270 [ 133.413827] ? vvar_fault+0xec/0x190 [ 133.414772] sock_sendmsg+0x57/0x80 [ 133.415685] __sys_sendto+0xdc/0x160 [ 133.416605] ? syscall_trace_enter+0x1d4/0x2b0 [ 133.417679] ? __audit_syscall_exit+0x1d9/0x280 [ 133.418753] ? __prepare_exit_to_usermode+0x5d/0x1a0 [ 133.419819] __x64_sys_sendto+0x24/0x30 [ 133.420848] do_syscall_64+0x4d/0x90 [ 133.421768] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 133.422833] RIP: 0033:0x7fe013689c03 [ 133.423749] Code: Bad RIP value. [ 133.424624] RSP: 002b:00007ffc7288f418 EFLAGS: 00000246 ORIG_RAX: 000000000000002c [ 133.425940] RAX: ffffffffffffffda RBX: 000056151fc63720 RCX: 00007fe013689c03 [ 133.427225] RDX: 0000000000000040 RSI: 000056151fc63720 RDI: 0000000000000003 [ 133.428481] RBP: 00007ffc72890b30 R08: 000056151fc60500 R09: 0000000000000010 [ 133.429757] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000040 [ 133.431041] R13: 000056151fc636e0 R14: 000056151fc616bc R15: 0000000000000080 [ 133.432481] Modules linked in: mpls_iptunnel act_mirred act_tunnel_key cls_flower sch_ingress veth mpls_router ip_tunnel bareudp ip6_udp_tunnel udp_tunnel macsec udp_diag inet_diag unix_diag af_packet_diag netlink_diag binfmt_misc xt_MASQUERADE iptable_nat xt_addrtype xt_conntrack nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 br_netfilter bridge stp llc ebtable_filter ebtables overlay ip6table_filter ip6_tables iptable_filter sunrpc ext4 mbcache jbd2 pcspkr i2c_piix4 virtio_balloon joydev ip_tables xfs libcrc32c ata_generic qxl pata_acpi drm_ttm_helper ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ata_piix libata virtio_net net_failover virtio_console failover virtio_blk i2c_core virtio_pci virtio_ring serio_raw floppy virtio dm_mirror dm_region_hash dm_log dm_mod [ 133.444045] CR2: 0000000000000000 [ 133.445082] ---[ end trace f4aeee1958fd1638 ]--- [ 133.446236] RIP: 0010:0x0 [ 133.447180] Code: Bad RIP value. [ 133.448152] RSP: 0018:ffffb79980003d50 EFLAGS: 00010246 [ 133.449363] RAX: 0000000080000102 RBX: ffff9de2fe0d6600 RCX: ffff9de2fe5e9d00 [ 133.450835] RDX: 0000000000000000 RSI: ffff9de2fe5e9d00 RDI: ffff9de2fc21b400 [ 133.452237] RBP: ffff9de2fe5e9d00 R08: 0000000000000000 R09: 0000000000000000 [ 133.453722] R10: ffff9de2fbc6be22 R11: ffff9de2fe0d6600 R12: ffff9de2fc21b400 [ 133.455149] R13: ffff9de2fe0d6628 R14: 0000000000000001 R15: 0000000000000003 [ 133.456520] FS: 00007fe014918740(0000) GS:ffff9de2fec00000(0000) knlGS:0000000000000000 [ 133.458046] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 133.459342] CR2: ffffffffffffffd6 CR3: 000000003bb72000 CR4: 00000000000006f0 [ 133.460782] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 133.462240] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 133.463697] Kernel panic - not syncing: Fatal exception in interrupt [ 133.465226] Kernel Offset: 0xfa00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 133.467025] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fixes: aaa0c23cb901 ("Fix dst_neigh_lookup/dst_neigh_lookup_skb return value handling bug") Signed-off-by: Martin Varghese Signed-off-by: David S. Miller --- include/net/dst.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 07adfacd8088..852d8fb36ab7 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -400,7 +400,15 @@ static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, co static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst, struct sk_buff *skb) { - struct neighbour *n = dst->ops->neigh_lookup(dst, skb, NULL); + struct neighbour *n = NULL; + + /* The packets from tunnel devices (eg bareudp) may have only + * metadata in the dst pointer of skb. Hence a pointer check of + * neigh_lookup is needed. + */ + if (dst->ops->neigh_lookup) + n = dst->ops->neigh_lookup(dst, skb, NULL); + return IS_ERR(n) ? NULL : n; } -- cgit v1.2.3 From 469aceddfa3ed16e17ee30533fae45e90f62efd8 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 7 Jul 2020 13:03:25 +0200 Subject: vlan: consolidate VLAN parsing code and limit max parsing depth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toshiaki pointed out that we now have two very similar functions to extract the L3 protocol number in the presence of VLAN tags. And Daniel pointed out that the unbounded parsing loop makes it possible for maliciously crafted packets to loop through potentially hundreds of tags. Fix both of these issues by consolidating the two parsing functions and limiting the VLAN tag parsing to a max depth of 8 tags. As part of this, switch over __vlan_get_protocol() to use skb_header_pointer() instead of pskb_may_pull(), to avoid the possible side effects of the latter and keep the skb pointer 'const' through all the parsing functions. v2: - Use limit of 8 tags instead of 32 (matching XMIT_RECURSION_LIMIT) Reported-by: Toshiaki Makita Reported-by: Daniel Borkmann Fixes: d7bf2ebebc2b ("sched: consistently handle layer3 header accesses in the presence of VLANs") Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 57 +++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 427a5b8597c2..41a518336673 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -25,6 +25,8 @@ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ +#define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ + /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID @@ -308,34 +310,6 @@ static inline bool eth_type_vlan(__be16 ethertype) } } -/* A getter for the SKB protocol field which will handle VLAN tags consistently - * whether VLAN acceleration is enabled or not. - */ -static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) -{ - unsigned int offset = skb_mac_offset(skb) + sizeof(struct ethhdr); - __be16 proto = skb->protocol; - - if (!skip_vlan) - /* VLAN acceleration strips the VLAN header from the skb and - * moves it to skb->vlan_proto - */ - return skb_vlan_tag_present(skb) ? skb->vlan_proto : proto; - - while (eth_type_vlan(proto)) { - struct vlan_hdr vhdr, *vh; - - vh = skb_header_pointer(skb, offset, sizeof(vhdr), &vhdr); - if (!vh) - break; - - proto = vh->h_vlan_encapsulated_proto; - offset += sizeof(vhdr); - } - - return proto; -} - static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { @@ -605,10 +579,10 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { - unsigned int vlan_depth = skb->mac_len; + unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; /* if type is 802.1Q/AD then the header should already be * present at mac_len - VLAN_HLEN (if mac_len > 0), or at @@ -623,13 +597,12 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, vlan_depth = ETH_HLEN; } do { - struct vlan_hdr *vh; + struct vlan_hdr vhdr, *vh; - if (unlikely(!pskb_may_pull(skb, - vlan_depth + VLAN_HLEN))) + vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + if (unlikely(!vh || !--parse_depth)) return 0; - vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } while (eth_type_vlan(type)); @@ -648,11 +621,25 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 vlan_get_protocol(struct sk_buff *skb) +static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } +/* A getter for the SKB protocol field which will handle VLAN tags consistently + * whether VLAN acceleration is enabled or not. + */ +static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) +{ + if (!skip_vlan) + /* VLAN acceleration strips the VLAN header from the skb and + * moves it to skb->vlan_proto + */ + return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; + + return vlan_get_protocol(skb); +} + static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { -- cgit v1.2.3 From 61a707c543e2afe3aa7e88f87267c5dafa4b5afa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 8 May 2020 08:54:16 +0200 Subject: fs: add a __kernel_read helper This is the counterpart to __kernel_write, and skip the rw_verify_area call compared to kernel_read. Signed-off-by: Christoph Hellwig --- fs/read_write.c | 23 +++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/fs/read_write.c b/fs/read_write.c index 96e8e354f99b..21c9d90a257e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -430,6 +430,29 @@ ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, return -EINVAL; } +ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs = get_fs(); + ssize_t ret; + + if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) + return -EINVAL; + if (!(file->f_mode & FMODE_CAN_READ)) + return -EINVAL; + + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + set_fs(KERNEL_DS); + ret = __vfs_read(file, (void __user *)buf, count, pos); + set_fs(old_fs); + if (ret > 0) { + fsnotify_access(file); + add_rchar(current, ret); + } + inc_syscr(current); + return ret; +} + ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f881a892ea7..22cbe7b2e919 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3033,6 +3033,7 @@ extern int kernel_read_file_from_path_initns(const char *, void **, loff_t *, lo extern int kernel_read_file_from_fd(int, void **, loff_t *, loff_t, enum kernel_read_file_id); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); +ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); -- cgit v1.2.3 From 775802c0571fb438cd4f6548a323f9e4cb89f5aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 8 May 2020 11:17:46 +0200 Subject: fs: remove __vfs_read Fold it into the two callers. Signed-off-by: Christoph Hellwig --- fs/read_write.c | 43 +++++++++++++++++++++---------------------- include/linux/fs.h | 1 - 2 files changed, 21 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/fs/read_write.c b/fs/read_write.c index 42a027719324..4fb797822567 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -419,17 +419,6 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo return ret; } -ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, - loff_t *pos) -{ - if (file->f_op->read) - return file->f_op->read(file, buf, count, pos); - else if (file->f_op->read_iter) - return new_sync_read(file, buf, count, pos); - else - return -EINVAL; -} - ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) { mm_segment_t old_fs = get_fs(); @@ -443,7 +432,12 @@ ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; set_fs(KERNEL_DS); - ret = __vfs_read(file, (void __user *)buf, count, pos); + if (file->f_op->read) + ret = file->f_op->read(file, (void __user *)buf, count, pos); + else if (file->f_op->read_iter) + ret = new_sync_read(file, (void __user *)buf, count, pos); + else + ret = -EINVAL; set_fs(old_fs); if (ret > 0) { fsnotify_access(file); @@ -476,17 +470,22 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); - if (!ret) { - if (count > MAX_RW_COUNT) - count = MAX_RW_COUNT; - ret = __vfs_read(file, buf, count, pos); - if (ret > 0) { - fsnotify_access(file); - add_rchar(current, ret); - } - inc_syscr(current); - } + if (ret) + return ret; + if (count > MAX_RW_COUNT) + count = MAX_RW_COUNT; + if (file->f_op->read) + ret = file->f_op->read(file, buf, count, pos); + else if (file->f_op->read_iter) + ret = new_sync_read(file, buf, count, pos); + else + ret = -EINVAL; + if (ret > 0) { + fsnotify_access(file); + add_rchar(current, ret); + } + inc_syscr(current); return ret; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 22cbe7b2e919..0c0ec76b600b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1917,7 +1917,6 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, struct iovec *fast_pointer, struct iovec **ret_pointer); -extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_readv(struct file *, const struct iovec __user *, -- cgit v1.2.3 From 6ec4476ac82512f09c94aff5972654b70f3772b2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jul 2020 10:48:35 -0700 Subject: Raise gcc version requirement to 4.9 I realize that we fairly recently raised it to 4.8, but the fact is, 4.9 is a much better minimum version to target. We have a number of workarounds for actual bugs in pre-4.9 gcc versions (including things like internal compiler errors on ARM), but we also have some syntactic workarounds for lacking features. In particular, raising the minimum to 4.9 means that we can now just assume _Generic() exists, which is likely the much better replacement for a lot of very convoluted built-time magic with conditionals on sizeof and/or __builtin_choose_expr() with same_type() etc. Using _Generic also means that you will need to have a very recent version of 'sparse', but thats easy to build yourself, and much less of a hassle than some old gcc version can be. The latest (in a long string) of reasons for minimum compiler version upgrades was commit 5435f73d5c4a ("efi/x86: Fix build with gcc 4"). Ard points out that RHEL 7 uses gcc-4.8, but the people who stay back on old RHEL versions persumably also don't build their own kernels anyway. And maybe they should cross-built or just have a little side affair with a newer compiler? Acked-by: Ard Biesheuvel Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- arch/arm/kernel/asm-offsets.c | 9 --------- arch/mips/include/asm/unroll.h | 7 +++---- include/linux/bits.h | 3 +-- include/linux/compiler-gcc.h | 2 +- include/linux/compiler_types.h | 27 +-------------------------- mm/migrate.c | 13 +------------ tools/include/linux/bits.h | 3 +-- 7 files changed, 8 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index c036a4a2f8e2..a1570c8bab25 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -31,15 +31,6 @@ #if defined(__APCS_26__) #error Sorry, your compiler targets APCS-26 but this kernel requires APCS-32 #endif -/* - * GCC 4.8.0-4.8.2: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58854 - * miscompiles find_get_entry(), and can result in EXT3 and EXT4 - * filesystem corruption (possibly other FS too). - */ -#if defined(GCC_VERSION) && GCC_VERSION >= 40800 && GCC_VERSION < 40803 -#error Your compiler is too buggy; it is known to miscompile kernels -#error and result in filesystem corruption and oopses. -#endif int main(void) { diff --git a/arch/mips/include/asm/unroll.h b/arch/mips/include/asm/unroll.h index c628747d4ecd..8ed660adc84f 100644 --- a/arch/mips/include/asm/unroll.h +++ b/arch/mips/include/asm/unroll.h @@ -19,14 +19,13 @@ \ /* \ * We can't unroll if the number of iterations isn't \ - * compile-time constant. Unfortunately GCC versions \ - * up until 4.6 tend to miss obvious constants & cause \ + * compile-time constant. Unfortunately clang versions \ + * up until 8.0 tend to miss obvious constants & cause \ * this check to fail, even though they go on to \ * generate reasonable code for the switch statement, \ * so we skip the sanity check for those compilers. \ */ \ - BUILD_BUG_ON((CONFIG_GCC_VERSION >= 40700 || \ - CONFIG_CLANG_VERSION >= 80000) && \ + BUILD_BUG_ON((CONFIG_CLANG_VERSION >= 80000) && \ !__builtin_constant_p(times)); \ \ switch (times) { \ diff --git a/include/linux/bits.h b/include/linux/bits.h index 4671fbf28842..7f475d59a097 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -18,8 +18,7 @@ * position @h. For example * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ -#if !defined(__ASSEMBLY__) && \ - (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000) +#if !defined(__ASSEMBLY__) #include #define GENMASK_INPUT_CHECK(h, l) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 1c74464c80c6..0b1dc61f3955 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -11,7 +11,7 @@ + __GNUC_PATCHLEVEL__) /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */ -#if GCC_VERSION < 40800 +#if GCC_VERSION < 40900 # error Sorry, your compiler is too old - please upgrade it. #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index c3bf7710f69a..01dd58c74d80 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -252,32 +252,8 @@ struct ftrace_likely_data { * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving * non-scalar types unchanged. */ -#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 40900) || defined(__CHECKER__) /* - * We build this out of a couple of helper macros in a vain attempt to - * help you keep your lunch down while reading it. - */ -#define __pick_scalar_type(x, type, otherwise) \ - __builtin_choose_expr(__same_type(x, type), (type)0, otherwise) - -/* - * 'char' is not type-compatible with either 'signed char' or 'unsigned char', - * so we include the naked type here as well as the signed/unsigned variants. - */ -#define __pick_integer_type(x, type, otherwise) \ - __pick_scalar_type(x, type, \ - __pick_scalar_type(x, unsigned type, \ - __pick_scalar_type(x, signed type, otherwise))) - -#define __unqual_scalar_typeof(x) typeof( \ - __pick_integer_type(x, char, \ - __pick_integer_type(x, short, \ - __pick_integer_type(x, int, \ - __pick_integer_type(x, long, \ - __pick_integer_type(x, long long, x)))))) -#else -/* - * If supported, prefer C11 _Generic for better compile-times. As above, 'char' + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' * is not type-compatible with 'signed char', and we define a separate case. */ #define __scalar_type_to_expr_cases(type) \ @@ -293,7 +269,6 @@ struct ftrace_likely_data { __scalar_type_to_expr_cases(long), \ __scalar_type_to_expr_cases(long long), \ default: (x))) -#endif /* Is this type a native word size -- useful for atomic operations */ #define __native_word(t) \ diff --git a/mm/migrate.c b/mm/migrate.c index f37729673558..40cd7016ae6f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1160,22 +1160,11 @@ out: return rc; } -/* - * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work - * around it. - */ -#if defined(CONFIG_ARM) && \ - defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 -#define ICE_noinline noinline -#else -#define ICE_noinline -#endif - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static ICE_noinline int unmap_and_move(new_page_t get_new_page, +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, int force, enum migrate_mode mode, diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 4671fbf28842..7f475d59a097 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -18,8 +18,7 @@ * position @h. For example * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ -#if !defined(__ASSEMBLY__) && \ - (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000) +#if !defined(__ASSEMBLY__) #include #define GENMASK_INPUT_CHECK(h, l) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ -- cgit v1.2.3 From 160251842cd35a75edfb0a1d76afa3eb674ff40a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 11:49:23 -0700 Subject: kallsyms: Refactor kallsyms_show_value() to take cred In order to perform future tests against the cred saved during open(), switch kallsyms_show_value() to operate on a cred, and have all current callers pass current_cred(). This makes it very obvious where callers are checking the wrong credential in their "read" contexts. These will be fixed in the coming patches. Additionally switch return value to bool, since it is always used as a direct permission check, not a 0-on-success, negative-on-error style function return. Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- include/linux/filter.h | 2 +- include/linux/kallsyms.h | 5 +++-- kernel/kallsyms.c | 17 +++++++++++------ kernel/kprobes.c | 4 ++-- kernel/module.c | 2 +- 5 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 259377723603..55104f6c78e8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -889,7 +889,7 @@ static inline bool bpf_dump_raw_ok(void) /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ - return kallsyms_show_value() == 1; + return kallsyms_show_value(current_cred()); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 98338dc6b5d2..481273f0c72d 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -18,6 +18,7 @@ #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) +struct cred; struct module; static inline int is_kernel_inittext(unsigned long addr) @@ -98,7 +99,7 @@ int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* How and when do we show kallsyms values? */ -extern int kallsyms_show_value(void); +extern bool kallsyms_show_value(const struct cred *cred); #else /* !CONFIG_KALLSYMS */ @@ -158,7 +159,7 @@ static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, u return -ERANGE; } -static inline int kallsyms_show_value(void) +static inline bool kallsyms_show_value(const struct cred *cred) { return false; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 16c8c605f4b0..bb14e64f62a4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -644,19 +644,20 @@ static inline int kallsyms_for_perf(void) * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ -int kallsyms_show_value(void) +bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) - return 1; + return true; /* fallthrough */ case 1: - if (has_capability_noaudit(current, CAP_SYSLOG)) - return 1; + if (security_capable(cred, &init_user_ns, CAP_SYSLOG, + CAP_OPT_NOAUDIT) == 0) + return true; /* fallthrough */ default: - return 0; + return false; } } @@ -673,7 +674,11 @@ static int kallsyms_open(struct inode *inode, struct file *file) return -ENOMEM; reset_iter(iter, 0); - iter->show_value = kallsyms_show_value(); + /* + * Instead of checking this on every s_show() call, cache + * the result here at open time. + */ + iter->show_value = kallsyms_show_value(file->f_cred); return 0; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a904cc56d68..d4de217e4a91 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2448,7 +2448,7 @@ static void report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; - if (!kallsyms_show_value()) + if (!kallsyms_show_value(current_cred())) addr = NULL; if (sym) @@ -2540,7 +2540,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) * If /proc/kallsyms is not showing kernel address, we won't * show them here either. */ - if (!kallsyms_show_value()) + if (!kallsyms_show_value(current_cred())) seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL, (void *)ent->start_addr); else diff --git a/kernel/module.c b/kernel/module.c index e8a198588f26..a5022ae84e50 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4377,7 +4377,7 @@ static int modules_open(struct inode *inode, struct file *file) if (!err) { struct seq_file *m = file->private_data; - m->private = kallsyms_show_value() ? NULL : (void *)8ul; + m->private = kallsyms_show_value(current_cred()) ? NULL : (void *)8ul; } return err; -- cgit v1.2.3 From 63960260457a02af2a6cb35d75e6bdb17299c882 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Jul 2020 15:45:23 -0700 Subject: bpf: Check correct cred for CAP_SYSLOG in bpf_dump_raw_ok() When evaluating access control over kallsyms visibility, credentials at open() time need to be used, not the "current" creds (though in BPF's case, this has likely always been the same). Plumb access to associated file->f_cred down through bpf_dump_raw_ok() and its callers now that kallsysm_show_value() has been refactored to take struct cred. Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: bpf@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 7105e828c087 ("bpf: allow for correlation of maps and helpers in dump") Signed-off-by: Kees Cook --- include/linux/filter.h | 4 ++-- kernel/bpf/syscall.c | 37 +++++++++++++++++++++---------------- net/core/sysctl_net_core.c | 2 +- 3 files changed, 24 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 55104f6c78e8..0b0144752d78 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -884,12 +884,12 @@ void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_helper_changes_pkt_data(void *func); -static inline bool bpf_dump_raw_ok(void) +static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ - return kallsyms_show_value(current_cred()); + return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8da159936bab..859053ddf05b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3139,7 +3139,8 @@ static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, return NULL; } -static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, + const struct cred *f_cred) { const struct bpf_map *map; struct bpf_insn *insns; @@ -3165,7 +3166,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) code == (BPF_JMP | BPF_CALL_ARGS)) { if (code == (BPF_JMP | BPF_CALL_ARGS)) insns[i].code = BPF_JMP | BPF_CALL; - if (!bpf_dump_raw_ok()) + if (!bpf_dump_raw_ok(f_cred)) insns[i].imm = 0; continue; } @@ -3221,7 +3222,8 @@ static int set_info_rec_size(struct bpf_prog_info *info) return 0; } -static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, +static int bpf_prog_get_info_by_fd(struct file *file, + struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3290,11 +3292,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_insn *insns_sanitized; bool fault; - if (prog->blinded && !bpf_dump_raw_ok()) { + if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { info.xlated_prog_insns = 0; goto done; } - insns_sanitized = bpf_insn_prepare_dump(prog); + insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); if (!insns_sanitized) return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); @@ -3328,7 +3330,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { uinsns = u64_to_user_ptr(info.jited_prog_insns); ulen = min_t(u32, info.jited_prog_len, ulen); @@ -3363,7 +3365,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_ksyms; info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { unsigned long ksym_addr; u64 __user *user_ksyms; u32 i; @@ -3394,7 +3396,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.nr_jited_func_lens; info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; if (ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { u32 __user *user_lens; u32 func_len, i; @@ -3451,7 +3453,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, else info.nr_jited_line_info = 0; if (info.nr_jited_line_info && ulen) { - if (bpf_dump_raw_ok()) { + if (bpf_dump_raw_ok(file->f_cred)) { __u64 __user *user_linfo; u32 i; @@ -3497,7 +3499,8 @@ done: return 0; } -static int bpf_map_get_info_by_fd(struct bpf_map *map, +static int bpf_map_get_info_by_fd(struct file *file, + struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3540,7 +3543,8 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, return 0; } -static int bpf_btf_get_info_by_fd(struct btf *btf, +static int bpf_btf_get_info_by_fd(struct file *file, + struct btf *btf, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3555,7 +3559,8 @@ static int bpf_btf_get_info_by_fd(struct btf *btf, return btf_get_info_by_fd(btf, attr, uattr); } -static int bpf_link_get_info_by_fd(struct bpf_link *link, +static int bpf_link_get_info_by_fd(struct file *file, + struct bpf_link *link, const union bpf_attr *attr, union bpf_attr __user *uattr) { @@ -3608,15 +3613,15 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, return -EBADFD; if (f.file->f_op == &bpf_prog_fops) - err = bpf_prog_get_info_by_fd(f.file->private_data, attr, + err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_map_fops) - err = bpf_map_get_info_by_fd(f.file->private_data, attr, + err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file->private_data, attr, uattr); + err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else if (f.file->f_op == &bpf_link_fops) - err = bpf_link_get_info_by_fd(f.file->private_data, + err = bpf_link_get_info_by_fd(f.file, f.file->private_data, attr, uattr); else err = -EINVAL; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index f93f8ace6c56..6ada114bbcca 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -274,7 +274,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (jit_enable < 2 || - (jit_enable == 2 && bpf_dump_raw_ok())) { + (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) { *(int *)table->data = jit_enable; if (jit_enable == 2) pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); -- cgit v1.2.3 From 6d5f904904608a9cd32854d7d0a4dd65b27f9935 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Thu, 9 Jul 2020 09:15:29 +0800 Subject: io_uring: export cq overflow status to userspace For those applications which are not willing to use io_uring_enter() to reap and handle cqes, they may completely rely on liburing's io_uring_peek_cqe(), but if cq ring has overflowed, currently because io_uring_peek_cqe() is not aware of this overflow, it won't enter kernel to flush cqes, below test program can reveal this bug: static void test_cq_overflow(struct io_uring *ring) { struct io_uring_cqe *cqe; struct io_uring_sqe *sqe; int issued = 0; int ret = 0; do { sqe = io_uring_get_sqe(ring); if (!sqe) { fprintf(stderr, "get sqe failed\n"); break;; } ret = io_uring_submit(ring); if (ret <= 0) { if (ret != -EBUSY) fprintf(stderr, "sqe submit failed: %d\n", ret); break; } issued++; } while (ret > 0); assert(ret == -EBUSY); printf("issued requests: %d\n", issued); while (issued) { ret = io_uring_peek_cqe(ring, &cqe); if (ret) { if (ret != -EAGAIN) { fprintf(stderr, "peek completion failed: %s\n", strerror(ret)); break; } printf("left requets: %d\n", issued); continue; } io_uring_cqe_seen(ring, cqe); issued--; printf("left requets: %d\n", issued); } } int main(int argc, char *argv[]) { int ret; struct io_uring ring; ret = io_uring_queue_init(16, &ring, 0); if (ret) { fprintf(stderr, "ring setup failed: %d\n", ret); return 1; } test_cq_overflow(&ring); return 0; } To fix this issue, export cq overflow status to userspace by adding new IORING_SQ_CQ_OVERFLOW flag, then helper functions() in liburing, such as io_uring_peek_cqe, can be aware of this cq overflow and do flush accordingly. Signed-off-by: Xiaoguang Wang Signed-off-by: Jens Axboe --- fs/io_uring.c | 11 +++++++++-- include/uapi/linux/io_uring.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/io_uring.c b/fs/io_uring.c index d37d7ea5ebe5..32e37c38f274 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1274,6 +1274,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) { clear_bit(0, &ctx->sq_check_overflow); clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); @@ -1311,6 +1312,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->sq_check_overflow); set_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } req->flags |= REQ_F_OVERFLOW; refcount_inc(&req->refs); @@ -6080,9 +6082,9 @@ static int io_sq_thread(void *data) } /* Tell userspace we may need a wakeup call */ + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - /* make sure to read SQ tail after writing flags */ - smp_mb(); + spin_unlock_irq(&ctx->completion_lock); to_submit = io_sqring_entries(ctx); if (!to_submit || ret == -EBUSY) { @@ -6100,13 +6102,17 @@ static int io_sq_thread(void *data) schedule(); finish_wait(&ctx->sqo_wait, &wait); + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); + spin_lock_irq(&ctx->completion_lock); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + spin_unlock_irq(&ctx->completion_lock); } mutex_lock(&ctx->uring_lock); @@ -7488,6 +7494,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, if (list_empty(&ctx->cq_overflow_list)) { clear_bit(0, &ctx->sq_check_overflow); clear_bit(0, &ctx->cq_check_overflow); + ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } spin_unlock_irq(&ctx->completion_lock); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 92c22699a5a7..7843742b8b74 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -197,6 +197,7 @@ struct io_sqring_offsets { * sq_ring->flags */ #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ +#define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */ struct io_cqring_offsets { __u32 head; -- cgit v1.2.3 From c9a368f1c0fbe2e3a21ebf231caeae58b18b2681 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 8 Jul 2020 23:11:10 -0700 Subject: bpf: net: Avoid incorrect bpf_sk_reuseport_detach call bpf_sk_reuseport_detach is currently called when sk->sk_user_data is not NULL. It is incorrect because sk->sk_user_data may not be managed by the bpf's reuseport_array. It has been reported in [1] that, the bpf_sk_reuseport_detach() which is called from udp_lib_unhash() has corrupted the sk_user_data managed by l2tp. This patch solves it by using another bit (defined as SK_USER_DATA_BPF) of the sk_user_data pointer value. It marks that a sk_user_data is managed/owned by BPF. The patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e ("net, sk_msg: Clear sk_user_data pointer on clone if tagged"). [ Note: sk->sk_user_data is used by bpf's reuseport_array only when a sk is added to the bpf's reuseport_array. i.e. doing setsockopt(SO_REUSEPORT) and having "sk->sk_reuseport == 1" alone will not stop sk->sk_user_data being used by other means. ] [1]: https://lore.kernel.org/netdev/20200706121259.GA20199@katalix.com/ Fixes: 5dc4c4b7d4e8 ("bpf: Introduce BPF_MAP_TYPE_REUSEPORT_SOCKARRAY") Reported-by: James Chapman Reported-by: syzbot+9f092552ba9a5efca5df@syzkaller.appspotmail.com Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Tested-by: James Chapman Acked-by: James Chapman Link: https://lore.kernel.org/bpf/20200709061110.4019316-1-kafai@fb.com --- include/net/sock.h | 3 ++- kernel/bpf/reuseport_array.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 3428619faae4..1183507df95b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -533,7 +533,8 @@ enum sk_pacing { * be copied. */ #define SK_USER_DATA_NOCOPY 1UL -#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY) +#define SK_USER_DATA_BPF 2UL /* Managed by BPF */ +#define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index a95bc8d7e812..cae9d505e04a 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -24,7 +24,7 @@ void bpf_sk_reuseport_detach(struct sock *sk) write_lock_bh(&sk->sk_callback_lock); sk_user_data = (uintptr_t)sk->sk_user_data; - if (sk_user_data) { + if (sk_user_data & SK_USER_DATA_BPF) { struct sock __rcu **socks; socks = (void *)(sk_user_data & SK_USER_DATA_PTRMASK); @@ -309,7 +309,8 @@ int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, if (err) goto put_file_unlock; - sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY; + sk_user_data = (uintptr_t)&array->ptrs[index] | SK_USER_DATA_NOCOPY | + SK_USER_DATA_BPF; WRITE_ONCE(nsk->sk_user_data, (void *)sk_user_data); rcu_assign_pointer(array->ptrs[index], nsk); free_osk = osk; -- cgit v1.2.3 From 14b032b8f8fce03a546dcf365454bec8c4a58d7d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 9 Jul 2020 16:28:44 -0700 Subject: cgroup: Fix sock_cgroup_data on big-endian. In order for no_refcnt and is_data to be the lowest order two bits in the 'val' we have to pad out the bitfield of the u8. Fixes: ad0f75e5f57c ("cgroup: fix cgroup_sk_alloc() for sk_clone_lock()") Reported-by: Guenter Roeck Signed-off-by: David S. Miller --- include/linux/cgroup-defs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 4f1cd0edc57d..fee0b5547cd0 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -792,6 +792,7 @@ struct sock_cgroup_data { struct { u8 is_data : 1; u8 no_refcnt : 1; + u8 unused : 6; u8 padding; u16 prioidx; u32 classid; @@ -801,6 +802,7 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; + u8 unused : 6; u8 no_refcnt : 1; u8 is_data : 1; } __packed; -- cgit v1.2.3 From 88b3d5c90e9685be54dd5bc441970044020eca76 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 22 Jun 2020 09:03:31 +0300 Subject: net/mlx5e: Fix port buffers cell size value Device unit for port buffers size, xoff_threshold and xon_threshold is cells. Fix a bug in driver where cell unit size was hard-coded to 128 bytes. This hard-coded value is buggy, as it is wrong for some hardware versions. Driver to read cell size from SBCAM register and translate bytes to cell units accordingly. In order to fix the bug, this patch exposes SBCAM (Shared buffer capabilities mask) layout and defines. If SBCAM.cap_cell_size is valid, use it for all bytes to cells calculations. If not valid, fallback to 128. Cell size do not change on the fly per device. Instead of issuing SBCAM access reg command every time such translation is needed, cache it in mlx5e_dcbx as part of mlx5e_dcbnl_initialize(). Pass dcbx.port_buff_cell_sz as a param to every function that needs bytes to cells translation. While fixing the bug, move MLX5E_BUFFER_CELL_SHIFT macro to en_dcbnl.c, as it is only used by that file. Fixes: 0696d60853d5 ("net/mlx5e: Receive buffer configuration") Signed-off-by: Eran Ben Elisha Reviewed-by: Huy Nguyen Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h | 1 + .../ethernet/mellanox/mlx5/core/en/port_buffer.c | 53 ++++++++++++---------- .../ethernet/mellanox/mlx5/core/en/port_buffer.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 19 ++++++++ include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 28 ++++++++++++ 6 files changed, 78 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h index 7be6b2d36b60..9976de8b9047 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h @@ -29,6 +29,7 @@ struct mlx5e_dcbx { bool manual_buffer; u32 cable_len; u32 xoff; + u16 port_buff_cell_sz; }; #define MLX5E_MAX_DSCP (64) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index ae99fac08b53..673f1c82d381 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -34,6 +34,7 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, struct mlx5e_port_buffer *port_buffer) { + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); u32 total_used = 0; @@ -57,11 +58,11 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, port_buffer->buffer[i].epsb = MLX5_GET(bufferx_reg, buffer, epsb); port_buffer->buffer[i].size = - MLX5_GET(bufferx_reg, buffer, size) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(bufferx_reg, buffer, size) * port_buff_cell_sz; port_buffer->buffer[i].xon = - MLX5_GET(bufferx_reg, buffer, xon_threshold) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(bufferx_reg, buffer, xon_threshold) * port_buff_cell_sz; port_buffer->buffer[i].xoff = - MLX5_GET(bufferx_reg, buffer, xoff_threshold) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(bufferx_reg, buffer, xoff_threshold) * port_buff_cell_sz; total_used += port_buffer->buffer[i].size; mlx5e_dbg(HW, priv, "buffer %d: size=%d, xon=%d, xoff=%d, epsb=%d, lossy=%d\n", i, @@ -73,7 +74,7 @@ int mlx5e_port_query_buffer(struct mlx5e_priv *priv, } port_buffer->port_buffer_size = - MLX5_GET(pbmc_reg, out, port_buffer_size) << MLX5E_BUFFER_CELL_SHIFT; + MLX5_GET(pbmc_reg, out, port_buffer_size) * port_buff_cell_sz; port_buffer->spare_buffer_size = port_buffer->port_buffer_size - total_used; @@ -88,9 +89,9 @@ out: static int port_set_buffer(struct mlx5e_priv *priv, struct mlx5e_port_buffer *port_buffer) { + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); - void *buffer; void *in; int err; int i; @@ -104,16 +105,18 @@ static int port_set_buffer(struct mlx5e_priv *priv, goto out; for (i = 0; i < MLX5E_MAX_BUFFER; i++) { - buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); - - MLX5_SET(bufferx_reg, buffer, size, - port_buffer->buffer[i].size >> MLX5E_BUFFER_CELL_SHIFT); - MLX5_SET(bufferx_reg, buffer, lossy, - port_buffer->buffer[i].lossy); - MLX5_SET(bufferx_reg, buffer, xoff_threshold, - port_buffer->buffer[i].xoff >> MLX5E_BUFFER_CELL_SHIFT); - MLX5_SET(bufferx_reg, buffer, xon_threshold, - port_buffer->buffer[i].xon >> MLX5E_BUFFER_CELL_SHIFT); + void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); + u64 size = port_buffer->buffer[i].size; + u64 xoff = port_buffer->buffer[i].xoff; + u64 xon = port_buffer->buffer[i].xon; + + do_div(size, port_buff_cell_sz); + do_div(xoff, port_buff_cell_sz); + do_div(xon, port_buff_cell_sz); + MLX5_SET(bufferx_reg, buffer, size, size); + MLX5_SET(bufferx_reg, buffer, lossy, port_buffer->buffer[i].lossy); + MLX5_SET(bufferx_reg, buffer, xoff_threshold, xoff); + MLX5_SET(bufferx_reg, buffer, xon_threshold, xon); } err = mlx5e_port_set_pbmc(mdev, in); @@ -143,7 +146,7 @@ static u32 calculate_xoff(struct mlx5e_priv *priv, unsigned int mtu) } static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, - u32 xoff, unsigned int max_mtu) + u32 xoff, unsigned int max_mtu, u16 port_buff_cell_sz) { int i; @@ -155,7 +158,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, } if (port_buffer->buffer[i].size < - (xoff + max_mtu + (1 << MLX5E_BUFFER_CELL_SHIFT))) { + (xoff + max_mtu + port_buff_cell_sz)) { pr_err("buffer_size[%d]=%d is not enough for lossless buffer\n", i, port_buffer->buffer[i].size); return -ENOMEM; @@ -175,6 +178,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, * @pfc_en: current pfc configuration * @buffer: current prio to buffer mapping * @xoff: xoff value + * @port_buff_cell_sz: port buffer cell_size * @port_buffer: port receive buffer configuration * @change: * @@ -189,7 +193,7 @@ static int update_xoff_threshold(struct mlx5e_port_buffer *port_buffer, * sets change to true if buffer configuration was modified. */ static int update_buffer_lossy(unsigned int max_mtu, - u8 pfc_en, u8 *buffer, u32 xoff, + u8 pfc_en, u8 *buffer, u32 xoff, u16 port_buff_cell_sz, struct mlx5e_port_buffer *port_buffer, bool *change) { @@ -225,7 +229,7 @@ static int update_buffer_lossy(unsigned int max_mtu, } if (changed) { - err = update_xoff_threshold(port_buffer, xoff, max_mtu); + err = update_xoff_threshold(port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; @@ -262,6 +266,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, u32 *buffer_size, u8 *prio2buffer) { + u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5e_port_buffer port_buffer; u32 xoff = calculate_xoff(priv, mtu); bool update_prio2buffer = false; @@ -282,7 +287,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (change & MLX5E_PORT_BUFFER_CABLE_LEN) { update_buffer = true; - err = update_xoff_threshold(&port_buffer, xoff, max_mtu); + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; } @@ -292,7 +297,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (err) return err; - err = update_buffer_lossy(max_mtu, pfc->pfc_en, buffer, xoff, + err = update_buffer_lossy(max_mtu, pfc->pfc_en, buffer, xoff, port_buff_cell_sz, &port_buffer, &update_buffer); if (err) return err; @@ -304,7 +309,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, if (err) return err; - err = update_buffer_lossy(max_mtu, curr_pfc_en, prio2buffer, + err = update_buffer_lossy(max_mtu, curr_pfc_en, prio2buffer, port_buff_cell_sz, xoff, &port_buffer, &update_buffer); if (err) return err; @@ -329,7 +334,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, return -EINVAL; update_buffer = true; - err = update_xoff_threshold(&port_buffer, xoff, max_mtu); + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; } @@ -337,7 +342,7 @@ int mlx5e_port_manual_buffer_config(struct mlx5e_priv *priv, /* Need to update buffer configuration if xoff value is changed */ if (!update_buffer && xoff != priv->dcbx.xoff) { update_buffer = true; - err = update_xoff_threshold(&port_buffer, xoff, max_mtu); + err = update_xoff_threshold(&port_buffer, xoff, max_mtu, port_buff_cell_sz); if (err) return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h index 34f55b81a0de..80af7a5ac604 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.h @@ -36,7 +36,6 @@ #include "port.h" #define MLX5E_MAX_BUFFER 8 -#define MLX5E_BUFFER_CELL_SHIFT 7 #define MLX5E_DEFAULT_CABLE_LEN 7 /* 7 meters */ #define MLX5_BUFFER_SUPPORTED(mdev) (MLX5_CAP_GEN(mdev, pcam_reg) && \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index bc102d094bbd..d20243d6a032 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -1217,6 +1217,24 @@ static int mlx5e_trust_initialize(struct mlx5e_priv *priv) return 0; } +#define MLX5E_BUFFER_CELL_SHIFT 7 + +static u16 mlx5e_query_port_buffers_cell_size(struct mlx5e_priv *priv) +{ + struct mlx5_core_dev *mdev = priv->mdev; + u32 out[MLX5_ST_SZ_DW(sbcam_reg)] = {}; + u32 in[MLX5_ST_SZ_DW(sbcam_reg)] = {}; + + if (!MLX5_CAP_GEN(mdev, sbcam_reg)) + return (1 << MLX5E_BUFFER_CELL_SHIFT); + + if (mlx5_core_access_reg(mdev, in, sizeof(in), out, sizeof(out), + MLX5_REG_SBCAM, 0, 0)) + return (1 << MLX5E_BUFFER_CELL_SHIFT); + + return MLX5_GET(sbcam_reg, out, cap_cell_size); +} + void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) { struct mlx5e_dcbx *dcbx = &priv->dcbx; @@ -1234,6 +1252,7 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) if (priv->dcbx.mode == MLX5E_DCBX_PARAM_VER_OPER_HOST) priv->dcbx.cap |= DCB_CAP_DCBX_HOST; + priv->dcbx.port_buff_cell_sz = mlx5e_query_port_buffers_cell_size(priv); priv->dcbx.manual_buffer = false; priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 13c0e4556eda..1e6ca716635a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -147,6 +147,7 @@ enum { MLX5_REG_MCDA = 0x9063, MLX5_REG_MCAM = 0x907f, MLX5_REG_MIRC = 0x9162, + MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ca1887dd0423..073b79eacc99 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9960,6 +9960,34 @@ struct mlx5_ifc_pptb_reg_bits { u8 untagged_buff[0x4]; }; +struct mlx5_ifc_sbcam_reg_bits { + u8 reserved_at_0[0x8]; + u8 feature_group[0x8]; + u8 reserved_at_10[0x8]; + u8 access_reg_group[0x8]; + + u8 reserved_at_20[0x20]; + + u8 sb_access_reg_cap_mask[4][0x20]; + + u8 reserved_at_c0[0x80]; + + u8 sb_feature_cap_mask[4][0x20]; + + u8 reserved_at_1c0[0x40]; + + u8 cap_total_buffer_size[0x20]; + + u8 cap_cell_size[0x10]; + u8 cap_max_pg_buffers[0x8]; + u8 cap_num_pool_supported[0x8]; + + u8 reserved_at_240[0x8]; + u8 cap_sbsr_stat_size[0x8]; + u8 cap_max_tclass_data[0x8]; + u8 cap_max_cpu_ingress_tclass_sb[0x8]; +}; + struct mlx5_ifc_pbmc_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; -- cgit v1.2.3