From 6b839b3b76cf17296ebd4a893841f32cae08229c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 5 Feb 2024 09:26:30 -0800 Subject: regset: use kvzalloc() for regset_get_alloc() While browsing through ChromeOS crash reports, I found one with an allocation failure that looked like this: chrome: page allocation failure: order:7, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),cpuset=urgent,mems_allowed=0 CPU: 7 PID: 3295 Comm: chrome Not tainted 5.15.133-20574-g8044615ac35c #1 (HASH:1162 1) Hardware name: Google Lazor (rev3 - 8) with KB Backlight (DT) Call trace: ... warn_alloc+0x104/0x174 __alloc_pages+0x5f0/0x6e4 kmalloc_order+0x44/0x98 kmalloc_order_trace+0x34/0x124 __kmalloc+0x228/0x36c __regset_get+0x68/0xcc regset_get_alloc+0x1c/0x28 elf_core_dump+0x3d8/0xd8c do_coredump+0xeb8/0x1378 get_signal+0x14c/0x804 ... An order 7 allocation is (1 << 7) contiguous pages, or 512K. It's not a surprise that this allocation failed on a system that's been running for a while. More digging showed that it was fairly easy to see the order 7 allocation by just sending a SIGQUIT to chrome (or other processes) to generate a core dump. The actual amount being allocated was 279,584 bytes and it was for "core_note_type" NT_ARM_SVE. There was quite a bit of discussion [1] on the mailing lists in response to my v1 patch attempting to switch to vmalloc. The overall conclusion was that we could likely reduce the 279,584 byte allocation by quite a bit and Mark Brown has sent a patch to that effect [2]. However even with the 279,584 byte allocation gone there are still 65,552 byte allocations. These are just barely more than the 65,536 bytes and thus would require an order 5 allocation. An order 5 allocation is still something to avoid unless necessary and nothing needs the memory here to be contiguous. Change the allocation to kvzalloc() which should still be efficient for small allocations but doesn't force the memory subsystem to work hard (and maybe fail) at getting a large contiguous chunk. [1] https://lore.kernel.org/r/20240201171159.1.Id9ad163b60d21c9e56c2d686b0cc9083a8ba7924@changeid [2] https://lore.kernel.org/r/20240203-arm64-sve-ptrace-regset-size-v1-1-2c3ba1386b9e@kernel.org Link: https://lkml.kernel.org/r/20240205092626.v2.1.Id9ad163b60d21c9e56c2d686b0cc9083a8ba7924@changeid Signed-off-by: Douglas Anderson Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Christian Brauner Cc: Dave Martin Cc: Eric Biederman Cc: Jan Kara Cc: Kees Cook Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Oleg Nesterov Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/regset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/regset.c b/kernel/regset.c index 586823786f39..b2871fa68b2a 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -16,14 +16,14 @@ static int __regset_get(struct task_struct *target, if (size > regset->n * regset->size) size = regset->n * regset->size; if (!p) { - to_free = p = kzalloc(size, GFP_KERNEL); + to_free = p = kvzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; } res = regset->regset_get(target, regset, (struct membuf){.p = p, .left = size}); if (res < 0) { - kfree(to_free); + kvfree(to_free); return res; } *data = p; @@ -71,6 +71,6 @@ int copy_regset_to_user(struct task_struct *target, ret = regset_get_alloc(target, regset, size, &buf); if (ret > 0) ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; - kfree(buf); + kvfree(buf); return ret; } -- cgit v1.2.3 From 56fd61628b7a5c1ff474619580796f11d5c8973a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Mar 2024 15:30:42 +0100 Subject: kcov: avoid clang out-of-range warning The area_size is never larger than the maximum on 64-bit architectutes: kernel/kcov.c:634:29: error: result of comparison of constant 1152921504606846975 with expression of type '__u32' (aka 'unsigned int') is always false [-Werror,-Wtautological-constant-out-of-range-compare] if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long)) ~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The compiler can correctly optimize the check away and the code appears correct to me, so just add a cast to avoid the warning. Link: https://lkml.kernel.org/r/20240328143051.1069575-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Justin Stitt Cc: Andrey Konovalov Cc: Bill Wendling Cc: Dmitry Vyukov Cc: Nathan Chancellor Cc: Nick Desaulniers Signed-off-by: Andrew Morton --- kernel/kcov.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index f9ac2e9e460f..c3124f6d5536 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -627,7 +627,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, mode = kcov_get_mode(remote_arg->trace_mode); if (mode < 0) return mode; - if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long)) + if ((unsigned long)remote_arg->area_size > + LONG_MAX / sizeof(unsigned long)) return -EINVAL; kcov->mode = mode; t->kcov = kcov; -- cgit v1.2.3 From 051e7503070179f2278c0d4fa7dee441adb558ca Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Apr 2024 16:00:57 +0200 Subject: blktrace: convert strncpy() to strscpy_pad() gcc-9 warns about a possibly non-terminated string copy: kernel/trace/blktrace.c: In function 'do_blk_trace_setup': kernel/trace/blktrace.c:527:2: error: 'strncpy' specified bound 32 equals destination size [-Werror=stringop-truncation] Newer versions are fine here because they see the following explicit nul-termination. Using strscpy_pad() avoids the warning and simplifies the code a little. The padding helps give a clean buffer to userspace. Link: https://lkml.kernel.org/r/20240409140059.3806717-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Acked-by: Justin Stitt Cc: Alexey Starikovskiy Cc: Bob Moore Cc: Jens Axboe Cc: Len Brown Cc: Lin Ming Cc: Masahiro Yamada Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Rafael J. Wysocki Cc: "Richard Russon (FlatCap)" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/trace/blktrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d5d94510afd3..8fd292d34d89 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -524,8 +524,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts->buf_size || !buts->buf_nr) return -EINVAL; - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); - buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); /* * some device names have larger paths - convert the slashes -- cgit v1.2.3 From 4707c13de3e42a47f0d99fe5fb58fa9dd23b455e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 18 Apr 2024 11:58:43 +0800 Subject: crash: add prefix for crash dumping messages Add pr_fmt() to kernel/crash_core.c to add the module name to debugging message printed as prefix. And also add prefix 'crashkernel:' to two lines of message printing code in kernel/crash_reserve.c. In kernel/crash_reserve.c, almost all debugging messages have 'crashkernel:' prefix or there's keyword crashkernel at the beginning or in the middle, adding pr_fmt() makes it redundant. Link: https://lkml.kernel.org/r/20240418035843.1562887-1-bhe@redhat.com Signed-off-by: Baoquan He Cc: Dave Young Cc: Jiri Slaby Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 ++ kernel/crash_reserve.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 78b5dc7cee3a..1e7ac977f7c0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -4,6 +4,8 @@ * Copyright (C) 2002-2004 Eric Biederman */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 066668799f75..5b2722a93a48 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -109,7 +109,7 @@ static int __init parse_crashkernel_mem(char *cmdline, size = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("Memory value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; @@ -132,7 +132,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); + pr_warn("crahskernel: Memory value expected after '@'\n"); return -EINVAL; } } -- cgit v1.2.3 From 602ba77361160d99c77e3dadf7d891364f8c71b3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Apr 2024 23:02:35 -0700 Subject: watchdog: handle comma separated nmi_watchdog command line Per the document, the kernel can accept comma separated command line like nmi_watchdog=nopanic,0. However, the code doesn't really handle it. Fix the kernel to handle it properly. Link: https://lkml.kernel.org/r/20240430060236.1878002-1-song@kernel.org Signed-off-by: Song Liu Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- kernel/watchdog.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d7b2125503af..7f54484de16f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -71,6 +71,7 @@ void __init hardlockup_detector_disable(void) static int __init hardlockup_panic_setup(char *str) { +next: if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; else if (!strncmp(str, "nopanic", 7)) @@ -79,6 +80,12 @@ static int __init hardlockup_panic_setup(char *str) watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) watchdog_hardlockup_user_enabled = 1; + while (*(str++)) { + if (*str == ',') { + str++; + goto next; + } + } return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); -- cgit v1.2.3 From 393fb313a2e150b768e4850658679e2afff431e9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Apr 2024 23:02:36 -0700 Subject: watchdog: allow nmi watchdog to use raw perf event NMI watchdog permanently consumes one hardware counters per CPU on the system. For systems that use many hardware counters, this causes more aggressive time multiplexing of perf events. OTOH, some CPUs (mostly Intel) support "ref-cycles" event, which is rarely used. Add kernel cmdline arg nmi_watchdog=rNNN to configure the watchdog to use raw event. For example, on Intel CPUs, we can use "r300" to configure the watchdog to use ref-cycles event. If the raw event does not work, fall back to use "cycles". [akpm@linux-foundation.org: fix kerneldoc] Link: https://lkml.kernel.org/r/20240430060236.1878002-2-song@kernel.org Signed-off-by: Song Liu Cc: Peter Zijlstra Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 5 +-- include/linux/nmi.h | 2 ++ kernel/watchdog.c | 2 ++ kernel/watchdog_perf.c | 46 +++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 902ecd92a29f..1fa79a3d0d1a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3773,10 +3773,12 @@ Format: [state][,regs][,debounce][,die] nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels - Format: [panic,][nopanic,][num] + Format: [panic,][nopanic,][rNNN,][num] Valid num: 0 or 1 0 - turn hardlockup detector in nmi_watchdog off 1 - turn hardlockup detector in nmi_watchdog on + rNNN - configure the watchdog with raw perf event 0xNNN + When panic is specified, panic when an NMI watchdog timeout occurs (or 'nopanic' to not panic on an NMI watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set) @@ -7464,4 +7466,3 @@ memory, and other data can't be written using xmon commands. off xmon is disabled. - diff --git a/include/linux/nmi.h b/include/linux/nmi.h index f53438eae815..a8dfb38c9bb6 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -105,10 +105,12 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs); extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); extern void hardlockup_detector_perf_cleanup(void); +extern void hardlockup_config_perf_event(const char *str); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } static inline void hardlockup_detector_perf_cleanup(void) { } +static inline void hardlockup_config_perf_event(const char *str) { } #endif void watchdog_hardlockup_stop(void); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f54484de16f..ab0129b15f25 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -80,6 +80,8 @@ next: watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) watchdog_hardlockup_user_enabled = 1; + else if (!strncmp(str, "r", 1)) + hardlockup_config_perf_event(str + 1); while (*(str++)) { if (*str == ',') { str++; diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 8ea00c4a24b2..5f7d1f0d4268 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -90,6 +90,14 @@ static struct perf_event_attr wd_hw_attr = { .disabled = 1, }; +static struct perf_event_attr fallback_wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, struct perf_sample_data *data, @@ -122,6 +130,13 @@ static int hardlockup_detector_event_create(void) /* Try to register using hardware perf events */ evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + if (IS_ERR(evt)) { + wd_attr = &fallback_wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + watchdog_overflow_callback, NULL); + } + if (IS_ERR(evt)) { pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, PTR_ERR(evt)); @@ -259,3 +274,34 @@ int __init watchdog_hardlockup_probe(void) } return ret; } + +/** + * hardlockup_config_perf_event - Overwrite config of wd_hw_attr. + * + * @str: number which identifies the raw perf event to use + */ +void __init hardlockup_config_perf_event(const char *str) +{ + u64 config; + char buf[24]; + char *comma = strchr(str, ','); + + if (!comma) { + if (kstrtoull(str, 16, &config)) + return; + } else { + unsigned int len = comma - str; + + if (len >= sizeof(buf)) + return; + + if (strscpy(buf, str, sizeof(buf)) < 0) + return; + buf[len] = 0; + if (kstrtoull(buf, 16, &config)) + return; + } + + wd_hw_attr.type = PERF_TYPE_RAW; + wd_hw_attr.config = config; +} -- cgit v1.2.3 From 8fcb916cac8985188a3f825966ffa99507a90cb1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 4 May 2024 16:41:07 -0700 Subject: kernel/watchdog_perf.c: tidy up kerneldoc It is unconventional to have a blank line between name-of-function and description-of-args. Cc: Peter Zijlstra Cc: Song Liu Cc: "Matthew Wilcox (Oracle)" Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- kernel/watchdog_perf.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 5f7d1f0d4268..d577c4a8321e 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -148,7 +148,6 @@ static int hardlockup_detector_event_create(void) /** * watchdog_hardlockup_enable - Enable the local event - * * @cpu: The CPU to enable hard lockup on. */ void watchdog_hardlockup_enable(unsigned int cpu) @@ -167,7 +166,6 @@ void watchdog_hardlockup_enable(unsigned int cpu) /** * watchdog_hardlockup_disable - Disable the local event - * * @cpu: The CPU to enable hard lockup on. */ void watchdog_hardlockup_disable(unsigned int cpu) @@ -277,7 +275,6 @@ int __init watchdog_hardlockup_probe(void) /** * hardlockup_config_perf_event - Overwrite config of wd_hw_attr. - * * @str: number which identifies the raw perf event to use */ void __init hardlockup_config_perf_event(const char *str) -- cgit v1.2.3