diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/syscall.c | 18 | ||||
-rw-r--r-- | kernel/bpf/trampoline.c | 11 | ||||
-rw-r--r-- | kernel/dma/swiotlb.c | 22 | ||||
-rw-r--r-- | kernel/fork.c | 13 | ||||
-rw-r--r-- | kernel/sched/core.c | 23 | ||||
-rw-r--r-- | kernel/signal.c | 20 | ||||
-rw-r--r-- | kernel/sysctl.c | 7 | ||||
-rw-r--r-- | kernel/trace/blktrace.c | 26 | ||||
-rw-r--r-- | kernel/trace/trace.c | 14 | ||||
-rw-r--r-- | kernel/trace/trace_events_filter.c | 107 | ||||
-rw-r--r-- | kernel/trace/trace_events_hist.c | 6 | ||||
-rw-r--r-- | kernel/trace/trace_events_synth.c | 13 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_osnoise.c | 73 | ||||
-rw-r--r-- | kernel/trace/trace_probe.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_uprobe.c | 5 | ||||
-rw-r--r-- | kernel/user_namespace.c | 14 | ||||
-rw-r--r-- | kernel/watch_queue.c | 15 |
18 files changed, 296 insertions, 95 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 53384622e8da..42490c39dfbf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1824,8 +1824,14 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +struct bpf_prog_kstats { + u64 nsecs; + u64 cnt; + u64 misses; +}; + static void bpf_prog_get_stats(const struct bpf_prog *prog, - struct bpf_prog_stats *stats) + struct bpf_prog_kstats *stats) { u64 nsecs = 0, cnt = 0, misses = 0; int cpu; @@ -1838,9 +1844,9 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); - tnsecs = st->nsecs; - tcnt = st->cnt; - tmisses = st->misses; + tnsecs = u64_stats_read(&st->nsecs); + tcnt = u64_stats_read(&st->cnt); + tmisses = u64_stats_read(&st->misses); } while (u64_stats_fetch_retry_irq(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; @@ -1856,7 +1862,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - struct bpf_prog_stats stats; + struct bpf_prog_kstats stats; bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); @@ -3595,7 +3601,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info; u32 info_len = attr->info.info_len; - struct bpf_prog_stats stats; + struct bpf_prog_kstats stats; char __user *uinsns; u32 ulen; int err; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index d3a307a8c42b..2660fbced9ad 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -541,11 +541,12 @@ static u64 notrace bpf_prog_start_time(void) static void notrace inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; + unsigned int flags; stats = this_cpu_ptr(prog->stats); - u64_stats_update_begin(&stats->syncp); - stats->misses++; - u64_stats_update_end(&stats->syncp); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->misses); + u64_stats_update_end_irqrestore(&stats->syncp, flags); } /* The logic is similar to bpf_prog_run(), but with an explicit @@ -589,8 +590,8 @@ static void notrace update_prog_stats(struct bpf_prog *prog, stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); - stats->cnt++; - stats->nsecs += sched_clock() - start; + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, sched_clock() - start); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 87c40517e822..e58dce93c661 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -578,9 +578,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwirte the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -647,10 +652,13 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr, void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) - swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); + /* + * Unconditional bounce is necessary to avoid corruption on + * sync_*_for_cpu or dma_ummap_* when the device didn't overwrite + * the whole lengt of the bounce buffer. + */ + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + BUG_ON(!valid_dma_direction(dir)); } void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, diff --git a/kernel/fork.c b/kernel/fork.c index 28aee1a8875b..89475c994ca9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2297,6 +2297,17 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_put_pidfd; /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by @@ -2405,7 +2416,7 @@ static __latent_entropy struct task_struct *copy_process( fd_install(pidfd, pidfile); proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c2dec6ce9809..a0747eaa2dba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4376,6 +4376,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4391,18 +4392,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4413,7 +4419,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); } diff --git a/kernel/signal.c b/kernel/signal.c index aea93d6a5520..6e3dbb3d1217 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2710,19 +2710,19 @@ relock: goto relock; } - /* Has this task already been marked for death? */ - if (signal_group_exit(signal)) { - ksig->info.si_signo = signr = SIGKILL; - sigdelset(¤t->pending.signal, SIGKILL); - trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, - &sighand->action[SIGKILL - 1]); - recalc_sigpending(); - goto fatal; - } - for (;;) { struct k_sigaction *ka; + /* Has this task already been marked for death? */ + if (signal_group_exit(signal)) { + ksig->info.si_signo = signr = SIGKILL; + sigdelset(¤t->pending.signal, SIGKILL); + trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, + &sighand->action[SIGKILL - 1]); + recalc_sigpending(); + goto fatal; + } + if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && do_signal_stop(0)) goto relock; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 083be6af29d7..0586047f7323 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -228,6 +228,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write, return ret; } +void __weak unpriv_ebpf_notify(int new_state) +{ +} + static int bpf_unpriv_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -245,6 +249,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write, return -EPERM; *(int *)table->data = unpriv_enable; } + + unpriv_ebpf_notify(unpriv_enable); + return ret; } #endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fa91f398f28b..c42ff77eb6cc 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -310,10 +310,20 @@ record_it: local_irq_restore(flags); } -static void blk_trace_free(struct blk_trace *bt) +static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) { relay_close(bt->rchan); - debugfs_remove(bt->dir); + + /* + * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created + * under 'q->debugfs_dir', thus lookup and remove them. + */ + if (!bt->dir) { + debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); + debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); + } else { + debugfs_remove(bt->dir); + } free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -335,10 +345,10 @@ static void put_probe_ref(void) mutex_unlock(&blk_probe_mutex); } -static void blk_trace_cleanup(struct blk_trace *bt) +static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt) { synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); put_probe_ref(); } @@ -352,7 +362,7 @@ static int __blk_trace_remove(struct request_queue *q) return -EINVAL; if (bt->trace_state != Blktrace_running) - blk_trace_cleanup(bt); + blk_trace_cleanup(q, bt); return 0; } @@ -572,7 +582,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: if (ret) - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } @@ -1615,7 +1625,7 @@ static int blk_trace_remove_queue(struct request_queue *q) put_probe_ref(); synchronize_rcu(); - blk_trace_free(bt); + blk_trace_free(q, bt); return 0; } @@ -1646,7 +1656,7 @@ static int blk_trace_setup_queue(struct request_queue *q, return 0; free_bt: - blk_trace_free(bt); + blk_trace_free(q, bt); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 618c20ce2479..01002656f1ae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -235,7 +235,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static int __init set_trace_boot_options(char *str) { strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); - return 0; + return 1; } __setup("trace_options=", set_trace_boot_options); @@ -246,7 +246,7 @@ static int __init set_trace_boot_clock(char *str) { strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); trace_boot_clock = trace_boot_clock_buf; - return 0; + return 1; } __setup("trace_clock=", set_trace_boot_clock); @@ -1496,10 +1496,12 @@ static int __init set_buf_size(char *str) if (!str) return 0; buf_size = memparse(str, &str); - /* nr_entries can not be zero */ - if (buf_size == 0) - return 0; - trace_buf_size = buf_size; + /* + * nr_entries can not be zero and the startup + * tests require some buffer space. Therefore + * ensure we have at least 4096 bytes of buffer. + */ + trace_buf_size = max(4096UL, buf_size); return 1; } __setup("trace_buf_size=", set_buf_size); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c9124038b140..06d6318ee537 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -5,6 +5,7 @@ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> */ +#include <linux/uaccess.h> #include <linux/module.h> #include <linux/ctype.h> #include <linux/mutex.h> @@ -654,6 +655,52 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); +/* user space strings temp buffer */ +#define USTRING_BUF_SIZE 1024 + +struct ustring_buffer { + char buffer[USTRING_BUF_SIZE]; +}; + +static __percpu struct ustring_buffer *ustring_per_cpu; + +static __always_inline char *test_string(char *str) +{ + struct ustring_buffer *ubuf; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* For safety, do not trust the string pointer */ + if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + return NULL; + return kstr; +} + +static __always_inline char *test_ustring(char *str) +{ + struct ustring_buffer *ubuf; + char __user *ustr; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* user space address? */ + ustr = (char __user *)str; + if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + return NULL; + + return kstr; +} + /* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event) { @@ -667,19 +714,43 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } -/* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event) +static __always_inline int filter_pchar(struct filter_pred *pred, char *str) { - char **addr = (char **)(event + pred->offset); int cmp, match; - int len = strlen(*addr) + 1; /* including tailing '\0' */ + int len; - cmp = pred->regex.match(*addr, &pred->regex, len); + len = strlen(str) + 1; /* including tailing '\0' */ + cmp = pred->regex.match(str, &pred->regex, len); match = cmp ^ pred->not; return match; } +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; + + str = test_string(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); +} + +/* Filter predicate for char * pointers in user space*/ +static int filter_pred_pchar_user(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; + + str = test_ustring(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); +} /* * Filter predicate for dynamic sized arrays of characters. @@ -1158,6 +1229,7 @@ static int parse_pred(const char *str, void *data, struct filter_pred *pred = NULL; char num_buf[24]; /* Big enough to hold an address */ char *field_name; + bool ustring = false; char q; u64 val; int len; @@ -1192,6 +1264,12 @@ static int parse_pred(const char *str, void *data, return -EINVAL; } + /* See if the field is a user space string */ + if ((len = str_has_prefix(str + i, ".ustring"))) { + ustring = true; + i += len; + } + while (isspace(str[i])) i++; @@ -1320,8 +1398,20 @@ static int parse_pred(const char *str, void *data, } else if (field->filter_type == FILTER_DYN_STRING) pred->fn = filter_pred_strloc; - else - pred->fn = filter_pred_pchar; + else { + + if (!ustring_per_cpu) { + /* Once allocated, keep it around for good */ + ustring_per_cpu = alloc_percpu(struct ustring_buffer); + if (!ustring_per_cpu) + goto err_mem; + } + + if (ustring) + pred->fn = filter_pred_pchar_user; + else + pred->fn = filter_pred_pchar; + } /* go past the last quote */ i++; @@ -1387,6 +1477,9 @@ static int parse_pred(const char *str, void *data, err_free: kfree(pred); return -EINVAL; +err_mem: + kfree(pred); + return -ENOMEM; } enum { diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 918f969dffcf..ea168d42c8a2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2049,9 +2049,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, /* * For backward compatibility, if field_name * was "cpu", then we treat this the same as - * common_cpu. + * common_cpu. This also works for "CPU". */ - if (strcmp(field_name, "cpu") == 0) { + if (field && field->filter_type == FILTER_CPU) { *flags |= HIST_FIELD_FL_CPU; } else { hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, @@ -4478,7 +4478,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) cmp_fn = tracing_map_cmp_none; - else if (!field) + else if (!field || hist_field->flags & HIST_FIELD_FL_CPU) cmp_fn = tracing_map_cmp_num(hist_field->size, hist_field->is_signed); else if (is_string_field(field)) diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 22db3ce95e74..8c26092db8de 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -2053,6 +2053,13 @@ static int create_synth_event(const char *raw_command) last_cmd_set(raw_command); + name = raw_command; + + /* Don't try to process if not our system */ + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + p = strpbrk(raw_command, " \t"); if (!p) { synth_err(SYNTH_ERR_INVALID_CMD, 0); @@ -2061,12 +2068,6 @@ static int create_synth_event(const char *raw_command) fields = skip_spaces(p); - name = raw_command; - - if (name[0] != 's' || name[1] != ':') - return -ECANCELED; - name += 2; - /* This interface accepts group name prefix */ if (strchr(name, '/')) { len = str_has_prefix(name, SYNTH_SYSTEM "/"); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 385981c293ba..39ee60725519 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -31,7 +31,7 @@ static int __init set_kprobe_boot_events(char *str) strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); disable_tracing_selftest("running kprobe events"); - return 0; + return 1; } __setup("kprobe_event=", set_kprobe_boot_events); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 65a518649997..93de784ee681 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1196,6 +1196,26 @@ static int run_osnoise(void) } /* + * In some cases, notably when running on a nohz_full CPU with + * a stopped tick PREEMPT_RCU has no way to account for QSs. + * This will eventually cause unwarranted noise as PREEMPT_RCU + * will force preemption as the means of ending the current + * grace period. We avoid this problem by calling + * rcu_momentary_dyntick_idle(), which performs a zero duration + * EQS allowing PREEMPT_RCU to end the current grace period. + * This call shouldn't be wrapped inside an RCU critical + * section. + * + * Note that in non PREEMPT_RCU kernels QSs are handled through + * cond_resched() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { + local_irq_disable(); + rcu_momentary_dyntick_idle(); + local_irq_enable(); + } + + /* * For the non-preemptive kernel config: let threads runs, if * they so wish. */ @@ -1250,6 +1270,37 @@ static struct cpumask osnoise_cpumask; static struct cpumask save_cpumask; /* + * osnoise_sleep - sleep until the next period + */ +static void osnoise_sleep(void) +{ + u64 interval; + ktime_t wake_time; + + mutex_lock(&interface_lock); + interval = osnoise_data.sample_period - osnoise_data.sample_runtime; + mutex_unlock(&interface_lock); + + /* + * differently from hwlat_detector, the osnoise tracer can run + * without a pause because preemption is on. + */ + if (!interval) { + /* Let synchronize_rcu_tasks() make progress */ + cond_resched_tasks_rcu_qs(); + return; + } + + wake_time = ktime_add_us(ktime_get(), interval); + __set_current_state(TASK_INTERRUPTIBLE); + + while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) { + if (kthread_should_stop()) + break; + } +} + +/* * osnoise_main - The osnoise detection kernel thread * * Calls run_osnoise() function to measure the osnoise for the configured runtime, @@ -1257,30 +1308,10 @@ static struct cpumask save_cpumask; */ static int osnoise_main(void *data) { - u64 interval; while (!kthread_should_stop()) { - run_osnoise(); - - mutex_lock(&interface_lock); - interval = osnoise_data.sample_period - osnoise_data.sample_runtime; - mutex_unlock(&interface_lock); - - do_div(interval, USEC_PER_MSEC); - - /* - * differently from hwlat_detector, the osnoise tracer can run - * without a pause because preemption is on. - */ - if (interval < 1) { - /* Let synchronize_rcu_tasks() make progress */ - cond_resched_tasks_rcu_qs(); - continue; - } - - if (msleep_interruptible(interval)) - break; + osnoise_sleep(); } return 0; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 3ed2a3f37297..bb4605b60de7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -356,6 +356,8 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) return -EINVAL; } *pbuf = kstrndup(str, len - 1, GFP_KERNEL); + if (!*pbuf) + return -ENOMEM; return 0; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f5f0039d31e5..78ec1c16ccf4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1619,6 +1619,11 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->path = path; tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); + if (!tu->filename) { + ret = -ENOMEM; + goto error; + } + init_trace_event_call(tu); ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 6b2e3ca7ee99..5481ba44a8d6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->user_ns = user_ns; } +static unsigned long enforced_nproc_rlimit(void) +{ + unsigned long limit = RLIM_INFINITY; + + /* Is RLIMIT_NPROC currently enforced? */ + if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || + (current_user_ns() != &init_user_ns)) + limit = rlimit(RLIMIT_NPROC); + + return limit; +} + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -122,7 +134,7 @@ int create_user_ns(struct cred *new) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 9c9eb20dd2c5..055bc20ecdda 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -54,6 +54,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, bit += page->index; set_bit(bit, wqueue->notes_bitmap); + generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing @@ -112,7 +113,7 @@ static bool post_one_notification(struct watch_queue *wqueue, buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; - pipe->head = head + 1; + smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); @@ -243,7 +244,8 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) goto error; } - ret = pipe_resize_ring(pipe, nr_notes); + nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; @@ -268,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; - wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; + wqueue->nr_notes = nr_notes; return 0; error_p: @@ -320,7 +322,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ - if (tf[i].type >= sizeof(wfilter->type_filter) * 8) + if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } @@ -336,7 +338,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe, q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { - if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG) + if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; @@ -371,6 +373,7 @@ static void __put_watch_queue(struct kref *kref) for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); + bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) @@ -566,7 +569,7 @@ void watch_queue_clear(struct watch_queue *wqueue) rcu_read_lock(); spin_lock_bh(&wqueue->lock); - /* Prevent new additions and prevent notifications from happening */ + /* Prevent new notifications from being stored. */ wqueue->defunct = true; while (!hlist_empty(&wqueue->watches)) { |