Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/syscall.c                  18
-rw-r--r--  kernel/bpf/trampoline.c               11
-rw-r--r--  kernel/dma/swiotlb.c                  22
-rw-r--r--  kernel/fork.c                         13
-rw-r--r--  kernel/sched/core.c                   23
-rw-r--r--  kernel/signal.c                       20
-rw-r--r--  kernel/sysctl.c                        7
-rw-r--r--  kernel/trace/blktrace.c               26
-rw-r--r--  kernel/trace/trace.c                  14
-rw-r--r--  kernel/trace/trace_events_filter.c   107
-rw-r--r--  kernel/trace/trace_events_hist.c       6
-rw-r--r--  kernel/trace/trace_events_synth.c     13
-rw-r--r--  kernel/trace/trace_kprobe.c            2
-rw-r--r--  kernel/trace/trace_osnoise.c          73
-rw-r--r--  kernel/trace/trace_probe.c             2
-rw-r--r--  kernel/trace/trace_uprobe.c            5
-rw-r--r--  kernel/user_namespace.c               14
-rw-r--r--  kernel/watch_queue.c                  15
18 files changed, 296 insertions, 95 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 53384622e8da..42490c39dfbf 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1824,8 +1824,14 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
+struct bpf_prog_kstats {
+ u64 nsecs;
+ u64 cnt;
+ u64 misses;
+};
+
static void bpf_prog_get_stats(const struct bpf_prog *prog,
- struct bpf_prog_stats *stats)
+ struct bpf_prog_kstats *stats)
{
u64 nsecs = 0, cnt = 0, misses = 0;
int cpu;
@@ -1838,9 +1844,9 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,
st = per_cpu_ptr(prog->stats, cpu);
do {
start = u64_stats_fetch_begin_irq(&st->syncp);
- tnsecs = st->nsecs;
- tcnt = st->cnt;
- tmisses = st->misses;
+ tnsecs = u64_stats_read(&st->nsecs);
+ tcnt = u64_stats_read(&st->cnt);
+ tmisses = u64_stats_read(&st->misses);
} while (u64_stats_fetch_retry_irq(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
@@ -1856,7 +1862,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- struct bpf_prog_stats stats;
+ struct bpf_prog_kstats stats;
bpf_prog_get_stats(prog, &stats);
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
@@ -3595,7 +3601,7 @@ static int bpf_prog_get_info_by_fd(struct file *file,
struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
struct bpf_prog_info info;
u32 info_len = attr->info.info_len;
- struct bpf_prog_stats stats;
+ struct bpf_prog_kstats stats;
char __user *uinsns;
u32 ulen;
int err;
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d3a307a8c42b..2660fbced9ad 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -541,11 +541,12 @@ static u64 notrace bpf_prog_start_time(void)
static void notrace inc_misses_counter(struct bpf_prog *prog)
{
struct bpf_prog_stats *stats;
+ unsigned int flags;
stats = this_cpu_ptr(prog->stats);
- u64_stats_update_begin(&stats->syncp);
- stats->misses++;
- u64_stats_update_end(&stats->syncp);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
+ u64_stats_inc(&stats->misses);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
/* The logic is similar to bpf_prog_run(), but with an explicit
@@ -589,8 +590,8 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
stats = this_cpu_ptr(prog->stats);
flags = u64_stats_update_begin_irqsave(&stats->syncp);
- stats->cnt++;
- stats->nsecs += sched_clock() - start;
+ u64_stats_inc(&stats->cnt);
+ u64_stats_add(&stats->nsecs, sched_clock() - start);
u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
}
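
Aside, not part of the diff: the syscall.c and trampoline.c hunks above move the bpf_prog_stats counters onto the u64_stats API so 32-bit readers cannot observe torn or half-updated values. A minimal sketch of the writer/reader pairing being established (the struct layout with u64_stats_t members is an assumption inferred from the accessors used in the patch; the helper names around it are illustrative only):

#include <linux/u64_stats_sync.h>
#include <linux/percpu.h>

struct bpf_prog_stats {
	u64_stats_t cnt;
	u64_stats_t nsecs;
	u64_stats_t misses;
	struct u64_stats_sync syncp;
};

/* Writer: can run in any context, so the irqsave variant is used. */
static void stats_inc_cnt(struct bpf_prog_stats __percpu *pstats)
{
	struct bpf_prog_stats *stats = this_cpu_ptr(pstats);
	unsigned long flags;

	flags = u64_stats_update_begin_irqsave(&stats->syncp);
	u64_stats_inc(&stats->cnt);
	u64_stats_update_end_irqrestore(&stats->syncp, flags);
}

/* Reader: the fetch/retry loop yields a consistent snapshot on 32-bit. */
static u64 stats_read_cnt(struct bpf_prog_stats *stats)
{
	unsigned int start;
	u64 cnt;

	do {
		start = u64_stats_fetch_begin_irq(&stats->syncp);
		cnt = u64_stats_read(&stats->cnt);
	} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

	return cnt;
}

On 64-bit SMP the seqcount compiles away and the accessors degrade to plain loads and stores, so the scheme only pays its cost where tearing is actually possible.
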
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 87c40517e822..e58dce93c661 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -578,9 +578,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
for (i = 0; i < nr_slots(alloc_size + offset); i++)
mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
tlb_addr = slot_addr(mem->start, index) + offset;
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ /*
+ * When dir == DMA_FROM_DEVICE we could omit the copy from the orig
+ * to the tlb buffer, if we knew for sure the device will
+ * overwrite the entire current content. But we don't. Thus
+ * unconditional bounce may prevent leaking swiotlb content (i.e.
+ * kernel memory) to user-space.
+ */
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}
@@ -647,10 +652,13 @@ void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
{
- if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
- else
- BUG_ON(dir != DMA_FROM_DEVICE);
+ /*
+ * Unconditional bounce is necessary to avoid corruption on
+ * sync_*_for_cpu or dma_unmap_* when the device didn't overwrite
+ * the whole length of the bounce buffer.
+ */
+ swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE);
+ BUG_ON(!valid_dma_direction(dir));
}
void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr,
diff --git a/kernel/fork.c b/kernel/fork.c
index 28aee1a8875b..89475c994ca9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2297,6 +2297,17 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_put_pidfd;
/*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ sched_cgroup_fork(p, args);
+
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2405,7 +2416,7 @@ static __latent_entropy struct task_struct *copy_process(
fd_install(pidfd, pidfile);
proc_fork_connector(p);
- sched_post_fork(p, args);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c2dec6ce9809..a0747eaa2dba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4376,6 +4376,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
init_entity_runnable_average(&p->se);
+
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4391,18 +4392,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
- struct task_group *tg;
-#endif
+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_CGROUP_SCHED
- tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
- struct task_group, css);
- p->sched_task_group = autogroup_task_group(p, tg);
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
#endif
rseq_migrate(p);
/*
@@ -4413,7 +4419,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+void sched_post_fork(struct task_struct *p)
+{
uclamp_post_fork(p);
}
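
Aside, not part of the diff: read together with the fork.c hunk earlier, the split of sched_post_fork() gives copy_process() roughly the following shape (a heavily elided C sketch for orientation; only the calls touched by this change are shown and error handling is omitted):

	/* copy_process(), elided */
	retval = cgroup_can_fork(p, args);	/* pins the destination css_set in args */
	if (retval)
		goto bad_fork_put_pidfd;

	sched_cgroup_fork(p, args);		/* pick task_group and runqueue placement
						 * while the task is still invisible */

	/* ... task is hashed and becomes visible ... */

	sched_post_fork(p);			/* only uclamp_post_fork() remains here */
	cgroup_post_fork(p, args);

The cgroup-derived scheduling placement therefore happens before the child can be observed, and the late hook no longer needs the clone arguments at all.
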
diff --git a/kernel/signal.c b/kernel/signal.c
index aea93d6a5520..6e3dbb3d1217 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2710,19 +2710,19 @@ relock:
goto relock;
}
- /* Has this task already been marked for death? */
- if (signal_group_exit(signal)) {
- ksig->info.si_signo = signr = SIGKILL;
- sigdelset(&current->pending.signal, SIGKILL);
- trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
- &sighand->action[SIGKILL - 1]);
- recalc_sigpending();
- goto fatal;
- }
-
for (;;) {
struct k_sigaction *ka;
+ /* Has this task already been marked for death? */
+ if (signal_group_exit(signal)) {
+ ksig->info.si_signo = signr = SIGKILL;
+ sigdelset(&current->pending.signal, SIGKILL);
+ trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
+ &sighand->action[SIGKILL - 1]);
+ recalc_sigpending();
+ goto fatal;
+ }
+
if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
do_signal_stop(0))
goto relock;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af29d7..0586047f7323 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -228,6 +228,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write,
return ret;
}
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
static int bpf_unpriv_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -245,6 +249,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
return -EPERM;
*(int *)table->data = unpriv_enable;
}
+
+ unpriv_ebpf_notify(unpriv_enable);
+
return ret;
}
#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */
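
Aside, not part of the diff: unpriv_ebpf_notify() is added here as an empty __weak default so the sysctl file links even when nothing cares about the toggle, while another translation unit can supply a strong definition and be called whenever unprivileged BPF is switched on or off. A minimal sketch of the weak-symbol hook pattern (the overriding file and its reaction are purely illustrative):

#include <linux/printk.h>

/* kernel/sysctl.c: weak fallback, does nothing */
void __weak unpriv_ebpf_notify(int new_state)
{
}

/* some-other-file.c (illustrative): a strong definition wins at link time */
void unpriv_ebpf_notify(int new_state)
{
	if (new_state)
		pr_warn("unprivileged eBPF was just (re-)enabled\n");
}

Because the weak and strong definitions live in different object files, the linker silently drops the weak one whenever an override exists; the sysctl handler simply calls unpriv_ebpf_notify() and does not need to know which variant it got.
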
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fa91f398f28b..c42ff77eb6cc 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -310,10 +310,20 @@ record_it:
local_irq_restore(flags);
}
-static void blk_trace_free(struct blk_trace *bt)
+static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
{
relay_close(bt->rchan);
- debugfs_remove(bt->dir);
+
+ /*
+ * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created
+ * under 'q->debugfs_dir', so look them up there and remove them.
+ */
+ if (!bt->dir) {
+ debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
+ debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
+ } else {
+ debugfs_remove(bt->dir);
+ }
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -335,10 +345,10 @@ static void put_probe_ref(void)
mutex_unlock(&blk_probe_mutex);
}
-static void blk_trace_cleanup(struct blk_trace *bt)
+static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
{
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
put_probe_ref();
}
@@ -352,7 +362,7 @@ static int __blk_trace_remove(struct request_queue *q)
return -EINVAL;
if (bt->trace_state != Blktrace_running)
- blk_trace_cleanup(bt);
+ blk_trace_cleanup(q, bt);
return 0;
}
@@ -572,7 +582,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = 0;
err:
if (ret)
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
@@ -1615,7 +1625,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
put_probe_ref();
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return 0;
}
@@ -1646,7 +1656,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
return 0;
free_bt:
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 618c20ce2479..01002656f1ae 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -235,7 +235,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
{
strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
- return 0;
+ return 1;
}
__setup("trace_options=", set_trace_boot_options);
@@ -246,7 +246,7 @@ static int __init set_trace_boot_clock(char *str)
{
strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
trace_boot_clock = trace_boot_clock_buf;
- return 0;
+ return 1;
}
__setup("trace_clock=", set_trace_boot_clock);
@@ -1496,10 +1496,12 @@ static int __init set_buf_size(char *str)
if (!str)
return 0;
buf_size = memparse(str, &str);
- /* nr_entries can not be zero */
- if (buf_size == 0)
- return 0;
- trace_buf_size = buf_size;
+ /*
+ * nr_entries can not be zero and the startup
+ * tests require some buffer space. Therefore
+ * ensure we have at least 4096 bytes of buffer.
+ */
+ trace_buf_size = max(4096UL, buf_size);
return 1;
}
__setup("trace_buf_size=", set_buf_size);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c9124038b140..06d6318ee537 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -5,6 +5,7 @@
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*/
+#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
@@ -654,6 +655,52 @@ DEFINE_EQUALITY_PRED(32);
DEFINE_EQUALITY_PRED(16);
DEFINE_EQUALITY_PRED(8);
+/* user space strings temp buffer */
+#define USTRING_BUF_SIZE 1024
+
+struct ustring_buffer {
+ char buffer[USTRING_BUF_SIZE];
+};
+
+static __percpu struct ustring_buffer *ustring_per_cpu;
+
+static __always_inline char *test_string(char *str)
+{
+ struct ustring_buffer *ubuf;
+ char *kstr;
+
+ if (!ustring_per_cpu)
+ return NULL;
+
+ ubuf = this_cpu_ptr(ustring_per_cpu);
+ kstr = ubuf->buffer;
+
+ /* For safety, do not trust the string pointer */
+ if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE))
+ return NULL;
+ return kstr;
+}
+
+static __always_inline char *test_ustring(char *str)
+{
+ struct ustring_buffer *ubuf;
+ char __user *ustr;
+ char *kstr;
+
+ if (!ustring_per_cpu)
+ return NULL;
+
+ ubuf = this_cpu_ptr(ustring_per_cpu);
+ kstr = ubuf->buffer;
+
+ /* user space address? */
+ ustr = (char __user *)str;
+ if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE))
+ return NULL;
+
+ return kstr;
+}
+
/* Filter predicate for fixed sized arrays of characters */
static int filter_pred_string(struct filter_pred *pred, void *event)
{
@@ -667,19 +714,43 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
return match;
}
-/* Filter predicate for char * pointers */
-static int filter_pred_pchar(struct filter_pred *pred, void *event)
+static __always_inline int filter_pchar(struct filter_pred *pred, char *str)
{
- char **addr = (char **)(event + pred->offset);
int cmp, match;
- int len = strlen(*addr) + 1; /* including tailing '\0' */
+ int len;
- cmp = pred->regex.match(*addr, &pred->regex, len);
+ len = strlen(str) + 1; /* including trailing '\0' */
+ cmp = pred->regex.match(str, &pred->regex, len);
match = cmp ^ pred->not;
return match;
}
+/* Filter predicate for char * pointers */
+static int filter_pred_pchar(struct filter_pred *pred, void *event)
+{
+ char **addr = (char **)(event + pred->offset);
+ char *str;
+
+ str = test_string(*addr);
+ if (!str)
+ return 0;
+
+ return filter_pchar(pred, str);
+}
+
+/* Filter predicate for char * pointers in user space */
+static int filter_pred_pchar_user(struct filter_pred *pred, void *event)
+{
+ char **addr = (char **)(event + pred->offset);
+ char *str;
+
+ str = test_ustring(*addr);
+ if (!str)
+ return 0;
+
+ return filter_pchar(pred, str);
+}
/*
* Filter predicate for dynamic sized arrays of characters.
@@ -1158,6 +1229,7 @@ static int parse_pred(const char *str, void *data,
struct filter_pred *pred = NULL;
char num_buf[24]; /* Big enough to hold an address */
char *field_name;
+ bool ustring = false;
char q;
u64 val;
int len;
@@ -1192,6 +1264,12 @@ static int parse_pred(const char *str, void *data,
return -EINVAL;
}
+ /* See if the field is a user space string */
+ if ((len = str_has_prefix(str + i, ".ustring"))) {
+ ustring = true;
+ i += len;
+ }
+
while (isspace(str[i]))
i++;
@@ -1320,8 +1398,20 @@ static int parse_pred(const char *str, void *data,
} else if (field->filter_type == FILTER_DYN_STRING)
pred->fn = filter_pred_strloc;
- else
- pred->fn = filter_pred_pchar;
+ else {
+
+ if (!ustring_per_cpu) {
+ /* Once allocated, keep it around for good */
+ ustring_per_cpu = alloc_percpu(struct ustring_buffer);
+ if (!ustring_per_cpu)
+ goto err_mem;
+ }
+
+ if (ustring)
+ pred->fn = filter_pred_pchar_user;
+ else
+ pred->fn = filter_pred_pchar;
+ }
/* go past the last quote */
i++;
@@ -1387,6 +1477,9 @@ static int parse_pred(const char *str, void *data,
err_free:
kfree(pred);
return -EINVAL;
+err_mem:
+ kfree(pred);
+ return -ENOMEM;
}
enum {
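
Aside, not part of the diff: a usage illustration of the ".ustring" suffix added above. For an event field that carries a user-space pointer, appending .ustring to the field name in a filter expression makes the predicate copy the string with strncpy_from_user_nofault() before matching, e.g. writing filename.ustring ~ "/etc/*" to an event's filter file under /sys/kernel/tracing/events/ (the field name is an assumption; it must be a user-space string pointer of the chosen event). Without the suffix, plain char * fields now go through the kernel-space nofault copy path, so a stale or bad pointer no longer crashes the filter.
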
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 918f969dffcf..ea168d42c8a2 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2049,9 +2049,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
/*
* For backward compatibility, if field_name
* was "cpu", then we treat this the same as
- * common_cpu.
+ * common_cpu. This also works for "CPU".
*/
- if (strcmp(field_name, "cpu") == 0) {
+ if (field && field->filter_type == FILTER_CPU) {
*flags |= HIST_FIELD_FL_CPU;
} else {
hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
@@ -4478,7 +4478,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
cmp_fn = tracing_map_cmp_none;
- else if (!field)
+ else if (!field || hist_field->flags & HIST_FIELD_FL_CPU)
cmp_fn = tracing_map_cmp_num(hist_field->size,
hist_field->is_signed);
else if (is_string_field(field))
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 22db3ce95e74..8c26092db8de 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -2053,6 +2053,13 @@ static int create_synth_event(const char *raw_command)
last_cmd_set(raw_command);
+ name = raw_command;
+
+ /* Don't try to process if not our system */
+ if (name[0] != 's' || name[1] != ':')
+ return -ECANCELED;
+ name += 2;
+
p = strpbrk(raw_command, " \t");
if (!p) {
synth_err(SYNTH_ERR_INVALID_CMD, 0);
@@ -2061,12 +2068,6 @@ static int create_synth_event(const char *raw_command)
fields = skip_spaces(p);
- name = raw_command;
-
- if (name[0] != 's' || name[1] != ':')
- return -ECANCELED;
- name += 2;
-
/* This interface accepts group name prefix */
if (strchr(name, '/')) {
len = str_has_prefix(name, SYNTH_SYSTEM "/");
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 385981c293ba..39ee60725519 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,7 @@ static int __init set_kprobe_boot_events(char *str)
strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
disable_tracing_selftest("running kprobe events");
- return 0;
+ return 1;
}
__setup("kprobe_event=", set_kprobe_boot_events);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 65a518649997..93de784ee681 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1196,6 +1196,26 @@ static int run_osnoise(void)
}
/*
+ * In some cases, notably when running on a nohz_full CPU with
+ * a stopped tick, PREEMPT_RCU has no way to account for QSs.
+ * This will eventually cause unwarranted noise as PREEMPT_RCU
+ * will force preemption as the means of ending the current
+ * grace period. We avoid this problem by calling
+ * rcu_momentary_dyntick_idle(), which performs a zero duration
+ * EQS allowing PREEMPT_RCU to end the current grace period.
+ * This call shouldn't be wrapped inside an RCU critical
+ * section.
+ *
+ * Note that in non-PREEMPT_RCU kernels, QSs are handled through
+ * cond_resched().
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ local_irq_disable();
+ rcu_momentary_dyntick_idle();
+ local_irq_enable();
+ }
+
+ /*
* For the non-preemptive kernel config: let threads run, if
* they so wish.
*/
@@ -1250,6 +1270,37 @@ static struct cpumask osnoise_cpumask;
static struct cpumask save_cpumask;
/*
+ * osnoise_sleep - sleep until the next period
+ */
+static void osnoise_sleep(void)
+{
+ u64 interval;
+ ktime_t wake_time;
+
+ mutex_lock(&interface_lock);
+ interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
+ mutex_unlock(&interface_lock);
+
+ /*
+ * differently from hwlat_detector, the osnoise tracer can run
+ * without a pause because preemption is on.
+ */
+ if (!interval) {
+ /* Let synchronize_rcu_tasks() make progress */
+ cond_resched_tasks_rcu_qs();
+ return;
+ }
+
+ wake_time = ktime_add_us(ktime_get(), interval);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) {
+ if (kthread_should_stop())
+ break;
+ }
+}
+
+/*
* osnoise_main - The osnoise detection kernel thread
*
* Calls run_osnoise() function to measure the osnoise for the configured runtime,
@@ -1257,30 +1308,10 @@ static struct cpumask save_cpumask;
*/
static int osnoise_main(void *data)
{
- u64 interval;
while (!kthread_should_stop()) {
-
run_osnoise();
-
- mutex_lock(&interface_lock);
- interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
- mutex_unlock(&interface_lock);
-
- do_div(interval, USEC_PER_MSEC);
-
- /*
- * differently from hwlat_detector, the osnoise tracer can run
- * without a pause because preemption is on.
- */
- if (interval < 1) {
- /* Let synchronize_rcu_tasks() make progress */
- cond_resched_tasks_rcu_qs();
- continue;
- }
-
- if (msleep_interruptible(interval))
- break;
+ osnoise_sleep();
}
return 0;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 3ed2a3f37297..bb4605b60de7 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -356,6 +356,8 @@ static int __parse_imm_string(char *str, char **pbuf, int offs)
return -EINVAL;
}
*pbuf = kstrndup(str, len - 1, GFP_KERNEL);
+ if (!*pbuf)
+ return -ENOMEM;
return 0;
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index f5f0039d31e5..78ec1c16ccf4 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1619,6 +1619,11 @@ create_local_trace_uprobe(char *name, unsigned long offs,
tu->path = path;
tu->ref_ctr_offset = ref_ctr_offset;
tu->filename = kstrdup(name, GFP_KERNEL);
+ if (!tu->filename) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
init_trace_event_call(tu);
ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 6b2e3ca7ee99..5481ba44a8d6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->user_ns = user_ns;
}
+static unsigned long enforced_nproc_rlimit(void)
+{
+ unsigned long limit = RLIM_INFINITY;
+
+ /* Is RLIMIT_NPROC currently enforced? */
+ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
+ (current_user_ns() != &init_user_ns))
+ limit = rlimit(RLIMIT_NPROC);
+
+ return limit;
+}
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -122,7 +134,7 @@ int create_user_ns(struct cred *new)
for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
- set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 9c9eb20dd2c5..055bc20ecdda 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -54,6 +54,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
bit += page->index;
set_bit(bit, wqueue->notes_bitmap);
+ generic_pipe_buf_release(pipe, buf);
}
// No try_steal function => no stealing
@@ -112,7 +113,7 @@ static bool post_one_notification(struct watch_queue *wqueue,
buf->offset = offset;
buf->len = len;
buf->flags = PIPE_BUF_FLAG_WHOLE;
- pipe->head = head + 1;
+ smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */
if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
spin_unlock_irq(&pipe->rd_wait.lock);
@@ -243,7 +244,8 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
goto error;
}
- ret = pipe_resize_ring(pipe, nr_notes);
+ nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes));
if (ret < 0)
goto error;
@@ -268,7 +270,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
wqueue->notes = pages;
wqueue->notes_bitmap = bitmap;
wqueue->nr_pages = nr_pages;
- wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ wqueue->nr_notes = nr_notes;
return 0;
error_p:
@@ -320,7 +322,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
tf[i].info_mask & WATCH_INFO_LENGTH)
goto err_filter;
/* Ignore any unknown types */
- if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
+ if (tf[i].type >= WATCH_TYPE__NR)
continue;
nr_filter++;
}
@@ -336,7 +338,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
q = wfilter->filters;
for (i = 0; i < filter.nr_filters; i++) {
- if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
+ if (tf[i].type >= WATCH_TYPE__NR)
continue;
q->type = tf[i].type;
@@ -371,6 +373,7 @@ static void __put_watch_queue(struct kref *kref)
for (i = 0; i < wqueue->nr_pages; i++)
__free_page(wqueue->notes[i]);
+ bitmap_free(wqueue->notes_bitmap);
wfilter = rcu_access_pointer(wqueue->filter);
if (wfilter)
@@ -566,7 +569,7 @@ void watch_queue_clear(struct watch_queue *wqueue)
rcu_read_lock();
spin_lock_bh(&wqueue->lock);
- /* Prevent new additions and prevent notifications from happening */
+ /* Prevent new notifications from being stored. */
wqueue->defunct = true;
while (!hlist_empty(&wqueue->watches)) {