summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore1
-rw-r--r--kernel/Makefile105
-rw-r--r--kernel/audit.c50
-rw-r--r--kernel/audit.h20
-rw-r--r--kernel/audit_fsnotify.c216
-rw-r--r--kernel/audit_tree.c8
-rw-r--r--kernel/audit_watch.c56
-rw-r--r--kernel/auditfilter.c97
-rw-r--r--kernel/auditsc.c9
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/arraymap.c149
-rw-r--r--kernel/bpf/core.c42
-rw-r--r--kernel/bpf/hashtab.c18
-rw-r--r--kernel/bpf/helpers.c7
-rw-r--r--kernel/bpf/inode.c387
-rw-r--r--kernel/bpf/syscall.c188
-rw-r--r--kernel/bpf/verifier.c172
-rw-r--r--kernel/cgroup.c1314
-rw-r--r--kernel/cgroup_freezer.c2
-rw-r--r--kernel/cgroup_pids.c353
-rw-r--r--kernel/cpu.c75
-rw-r--r--kernel/cpu_pm.c2
-rw-r--r--kernel/cpuset.c88
-rw-r--r--kernel/cred.c13
-rw-r--r--kernel/events/core.c641
-rw-r--r--kernel/events/ring_buffer.c17
-rw-r--r--kernel/events/uprobes.c228
-rw-r--r--kernel/exit.c8
-rw-r--r--kernel/extable.c1
-rw-r--r--kernel/fork.c67
-rw-r--r--kernel/futex.c113
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/chip.c104
-rw-r--r--kernel/irq/cpuhotplug.c82
-rw-r--r--kernel/irq/generic-chip.c6
-rw-r--r--kernel/irq/handle.c17
-rw-r--r--kernel/irq/internals.h17
-rw-r--r--kernel/irq/irqdesc.c22
-rw-r--r--kernel/irq/irqdomain.c184
-rw-r--r--kernel/irq/manage.c306
-rw-r--r--kernel/irq/msi.c31
-rw-r--r--kernel/irq/pm.c14
-rw-r--r--kernel/irq/proc.c23
-rw-r--r--kernel/irq/resend.c6
-rw-r--r--kernel/irq/settings.h12
-rw-r--r--kernel/irq/spurious.c26
-rw-r--r--kernel/jump_label.c158
-rw-r--r--kernel/kexec.c2531
-rw-r--r--kernel/kexec_core.c1534
-rw-r--r--kernel/kexec_file.c1047
-rw-r--r--kernel/kexec_internal.h22
-rw-r--r--kernel/kmod.c106
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/ksysfs.c6
-rw-r--r--kernel/kthread.c27
-rw-r--r--kernel/livepatch/core.c6
-rw-r--r--kernel/locking/Makefile4
-rw-r--r--kernel/locking/lockdep.c12
-rw-r--r--kernel/locking/locktorture.c164
-rw-r--r--kernel/locking/mcs_spinlock.h4
-rw-r--r--kernel/locking/mutex.c9
-rw-r--r--kernel/locking/osq_lock.c11
-rw-r--r--kernel/locking/percpu-rwsem.c103
-rw-r--r--kernel/locking/qrwlock.c55
-rw-r--r--kernel/locking/qspinlock.c8
-rw-r--r--kernel/locking/qspinlock_paravirt.h99
-rw-r--r--kernel/locking/rtmutex-tester.c420
-rw-r--r--kernel/locking/rtmutex.c35
-rw-r--r--kernel/locking/rtmutex_common.h22
-rw-r--r--kernel/locking/rwsem-xadd.c5
-rw-r--r--kernel/membarrier.c66
-rw-r--r--kernel/memremap.c200
-rw-r--r--kernel/module.c8
-rw-r--r--kernel/module_signing.c214
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/panic.c10
-rw-r--r--kernel/params.c3
-rw-r--r--kernel/pid.c5
-rw-r--r--kernel/power/Kconfig10
-rw-r--r--kernel/power/hibernate.c2
-rw-r--r--kernel/power/main.c17
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/suspend.c6
-rw-r--r--kernel/power/swap.c28
-rw-r--r--kernel/power/wakelock.c18
-rw-r--r--kernel/printk/printk.c16
-rw-r--r--kernel/profile.c8
-rw-r--r--kernel/ptrace.c18
-rw-r--r--kernel/rcu/Makefile2
-rw-r--r--kernel/rcu/rcutorture.c56
-rw-r--r--kernel/rcu/srcu.c19
-rw-r--r--kernel/rcu/sync.c223
-rw-r--r--kernel/rcu/tiny.c16
-rw-r--r--kernel/rcu/tree.c1036
-rw-r--r--kernel/rcu/tree.h141
-rw-r--r--kernel/rcu/tree_plugin.h559
-rw-r--r--kernel/rcu/tree_trace.c29
-rw-r--r--kernel/rcu/update.c92
-rw-r--r--kernel/reboot.c2
-rw-r--r--kernel/resource.c61
-rw-r--r--kernel/sched/core.c416
-rw-r--r--kernel/sched/cpudeadline.c5
-rw-r--r--kernel/sched/cpudeadline.h1
-rw-r--r--kernel/sched/cputime.c103
-rw-r--r--kernel/sched/deadline.c57
-rw-r--r--kernel/sched/debug.c48
-rw-r--r--kernel/sched/fair.c1213
-rw-r--r--kernel/sched/features.h29
-rw-r--r--kernel/sched/idle.c16
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/rt.c64
-rw-r--r--kernel/sched/sched.h99
-rw-r--r--kernel/sched/stop_task.c1
-rw-r--r--kernel/seccomp.c95
-rw-r--r--kernel/signal.c53
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/smpboot.c32
-rw-r--r--kernel/stop_machine.c124
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/sys_ni.c6
-rw-r--r--kernel/sysctl.c45
-rw-r--r--kernel/system_certificates.S20
-rw-r--r--kernel/system_keyring.c106
-rw-r--r--kernel/task_work.c12
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/clockevents.c42
-rw-r--r--kernel/time/clocksource.c9
-rw-r--r--kernel/time/hrtimer.c38
-rw-r--r--kernel/time/ntp.c21
-rw-r--r--kernel/time/ntp_internal.h2
-rw-r--r--kernel/time/posix-cpu-timers.c63
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c49
-rw-r--r--kernel/time/tick-common.c4
-rw-r--r--kernel/time/tick-sched.c87
-rw-r--r--kernel/time/time.c53
-rw-r--r--kernel/time/timeconst.bc2
-rw-r--r--kernel/time/timekeeping.c41
-rw-r--r--kernel/time/timer.c17
-rw-r--r--kernel/time/timer_list.c56
-rw-r--r--kernel/torture.c1
-rw-r--r--kernel/trace/Kconfig9
-rw-r--r--kernel/trace/blktrace.c37
-rw-r--r--kernel/trace/bpf_trace.c114
-rw-r--r--kernel/trace/ftrace.c208
-rw-r--r--kernel/trace/ring_buffer.c780
-rw-r--r--kernel/trace/ring_buffer_benchmark.c79
-rw-r--r--kernel/trace/trace.c458
-rw-r--r--kernel/trace/trace.h168
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_branch.c15
-rw-r--r--kernel/trace/trace_events.c531
-rw-r--r--kernel/trace/trace_events_filter.c62
-rw-r--r--kernel/trace/trace_export.c2
-rw-r--r--kernel/trace/trace_functions_graph.c67
-rw-r--r--kernel/trace/trace_irqsoff.c106
-rw-r--r--kernel/trace/trace_kdb.c8
-rw-r--r--kernel/trace/trace_kprobe.c20
-rw-r--r--kernel/trace/trace_mmiotrace.c4
-rw-r--r--kernel/trace/trace_output.c101
-rw-r--r--kernel/trace/trace_output.h4
-rw-r--r--kernel/trace/trace_printk.c14
-rw-r--r--kernel/trace/trace_probe.h8
-rw-r--r--kernel/trace/trace_sched_switch.c5
-rw-r--r--kernel/trace/trace_sched_wakeup.c124
-rw-r--r--kernel/trace/trace_stack.c155
-rw-r--r--kernel/trace/trace_syscalls.c3
-rw-r--r--kernel/trace/trace_uprobe.c22
-rw-r--r--kernel/tracepoint.c61
-rw-r--r--kernel/user_namespace.c5
-rw-r--r--kernel/watchdog.c290
-rw-r--r--kernel/workqueue.c62
172 files changed, 12655 insertions, 8476 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 790d83c7d160..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,4 +5,3 @@ config_data.h
config_data.gz
timeconst.h
hz.bc
-x509_certificate_list
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..53abf008ecb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -45,16 +45,18 @@ ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_KEXEC) += kexec.o
+obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
@@ -64,7 +66,7 @@ obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KPROBES) += kprobes.o
@@ -98,6 +100,9 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
+
+obj-$(CONFIG_HAS_IOMEM) += memremap.o
$(obj)/configs.o: $(obj)/config_data.h
@@ -111,99 +116,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
-
-###############################################################################
-#
-# Roll all the X.509 certificates that we can find together and pull them into
-# the kernel so that they get loaded into the system trusted keyring during
-# boot.
-#
-# We look in the source root and the build root for all files whose name ends
-# in ".x509". Unfortunately, this will generate duplicate filenames, so we
-# have make canonicalise the pathnames and then sort them to discard the
-# duplicates.
-#
-###############################################################################
-ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
-X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
-X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
-X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
- $(or $(realpath $(CERT)),$(CERT))))
-X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
-
-ifeq ($(X509_CERTIFICATES),)
-$(warning *** No X.509 certificates found ***)
-endif
-
-ifneq ($(wildcard $(obj)/.x509.list),)
-ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
-$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)")
-$(shell rm $(obj)/.x509.list)
-endif
-endif
-
-kernel/system_certificates.o: $(obj)/x509_certificate_list
-
-quiet_cmd_x509certs = CERTS $@
- cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
-
-targets += $(obj)/x509_certificate_list
-$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
- $(call if_changed,x509certs)
-
-targets += $(obj)/.x509.list
-$(obj)/.x509.list:
- @echo $(X509_CERTIFICATES) >$@
-endif
-
-clean-files := x509_certificate_list .x509.list
-
-ifeq ($(CONFIG_MODULE_SIG),y)
-###############################################################################
-#
-# If module signing is requested, say by allyesconfig, but a key has not been
-# supplied, then one will need to be generated to make sure the build does not
-# fail and that the kernel may be used afterwards.
-#
-###############################################################################
-ifndef CONFIG_MODULE_SIG_HASH
-$(error Could not determine digest type to use from kernel config)
-endif
-
-signing_key.priv signing_key.x509: x509.genkey
- @echo "###"
- @echo "### Now generating an X.509 key pair to be used for signing modules."
- @echo "###"
- @echo "### If this takes a long time, you might wish to run rngd in the"
- @echo "### background to keep the supply of entropy topped up. It"
- @echo "### needs to be run as root, and uses a hardware random"
- @echo "### number generator if one is available."
- @echo "###"
- openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
- -batch -x509 -config x509.genkey \
- -outform DER -out signing_key.x509 \
- -keyout signing_key.priv 2>&1
- @echo "###"
- @echo "### Key pair generated."
- @echo "###"
-
-x509.genkey:
- @echo Generating X.509 key generation config
- @echo >x509.genkey "[ req ]"
- @echo >>x509.genkey "default_bits = 4096"
- @echo >>x509.genkey "distinguished_name = req_distinguished_name"
- @echo >>x509.genkey "prompt = no"
- @echo >>x509.genkey "string_mask = utf8only"
- @echo >>x509.genkey "x509_extensions = myexts"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ req_distinguished_name ]"
- @echo >>x509.genkey "#O = Unspecified company"
- @echo >>x509.genkey "CN = Build time autogenerated kernel key"
- @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ myexts ]"
- @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
- @echo >>x509.genkey "keyUsage=digitalSignature"
- @echo >>x509.genkey "subjectKeyIdentifier=hash"
- @echo >>x509.genkey "authorityKeyIdentifier=keyid"
-endif
diff --git a/kernel/audit.c b/kernel/audit.c
index f9e6065346db..5ffcbd354a52 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -407,16 +407,33 @@ static void audit_printk_skb(struct sk_buff *skb)
static void kauditd_send_skb(struct sk_buff *skb)
{
int err;
+ int attempts = 0;
+#define AUDITD_RETRIES 5
+
+restart:
/* take a reference in case we can't send it and we want to hold it */
skb_get(skb);
err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
if (err < 0) {
- BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
+ pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
+ audit_pid, err);
if (audit_pid) {
- pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd disappeared");
- audit_pid = 0;
- audit_sock = NULL;
+ if (err == -ECONNREFUSED || err == -EPERM
+ || ++attempts >= AUDITD_RETRIES) {
+ char s[32];
+
+ snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
+ audit_log_lost(s);
+ audit_pid = 0;
+ audit_sock = NULL;
+ } else {
+ pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
+ attempts, audit_pid);
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ goto restart;
+ }
}
/* we might get lucky and get this in the next auditd */
audit_hold_skb(skb);
@@ -684,25 +701,22 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
+static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
{
- int rc = 0;
uid_t uid = from_kuid(&init_user_ns, current_uid());
pid_t pid = task_tgid_nr(current);
if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
- return rc;
+ return;
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
- return rc;
+ return;
audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
audit_log_session_info(*ab);
audit_log_task_context(*ab);
-
- return rc;
}
int is_audit_feature_set(int i)
@@ -1357,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (unlikely(audit_filter_type(type)))
return NULL;
- if (gfp_mask & __GFP_WAIT) {
+ if (gfp_mask & __GFP_DIRECT_RECLAIM) {
if (audit_pid && audit_pid == current->pid)
- gfp_mask &= ~__GFP_WAIT;
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
else
reserve = 0;
}
while (audit_backlog_limit
&& skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
long sleep_time;
sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
@@ -1566,14 +1580,14 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
* @string: string to be checked
* @len: max length of the string to check
*/
-int audit_string_contains_control(const char *string, size_t len)
+bool audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
for (p = string; p < (const unsigned char *)string + len; p++) {
if (*p == '"' || *p < 0x21 || *p > 0x7e)
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/**
@@ -1761,7 +1775,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
} else
audit_log_format(ab, " name=(null)");
- if (n->ino != (unsigned long)-1)
+ if (n->ino != AUDIT_INO_UNSET)
audit_log_format(ab, " inode=%lu"
" dev=%02x:%02x mode=%#ho"
" ouid=%u ogid=%u rdev=%02x:%02x",
diff --git a/kernel/audit.h b/kernel/audit.h
index d641f9bb3ed0..de6cbb7cf547 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -50,6 +50,7 @@ enum audit_state {
/* Rule lists */
struct audit_watch;
+struct audit_fsnotify_mark;
struct audit_tree;
struct audit_chunk;
@@ -252,6 +253,7 @@ struct audit_net {
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
+extern int audit_del_rule(struct audit_entry *);
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
@@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+
+extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
+extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
+extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
+extern void audit_remove_mark_rule(struct audit_krule *krule);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
+extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
+extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
+
#else
#define audit_put_watch(w) {}
#define audit_get_watch(w) {}
@@ -278,12 +289,19 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev
#define audit_watch_path(w) ""
#define audit_watch_compare(w, i, d) 0
+#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
+#define audit_mark_path(m) ""
+#define audit_remove_mark(m)
+#define audit_remove_mark_rule(k)
+#define audit_mark_compare(m, i, d) 0
+#define audit_exe_compare(t, m) (-EINVAL)
+#define audit_dupe_exe(n, o) (-EINVAL)
#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
extern void audit_put_chunk(struct audit_chunk *);
-extern int audit_tree_match(struct audit_chunk *, struct audit_tree *);
+extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *);
extern int audit_make_tree(struct audit_krule *, char *, u32);
extern int audit_add_tree_rule(struct audit_krule *);
extern int audit_remove_tree_rule(struct audit_krule *);
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
new file mode 100644
index 000000000000..27c6046c2c3d
--- /dev/null
+++ b/kernel/audit_fsnotify.c
@@ -0,0 +1,216 @@
+/* audit_fsnotify.c -- tracking inodes
+ *
+ * Copyright 2003-2009,2014-2015 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * this mark lives on the parent directory of the inode in question,
+ * but dev, ino, and path describe the child (the watched object itself).
+ */
+struct audit_fsnotify_mark {
+ dev_t dev; /* associated superblock device, AUDIT_DEV_UNSET if no inode */
+ unsigned long ino; /* associated inode number, AUDIT_INO_UNSET if no inode */
+ char *path; /* insertion path; kfree()d together with the mark */
+ struct fsnotify_mark mark; /* embedded fsnotify mark, added to the parent directory inode */
+ struct audit_krule *rule; /* audit rule this mark was allocated for */
+};
+
+/* fsnotify handle. */
+static struct fsnotify_group *audit_fsnotify_group;
+
+/* fsnotify events we care about. */
+#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+
+/* Release an audit mark together with the path string it points at. */
+static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
+{
+	char *path = audit_mark->path;
+
+	kfree(path);
+	kfree(audit_mark);
+}
+
+/*
+ * Callback registered through fsnotify_init_mark(): map the embedded
+ * fsnotify mark back to its enclosing audit mark and free that.
+ */
+static void audit_fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+	audit_fsnotify_mark_free(container_of(mark, struct audit_fsnotify_mark,
+					      mark));
+}
+
+/* Accessor for the path string stored in @mark (not copied). */
+char *audit_mark_path(struct audit_fsnotify_mark *mark)
+{
+	char *path = mark->path;
+
+	return path;
+}
+
+/*
+ * Return nonzero when @mark currently tracks an inode and that inode is
+ * identified by (@ino, @dev); a mark whose inode is unset never matches.
+ */
+int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev)
+{
+	return mark->ino != AUDIT_INO_UNSET &&
+	       mark->ino == ino &&
+	       mark->dev == dev;
+}
+
+/*
+ * Record the device and inode number of @inode in @audit_mark, or reset
+ * both to their UNSET values when @inode is NULL.
+ */
+static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
+			     struct inode *inode)
+{
+	if (!inode) {
+		audit_mark->dev = AUDIT_DEV_UNSET;
+		audit_mark->ino = AUDIT_INO_UNSET;
+		return;
+	}
+
+	audit_mark->dev = inode->i_sb->s_dev;
+	audit_mark->ino = inode->i_ino;
+}
+
+/*
+ * Allocate an audit mark for @pathname and add it to the audit fsnotify
+ * group on the inode of the path returned by kern_path_locked().
+ *
+ * @krule:    rule the new mark is recorded against
+ * @pathname: absolute path, @len bytes long, that must not end in '/'
+ * @len:      length of @pathname
+ *
+ * On success the returned mark takes ownership of @pathname (it is
+ * kfree()d when the mark is freed).  On failure an ERR_PTR() is returned
+ * and @pathname still belongs to the caller, which is expected to free it.
+ */
+struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len)
+{
+	struct audit_fsnotify_mark *audit_mark;
+	struct path path;
+	struct dentry *dentry;
+	struct inode *inode;
+	int ret;
+
+	/* len <= 0 would make pathname[len-1] read outside the string */
+	if (len <= 0 || pathname[0] != '/' || pathname[len-1] == '/')
+		return ERR_PTR(-EINVAL);
+
+	dentry = kern_path_locked(pathname, &path);
+	if (IS_ERR(dentry))
+		return ERR_CAST(dentry); /* returning an error */
+	inode = path.dentry->d_inode;
+	mutex_unlock(&inode->i_mutex);
+
+	audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
+	if (unlikely(!audit_mark)) {
+		audit_mark = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
+	audit_mark->mark.mask = AUDIT_FS_EVENTS;
+	audit_mark->path = pathname;
+	audit_update_mark(audit_mark, dentry->d_inode);
+	audit_mark->rule = krule;
+
+	ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
+	if (ret < 0) {
+		/*
+		 * The callers kfree() pathname themselves when we return an
+		 * error, so detach it first; otherwise freeing the mark would
+		 * free the string a second time.
+		 */
+		audit_mark->path = NULL;
+		/*
+		 * Drop the reference taken by fsnotify_init_mark() rather
+		 * than freeing directly; the registered free callback runs
+		 * once the last reference is gone.
+		 * NOTE(review): assumes fsnotify_add_mark() leaves no extra
+		 * unbalanced reference behind on failure - confirm.
+		 */
+		fsnotify_put_mark(&audit_mark->mark);
+		audit_mark = ERR_PTR(ret);
+	}
+out:
+	dput(dentry);
+	path_put(&path);
+	return audit_mark;
+}
+
+/*
+ * Emit an AUDIT_CONFIG_CHANGE record describing operation @op on the rule
+ * attached to @audit_mark.  Nothing is logged when auditing is disabled or
+ * no audit buffer can be obtained.
+ */
+static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op)
+{
+	struct audit_krule *krule;
+	struct audit_buffer *buf;
+
+	if (!audit_enabled)
+		return;
+
+	buf = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+	if (unlikely(!buf))
+		return;
+
+	krule = audit_mark->rule;
+
+	/* who made the change, then what changed */
+	audit_log_format(buf, "auid=%u ses=%u op=",
+			 from_kuid(&init_user_ns, audit_get_loginuid(current)),
+			 audit_get_sessionid(current));
+	audit_log_string(buf, op);
+
+	audit_log_format(buf, " path=");
+	audit_log_untrustedstring(buf, audit_mark->path);
+
+	audit_log_key(buf, krule->filterkey);
+	audit_log_format(buf, " list=%d res=1", krule->listnr);
+
+	audit_log_end(buf);
+}
+
+/*
+ * Destroy @audit_mark's embedded fsnotify mark within the audit fsnotify
+ * group, then put a reference on it.
+ */
+void audit_remove_mark(struct audit_fsnotify_mark *audit_mark)
+{
+ fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group);
+ fsnotify_put_mark(&audit_mark->mark);
+}
+
+/* Remove the mark referenced by @krule's exe field. */
+void audit_remove_mark_rule(struct audit_krule *krule)
+{
+	audit_remove_mark(krule->exe);
+}
+
+/*
+ * Log an "autoremove_rule" change for the rule behind @audit_mark and then
+ * delete the audit entry that contains that rule.
+ */
+static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
+{
+	struct audit_entry *entry;
+
+	/* resolve the entry before anything below can touch the rule */
+	entry = container_of(audit_mark->rule, struct audit_entry, rule);
+
+	audit_mark_log_rule_change(audit_mark, "autoremove_rule");
+	audit_del_rule(entry);
+}
+
+/*
+ * fsnotify callback for the audit fsnotify group.
+ *
+ * Create/move/delete events on a child refresh the dev/ino stored in the
+ * audit mark, provided the event's name matches the mark's path.  Events
+ * reporting that the marked object itself was deleted, moved or unmounted
+ * remove the associated rule.  Always returns 0.
+ */
+static int audit_mark_handle_event(struct fsnotify_group *group,
+				   struct inode *to_tell,
+				   struct fsnotify_mark *inode_mark,
+				   struct fsnotify_mark *vfsmount_mark,
+				   u32 mask, void *data, int data_type,
+				   const unsigned char *dname, u32 cookie)
+{
+	struct audit_fsnotify_mark *audit_mark =
+		container_of(inode_mark, struct audit_fsnotify_mark, mark);
+	struct inode *inode = NULL;
+
+	BUG_ON(group != audit_fsnotify_group);
+
+	/* pull the affected inode out of the event payload */
+	if (data_type == FSNOTIFY_EVENT_PATH) {
+		inode = ((struct path *)data)->dentry->d_inode;
+	} else if (data_type == FSNOTIFY_EVENT_INODE) {
+		inode = (struct inode *)data;
+	} else {
+		BUG();
+		return 0;
+	}
+
+	if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
+		/* a nonzero comparison result means the event is not for our path */
+		if (!audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
+			audit_update_mark(audit_mark, inode);
+		return 0;
+	}
+
+	if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+		audit_autoremove_mark_rule(audit_mark);
+
+	return 0;
+}
+
+static const struct fsnotify_ops audit_mark_fsnotify_ops = {
+ .handle_event = audit_mark_handle_event, /* only callback set for this group */
+};
+
+/*
+ * Create the fsnotify group used for audit marks.  If allocation fails the
+ * group pointer is reset to NULL and audit_panic() is invoked; the initcall
+ * itself always reports success.
+ */
+static int __init audit_fsnotify_init(void)
+{
+	audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
+	if (!IS_ERR(audit_fsnotify_group))
+		return 0;
+
+	audit_fsnotify_group = NULL;
+	audit_panic("cannot create audit fsnotify group");
+	return 0;
+}
+device_initcall(audit_fsnotify_init);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0f9877273fc..5efe9b299a12 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -197,13 +197,13 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
return NULL;
}
-int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
+bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
{
int n;
for (n = 0; n < chunk->count; n++)
if (chunk->owners[n].owner == tree)
- return 1;
- return 0;
+ return true;
+ return false;
}
/* tagging and untagging inodes with trees */
@@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree)
if (rule->tree) {
/* not a half-baked one */
audit_tree_log_remove_rule(rule);
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 6e30024d9aac..656c7e93ac0d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch)
int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
- return (watch->ino != (unsigned long)-1) &&
+ return (watch->ino != AUDIT_INO_UNSET) &&
(watch->ino == ino) &&
(watch->dev == dev);
}
@@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path)
INIT_LIST_HEAD(&watch->rules);
atomic_set(&watch->count, 1);
watch->path = path;
- watch->dev = (dev_t)-1;
- watch->ino = (unsigned long)-1;
+ watch->dev = AUDIT_DEV_UNSET;
+ watch->ino = AUDIT_INO_UNSET;
return watch;
}
@@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
if (IS_ERR(watch))
return PTR_ERR(watch);
- audit_get_watch(watch);
krule->watch = watch;
return 0;
@@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent,
list_replace(&oentry->rule.list,
&nentry->rule.list);
}
+ if (oentry->rule.exe)
+ audit_remove_mark(oentry->rule.exe);
audit_watch_log_rule_change(r, owatch, "updated_rules");
@@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
audit_watch_log_rule_change(r, w, "remove_rule");
+ if (e->rule.exe)
+ audit_remove_mark(e->rule.exe);
list_del(&r->rlist);
list_del(&r->list);
list_del_rcu(&e->list);
@@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule,
watch_found = 1;
- /* put krule's and initial refs to temporary watch */
- audit_put_watch(watch);
+ /* put krule's ref to temporary watch */
audit_put_watch(watch);
audit_get_watch(w);
krule->watch = watch = w;
+
+ audit_put_parent(parent);
break;
}
if (!watch_found) {
- audit_get_parent(parent);
watch->parent = parent;
+ audit_get_watch(watch);
list_add(&watch->wlist, &parent->watches);
}
list_add(&krule->rlist, &watch->rules);
@@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
audit_add_to_parent(krule, parent);
- /* match get in audit_find_parent or audit_init_parent */
- audit_put_parent(parent);
-
h = audit_hash_ino((u32)watch->ino);
*list = &audit_inode_hash[h];
error:
@@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
else if (mask & (FS_DELETE|FS_MOVED_FROM))
- audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+ audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
@@ -517,3 +518,36 @@ static int __init audit_watch_init(void)
return 0;
}
device_initcall(audit_watch_init);
+
+/*
+ * Give @new its own exe mark, built from a private copy of the path held
+ * by @old's exe mark.
+ *
+ * Returns 0 on success, -ENOMEM if the path cannot be duplicated, or the
+ * error from audit_alloc_mark(); on that last failure the copied path is
+ * freed here.
+ */
+int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
+{
+	struct audit_fsnotify_mark *mark;
+	char *path_copy = kstrdup(audit_mark_path(old->exe), GFP_KERNEL);
+
+	if (!path_copy)
+		return -ENOMEM;
+
+	mark = audit_alloc_mark(new, path_copy, strlen(path_copy));
+	if (!IS_ERR(mark)) {
+		new->exe = mark;
+		return 0;
+	}
+
+	kfree(path_copy);
+	return PTR_ERR(mark);
+}
+
+/*
+ * Compare the executable file of @tsk against @mark.
+ *
+ * Returns the result of audit_mark_compare() on the inode number and
+ * device of the task's exe_file.  A task that has no mm, or whose mm has
+ * no exe_file, cannot match and yields 0 instead of dereferencing NULL.
+ */
+int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
+{
+	struct mm_struct *mm = tsk->mm;
+	struct file *exe_file;
+	unsigned long ino;
+	dev_t dev;
+
+	/* no address space means no executable to compare against */
+	if (!mm)
+		return 0;
+
+	rcu_read_lock();
+	exe_file = rcu_dereference(mm->exe_file);
+	if (!exe_file) {
+		rcu_read_unlock();
+		return 0;
+	}
+	ino = exe_file->f_inode->i_ino;
+	dev = exe_file->f_inode->i_sb->s_dev;
+	rcu_read_unlock();
+
+	return audit_mark_compare(mark, ino, dev);
+}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 72e1660a79a3..b8ff9e193753 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -39,13 +39,13 @@
* Locking model:
*
* audit_filter_mutex:
- * Synchronizes writes and blocking reads of audit's filterlist
- * data. Rcu is used to traverse the filterlist and access
- * contents of structs audit_entry, audit_watch and opaque
- * LSM rules during filtering. If modified, these structures
- * must be copied and replace their counterparts in the filterlist.
- * An audit_parent struct is not accessed during filtering, so may
- * be written directly provided audit_filter_mutex is held.
+ * Synchronizes writes and blocking reads of audit's filterlist
+ * data. Rcu is used to traverse the filterlist and access
+ * contents of structs audit_entry, audit_watch and opaque
+ * LSM rules during filtering. If modified, these structures
+ * must be copied and replace their counterparts in the filterlist.
+ * An audit_parent struct is not accessed during filtering, so may
+ * be written directly provided audit_filter_mutex is held.
*/
/* Audit filter lists, defined in <linux/audit.h> */
@@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (f->val > AUDIT_MAX_FIELD_COMPARE)
return -EINVAL;
break;
+ case AUDIT_EXE:
+ if (f->op != Audit_equal)
+ return -EINVAL;
+ if (entry->rule.listnr != AUDIT_FILTER_EXIT)
+ return -EINVAL;
+ break;
};
return 0;
}
@@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
size_t remain = datasz - sizeof(struct audit_rule_data);
int i;
char *str;
+ struct audit_fsnotify_mark *audit_mark;
entry = audit_to_entry_common(data);
if (IS_ERR(entry))
@@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
+ case AUDIT_EXE:
+ if (entry->rule.exe || f->val > PATH_MAX)
+ goto exit_free;
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
+ goto exit_free;
+ }
+ entry->rule.buflen += f->val;
+
+ audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+ if (IS_ERR(audit_mark)) {
+ kfree(str);
+ err = PTR_ERR(audit_mark);
+ goto exit_free;
+ }
+ entry->rule.exe = audit_mark;
+ break;
}
}
@@ -549,10 +574,10 @@ exit_nofree:
return entry;
exit_free:
- if (entry->rule.watch)
- audit_put_watch(entry->rule.watch); /* matches initial get */
if (entry->rule.tree)
audit_put_tree(entry->rule.tree); /* that's the temporary one */
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe); /* that's the template one */
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->buflen += data->values[i] =
audit_pack_string(&bufp, krule->filterkey);
break;
+ case AUDIT_EXE:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, audit_mark_path(krule->exe));
+ break;
case AUDIT_LOGINUID_SET:
if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
data->fields[i] = AUDIT_LOGINUID;
@@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
+ case AUDIT_EXE:
+ /* both paths exist based on above type compare */
+ if (strcmp(audit_mark_path(a->exe),
+ audit_mark_path(b->exe)))
+ return 1;
+ break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
@@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
err = -ENOMEM;
else
new->filterkey = fk;
+ break;
+ case AUDIT_EXE:
+ err = audit_dupe_exe(new, old);
+ break;
}
if (err) {
+ if (new->exe)
+ audit_remove_mark(new->exe);
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- int err;
+ int err = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
/* normally audit_add_tree_rule() will free it on failure */
if (tree)
audit_put_tree(tree);
- goto error;
+ return err;
}
if (watch) {
@@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry)
*/
if (tree)
audit_put_tree(tree);
- goto error;
+ return err;
}
}
if (tree) {
err = audit_add_tree_rule(&entry->rule);
if (err) {
mutex_unlock(&audit_filter_mutex);
- goto error;
+ return err;
}
}
@@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- return 0;
-
-error:
- if (watch)
- audit_put_watch(watch); /* tmp watch, matches initial get */
return err;
}
/* Remove an existing rule from filterlist. */
-static inline int audit_del_rule(struct audit_entry *entry)
+int audit_del_rule(struct audit_entry *entry)
{
struct audit_entry *e;
- struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
int ret = 0;
@@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
mutex_lock(&audit_filter_mutex);
e = audit_find_rule(entry, &list);
if (!e) {
- mutex_unlock(&audit_filter_mutex);
ret = -ENOENT;
goto out;
}
@@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
- list_del_rcu(&e->list);
- list_del(&e->rule.list);
- call_rcu(&e->rcu, audit_free_rule_rcu);
+ if (e->rule.exe)
+ audit_remove_mark_rule(&e->rule);
#ifdef CONFIG_AUDITSYSCALL
if (!dont_count)
@@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry)
if (!audit_match_signal(entry))
audit_signals--;
#endif
- mutex_unlock(&audit_filter_mutex);
+
+ list_del_rcu(&e->list);
+ list_del(&e->rule.list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
out:
- if (watch)
- audit_put_watch(watch); /* match initial get */
+ mutex_unlock(&audit_filter_mutex);
+
if (tree)
audit_put_tree(tree); /* that's the temporary one */
@@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
WARN_ON(1);
}
- if (err || type == AUDIT_DEL_RULE)
+ if (err || type == AUDIT_DEL_RULE) {
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
audit_free_rule(entry);
+ }
return err;
}
@@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r)
return 0;
nentry = audit_dupe_rule(r);
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
if (IS_ERR(nentry)) {
/* save the first error encountered for the
* return value */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e85bdfd15fed..b86cc04959de 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
return 0;
list_for_each_entry(n, &ctx->names_list, list) {
- if ((n->ino != -1) &&
+ if ((n->ino != AUDIT_INO_UNSET) &&
((n->mode & S_IFMT) == mode))
return 1;
}
@@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
+ case AUDIT_EXE:
+ result = audit_exe_compare(tsk, rule->exe);
+ break;
case AUDIT_UID:
result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
@@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
aname->should_free = true;
}
- aname->ino = (unsigned long)-1;
+ aname->ino = AUDIT_INO_UNSET;
aname->type = type;
list_add_tail(&aname->list, &context->names_list);
@@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent,
if (inode)
audit_copy_inode(found_child, dentry, inode);
else
- found_child->ino = (unsigned long)-1;
+ found_child->ino = AUDIT_INO_UNSET;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e6983be12bd3..13272582eee0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,2 +1,4 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
+
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..3f4c99e06c6b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
+#include <linux/perf_event.h>
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -48,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
-
+ array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
array->elem_size = elem_size;
return &array->map;
@@ -150,15 +151,15 @@ static int __init register_array_map(void)
}
late_initcall(register_array_map);
-static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
- /* only bpf_prog file descriptors can be stored in prog_array map */
+ /* only file descriptors can be stored in this type of map */
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
return array_map_alloc(attr);
}
-static void prog_array_map_free(struct bpf_map *map)
+static void fd_array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
@@ -167,21 +168,21 @@ static void prog_array_map_free(struct bpf_map *map)
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
- BUG_ON(array->prog[i] != NULL);
+ BUG_ON(array->ptrs[i] != NULL);
kvfree(array);
}
-static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
return NULL;
}
/* only called from syscall */
-static int prog_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct bpf_prog *prog, *old_prog;
+ void *new_ptr, *old_ptr;
u32 index = *(u32 *)key, ufd;
if (map_flags != BPF_ANY)
@@ -191,57 +192,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
return -E2BIG;
ufd = *(u32 *)value;
- prog = bpf_prog_get(ufd);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
-
- if (!bpf_prog_array_compatible(array, prog)) {
- bpf_prog_put(prog);
- return -EINVAL;
- }
+ new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ if (IS_ERR(new_ptr))
+ return PTR_ERR(new_ptr);
- old_prog = xchg(array->prog + index, prog);
- if (old_prog)
- bpf_prog_put_rcu(old_prog);
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ if (old_ptr)
+ map->ops->map_fd_put_ptr(old_ptr);
return 0;
}
-static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- struct bpf_prog *old_prog;
+ void *old_ptr;
u32 index = *(u32 *)key;
if (index >= array->map.max_entries)
return -E2BIG;
- old_prog = xchg(array->prog + index, NULL);
- if (old_prog) {
- bpf_prog_put_rcu(old_prog);
+ old_ptr = xchg(array->ptrs + index, NULL);
+ if (old_ptr) {
+ map->ops->map_fd_put_ptr(old_ptr);
return 0;
} else {
return -ENOENT;
}
}
+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog = bpf_prog_get(fd);
+ if (IS_ERR(prog))
+ return prog;
+
+ if (!bpf_prog_array_compatible(array, prog)) {
+ bpf_prog_put(prog);
+ return ERR_PTR(-EINVAL);
+ }
+ return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+ struct bpf_prog *prog = ptr;
+
+ bpf_prog_put_rcu(prog);
+}
+
/* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_prog_array_map_clear(struct bpf_map *map)
+void bpf_fd_array_map_clear(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
for (i = 0; i < array->map.max_entries; i++)
- prog_array_map_delete_elem(map, &i);
+ fd_array_map_delete_elem(map, &i);
}
static const struct bpf_map_ops prog_array_ops = {
- .map_alloc = prog_array_map_alloc,
- .map_free = prog_array_map_free,
+ .map_alloc = fd_array_map_alloc,
+ .map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
- .map_lookup_elem = prog_array_map_lookup_elem,
- .map_update_elem = prog_array_map_update_elem,
- .map_delete_elem = prog_array_map_delete_elem,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = prog_fd_array_get_ptr,
+ .map_fd_put_ptr = prog_fd_array_put_ptr,
};
static struct bpf_map_type_list prog_array_type __read_mostly = {
@@ -255,3 +274,69 @@ static int __init register_prog_array_map(void)
return 0;
}
late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct perf_event *event;
+ const struct perf_event_attr *attr;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return event;
+
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr))
+ goto err;
+
+ if (attr->inherit)
+ goto err;
+
+ if (attr->type == PERF_TYPE_RAW)
+ return event;
+
+ if (attr->type == PERF_TYPE_HARDWARE)
+ return event;
+
+ if (attr->type == PERF_TYPE_SOFTWARE &&
+ attr->config == PERF_COUNT_SW_BPF_OUTPUT)
+ return event;
+err:
+ perf_event_release_kernel(event);
+ return ERR_PTR(-EINVAL);
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+ struct perf_event *event = ptr;
+
+ perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = perf_event_array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+ .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c5bedc82bc1c..334b1bdd572c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (fp == NULL)
return NULL;
+ kmemcheck_annotate_bitfield(fp, meta);
+
aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
if (aux == NULL) {
vfree(fp);
@@ -90,6 +92,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
+ fp->aux->prog = fp;
return fp;
}
@@ -110,8 +113,11 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
if (fp != NULL) {
+ kmemcheck_annotate_bitfield(fp, meta);
+
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = size / PAGE_SIZE;
+ fp->aux->prog = fp;
/* We keep fp->aux from fp_old around in the new
* reallocated structure.
@@ -177,6 +183,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
return 0;
}
+EXPORT_SYMBOL_GPL(__bpf_call_base);
/**
* __bpf_prog_run - run eBPF program on a given context
@@ -449,11 +456,15 @@ select_insn:
tail_call_cnt++;
- prog = READ_ONCE(array->prog[index]);
+ prog = READ_ONCE(array->ptrs[index]);
if (unlikely(!prog))
goto out;
- ARG1 = BPF_R1;
+ /* ARG1 at this point is guaranteed to point to CTX from
+ * the verifier side due to the fact that the tail call is
+ * handled like a helper, that is, bpf_tail_call_proto,
+ * where arg1_type is ARG_PTR_TO_CTX.
+ */
insn = prog->insnsi;
goto select_insn;
out:
@@ -717,11 +728,36 @@ void bpf_prog_free(struct bpf_prog *fp)
struct bpf_prog_aux *aux = fp->aux;
INIT_WORK(&aux->work, bpf_prog_free_deferred);
- aux->prog = fp;
schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
+/* RNG for unprivileged user space with separated state from prandom_u32(). */
+static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
+
+void bpf_user_rnd_init_once(void)
+{
+ prandom_init_once(&bpf_user_rnd_state);
+}
+
+u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* Should someone ever have the rather unwise idea to use some
+ * of the registers passed into this function, then note that
+ * this function is called from native eBPF and classic-to-eBPF
+ * transformations. Register assignments from both sides are
+ * different, f.e. classic always sets fn(ctx, A, X) here.
+ */
+ struct rnd_state *state;
+ u32 res;
+
+ state = &get_cpu_var(bpf_user_rnd_state);
+ res = prandom_u32_state(state);
+ put_cpu_var(state);
+
+ return res;
+}
+
/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d9b17a..19909b22b4f8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,7 +17,7 @@
struct bpf_htab {
struct bpf_map map;
struct hlist_head *buckets;
- spinlock_t lock;
+ raw_spinlock_t lock;
u32 count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
@@ -82,12 +82,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
for (i = 0; i < htab->n_buckets; i++)
INIT_HLIST_HEAD(&htab->buckets[i]);
- spin_lock_init(&htab->lock);
+ raw_spin_lock_init(&htab->lock);
htab->count = 0;
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8) +
htab->map.value_size;
+
+ htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+ htab->elem_size * htab->map.max_entries,
+ PAGE_SIZE) >> PAGE_SHIFT;
return &htab->map;
free_htab:
@@ -230,7 +234,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
l_new->hash = htab_map_hash(l_new->key, key_size);
/* bpf_map_update_elem() can be called in_irq() */
- spin_lock_irqsave(&htab->lock, flags);
+ raw_spin_lock_irqsave(&htab->lock, flags);
head = select_bucket(htab, l_new->hash);
@@ -266,11 +270,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
} else {
htab->count++;
}
- spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&htab->lock, flags);
return 0;
err:
- spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&htab->lock, flags);
kfree(l_new);
return ret;
}
@@ -291,7 +295,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
hash = htab_map_hash(key, key_size);
- spin_lock_irqsave(&htab->lock, flags);
+ raw_spin_lock_irqsave(&htab->lock, flags);
head = select_bucket(htab, hash);
@@ -304,7 +308,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
ret = 0;
}
- spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&htab->lock, flags);
return ret;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec09421e..4504ca66118d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
.arg2_type = ARG_PTR_TO_MAP_KEY,
};
-static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
- return prandom_u32();
-}
-
const struct bpf_func_proto bpf_get_prandom_u32_proto = {
- .func = bpf_get_prandom_u32,
+ .func = bpf_user_rnd_u32,
.gpl_only = false,
.ret_type = RET_INTEGER,
};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
new file mode 100644
index 000000000000..be6d726e31c9
--- /dev/null
+++ b/kernel/bpf/inode.c
@@ -0,0 +1,387 @@
+/*
+ * Minimal file system backend for holding eBPF maps and programs,
+ * used by bpf(2) object pinning.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <daniel@iogearbox.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/magic.h>
+#include <linux/major.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+enum bpf_type {
+ BPF_TYPE_UNSPEC = 0,
+ BPF_TYPE_PROG,
+ BPF_TYPE_MAP,
+};
+
+static void *bpf_any_get(void *raw, enum bpf_type type)
+{
+ switch (type) {
+ case BPF_TYPE_PROG:
+ atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+ break;
+ case BPF_TYPE_MAP:
+ atomic_inc(&((struct bpf_map *)raw)->refcnt);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return raw;
+}
+
+static void bpf_any_put(void *raw, enum bpf_type type)
+{
+ switch (type) {
+ case BPF_TYPE_PROG:
+ bpf_prog_put(raw);
+ break;
+ case BPF_TYPE_MAP:
+ bpf_map_put(raw);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
+{
+ void *raw;
+
+ *type = BPF_TYPE_MAP;
+ raw = bpf_map_get(ufd);
+ if (IS_ERR(raw)) {
+ *type = BPF_TYPE_PROG;
+ raw = bpf_prog_get(ufd);
+ }
+
+ return raw;
+}
+
+static const struct inode_operations bpf_dir_iops;
+
+static const struct inode_operations bpf_prog_iops = { };
+static const struct inode_operations bpf_map_iops = { };
+
+static struct inode *bpf_get_inode(struct super_block *sb,
+ const struct inode *dir,
+ umode_t mode)
+{
+ struct inode *inode;
+
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ case S_IFREG:
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOSPC);
+
+ inode->i_ino = get_next_ino();
+ inode->i_atime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime;
+ inode->i_ctime = inode->i_atime;
+
+ inode_init_owner(inode, dir, mode);
+
+ return inode;
+}
+
+static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
+{
+ *type = BPF_TYPE_UNSPEC;
+ if (inode->i_op == &bpf_prog_iops)
+ *type = BPF_TYPE_PROG;
+ else if (inode->i_op == &bpf_map_iops)
+ *type = BPF_TYPE_MAP;
+ else
+ return -EACCES;
+
+ return 0;
+}
+
+static bool bpf_dname_reserved(const struct dentry *dentry)
+{
+ return strchr(dentry->d_name.name, '.');
+}
+
+static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ if (bpf_dname_reserved(dentry))
+ return -EPERM;
+
+ inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &bpf_dir_iops;
+ inode->i_fop = &simple_dir_operations;
+
+ inc_nlink(inode);
+ inc_nlink(dir);
+
+ d_instantiate(dentry, inode);
+ dget(dentry);
+
+ return 0;
+}
+
+static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
+ umode_t mode, const struct inode_operations *iops)
+{
+ struct inode *inode;
+
+ if (bpf_dname_reserved(dentry))
+ return -EPERM;
+
+ inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = iops;
+ inode->i_private = dentry->d_fsdata;
+
+ d_instantiate(dentry, inode);
+ dget(dentry);
+
+ return 0;
+}
+
+static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
+ dev_t devt)
+{
+ enum bpf_type type = MINOR(devt);
+
+ if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
+ dentry->d_fsdata == NULL)
+ return -EPERM;
+
+ switch (type) {
+ case BPF_TYPE_PROG:
+ return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
+ case BPF_TYPE_MAP:
+ return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
+ default:
+ return -EPERM;
+ }
+}
+
+static const struct inode_operations bpf_dir_iops = {
+ .lookup = simple_lookup,
+ .mknod = bpf_mkobj,
+ .mkdir = bpf_mkdir,
+ .rmdir = simple_rmdir,
+ .unlink = simple_unlink,
+};
+
+static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+ enum bpf_type type)
+{
+ struct dentry *dentry;
+ struct inode *dir;
+ struct path path;
+ umode_t mode;
+ dev_t devt;
+ int ret;
+
+ dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ devt = MKDEV(UNNAMED_MAJOR, type);
+
+ ret = security_path_mknod(&path, dentry, mode, devt);
+ if (ret)
+ goto out;
+
+ dir = d_inode(path.dentry);
+ if (dir->i_op != &bpf_dir_iops) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ dentry->d_fsdata = raw;
+ ret = vfs_mknod(dir, dentry, mode, devt);
+ dentry->d_fsdata = NULL;
+out:
+ done_path_create(&path, dentry);
+ return ret;
+}
+
+int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+{
+ struct filename *pname;
+ enum bpf_type type;
+ void *raw;
+ int ret;
+
+ pname = getname(pathname);
+ if (IS_ERR(pname))
+ return PTR_ERR(pname);
+
+ raw = bpf_fd_probe_obj(ufd, &type);
+ if (IS_ERR(raw)) {
+ ret = PTR_ERR(raw);
+ goto out;
+ }
+
+ ret = bpf_obj_do_pin(pname, raw, type);
+ if (ret != 0)
+ bpf_any_put(raw, type);
+out:
+ putname(pname);
+ return ret;
+}
+
+static void *bpf_obj_do_get(const struct filename *pathname,
+ enum bpf_type *type)
+{
+ struct inode *inode;
+ struct path path;
+ void *raw;
+ int ret;
+
+ ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ inode = d_backing_inode(path.dentry);
+ ret = inode_permission(inode, MAY_WRITE);
+ if (ret)
+ goto out;
+
+ ret = bpf_inode_type(inode, type);
+ if (ret)
+ goto out;
+
+ raw = bpf_any_get(inode->i_private, *type);
+ touch_atime(&path);
+
+ path_put(&path);
+ return raw;
+out:
+ path_put(&path);
+ return ERR_PTR(ret);
+}
+
+int bpf_obj_get_user(const char __user *pathname)
+{
+ enum bpf_type type = BPF_TYPE_UNSPEC;
+ struct filename *pname;
+ int ret = -ENOENT;
+ void *raw;
+
+ pname = getname(pathname);
+ if (IS_ERR(pname))
+ return PTR_ERR(pname);
+
+ raw = bpf_obj_do_get(pname, &type);
+ if (IS_ERR(raw)) {
+ ret = PTR_ERR(raw);
+ goto out;
+ }
+
+ if (type == BPF_TYPE_PROG)
+ ret = bpf_prog_new_fd(raw);
+ else if (type == BPF_TYPE_MAP)
+ ret = bpf_map_new_fd(raw);
+ else
+ goto out;
+
+ if (ret < 0)
+ bpf_any_put(raw, type);
+out:
+ putname(pname);
+ return ret;
+}
+
+static void bpf_evict_inode(struct inode *inode)
+{
+ enum bpf_type type;
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+
+ if (!bpf_inode_type(inode, &type))
+ bpf_any_put(inode->i_private, type);
+}
+
+static const struct super_operations bpf_super_ops = {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+ .evict_inode = bpf_evict_inode,
+};
+
+static int bpf_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static struct tree_descr bpf_rfiles[] = { { "" } };
+ struct inode *inode;
+ int ret;
+
+ ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
+ if (ret)
+ return ret;
+
+ sb->s_op = &bpf_super_ops;
+
+ inode = sb->s_root->d_inode;
+ inode->i_op = &bpf_dir_iops;
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= S_ISVTX | S_IRWXUGO;
+
+ return 0;
+}
+
+static struct dentry *bpf_mount(struct file_system_type *type, int flags,
+ const char *dev_name, void *data)
+{
+ return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+}
+
+static struct file_system_type bpf_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "bpf",
+ .mount = bpf_mount,
+ .kill_sb = kill_litter_super,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+MODULE_ALIAS_FS("bpf");
+
+static int __init bpf_init(void)
+{
+ int ret;
+
+ ret = sysfs_create_mount_point(fs_kobj, "bpf");
+ if (ret)
+ return ret;
+
+ ret = register_filesystem(&bpf_fs_type);
+ if (ret)
+ sysfs_remove_mount_point(fs_kobj, "bpf");
+
+ return ret;
+}
+fs_initcall(bpf_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..0d3313d02a7e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
#include <linux/filter.h>
#include <linux/version.h>
+int sysctl_unprivileged_bpf_disabled __read_mostly;
+
static LIST_HEAD(bpf_map_types);
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -44,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+static int bpf_map_charge_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ atomic_long_add(map->pages, &user->locked_vm);
+
+ if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+ atomic_long_sub(map->pages, &user->locked_vm);
+ free_uid(user);
+ return -EPERM;
+ }
+ map->user = user;
+ return 0;
+}
+
+static void bpf_map_uncharge_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = map->user;
+
+ atomic_long_sub(map->pages, &user->locked_vm);
+ free_uid(user);
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
+ bpf_map_uncharge_memlock(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -72,7 +101,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
/* prog_array stores refcnt-ed bpf_prog pointers
* release them all when user space closes prog_array_fd
*/
- bpf_prog_array_map_clear(map);
+ bpf_fd_array_map_clear(map);
bpf_map_put(map);
return 0;
@@ -82,6 +111,12 @@ static const struct file_operations bpf_map_fops = {
.release = bpf_map_release,
};
+int bpf_map_new_fd(struct bpf_map *map)
+{
+ return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
+ O_RDWR | O_CLOEXEC);
+}
+
/* helper macro to check that unused fields 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
@@ -108,8 +143,11 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->refcnt, 1);
- err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map;
+ err = bpf_map_new_fd(map);
if (err < 0)
/* failed to allocate fd */
goto free_map;
@@ -124,19 +162,29 @@ free_map:
/* if error is returned, fd is released.
* On success caller should complete fd access with matching fdput()
*/
-struct bpf_map *bpf_map_get(struct fd f)
+struct bpf_map *__bpf_map_get(struct fd f)
{
- struct bpf_map *map;
-
if (!f.file)
return ERR_PTR(-EBADF);
-
if (f.file->f_op != &bpf_map_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- map = f.file->private_data;
+ return f.file->private_data;
+}
+
+struct bpf_map *bpf_map_get(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return map;
+
+ atomic_inc(&map->refcnt);
+ fdput(f);
return map;
}
@@ -155,15 +203,16 @@ static int map_lookup_elem(union bpf_attr *attr)
void __user *ukey = u64_to_ptr(attr->key);
void __user *uvalue = u64_to_ptr(attr->value);
int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
struct bpf_map *map;
void *key, *value, *ptr;
+ struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
return -EINVAL;
- map = bpf_map_get(f);
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -213,15 +262,16 @@ static int map_update_elem(union bpf_attr *attr)
void __user *ukey = u64_to_ptr(attr->key);
void __user *uvalue = u64_to_ptr(attr->value);
int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
struct bpf_map *map;
void *key, *value;
+ struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
return -EINVAL;
- map = bpf_map_get(f);
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -265,15 +315,16 @@ static int map_delete_elem(union bpf_attr *attr)
{
void __user *ukey = u64_to_ptr(attr->key);
int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
struct bpf_map *map;
+ struct fd f;
void *key;
int err;
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
- map = bpf_map_get(f);
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -305,15 +356,16 @@ static int map_get_next_key(union bpf_attr *attr)
void __user *ukey = u64_to_ptr(attr->key);
void __user *unext_key = u64_to_ptr(attr->next_key);
int ufd = attr->map_fd;
- struct fd f = fdget(ufd);
struct bpf_map *map;
void *key, *next_key;
+ struct fd f;
int err;
if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
return -EINVAL;
- map = bpf_map_get(f);
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -398,6 +450,10 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
*/
BUG_ON(!prog->aux->ops->get_func_proto);
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
if (insn->imm == BPF_FUNC_tail_call) {
/* mark bpf_tail_call as different opcode
* to avoid conditional branch in
@@ -432,29 +488,51 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
-static void __prog_put_rcu(struct rcu_head *rcu)
+static int bpf_prog_charge_memlock(struct bpf_prog *prog)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ atomic_long_add(prog->pages, &user->locked_vm);
+ if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+ atomic_long_sub(prog->pages, &user->locked_vm);
+ free_uid(user);
+ return -EPERM;
+ }
+ prog->aux->user = user;
+ return 0;
+}
+
+static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
+{
+ struct user_struct *user = prog->aux->user;
+
+ atomic_long_sub(prog->pages, &user->locked_vm);
+ free_uid(user);
+}
+
+static void __prog_put_common(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
free_used_maps(aux);
+ bpf_prog_uncharge_memlock(aux->prog);
bpf_prog_free(aux->prog);
}
/* version of bpf_prog_put() that is called after a grace period */
void bpf_prog_put_rcu(struct bpf_prog *prog)
{
- if (atomic_dec_and_test(&prog->aux->refcnt)) {
- prog->aux->prog = prog;
- call_rcu(&prog->aux->rcu, __prog_put_rcu);
- }
+ if (atomic_dec_and_test(&prog->aux->refcnt))
+ call_rcu(&prog->aux->rcu, __prog_put_common);
}
void bpf_prog_put(struct bpf_prog *prog)
{
- if (atomic_dec_and_test(&prog->aux->refcnt)) {
- free_used_maps(prog->aux);
- bpf_prog_free(prog);
- }
+ if (atomic_dec_and_test(&prog->aux->refcnt))
+ __prog_put_common(&prog->aux->rcu);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -470,21 +548,22 @@ static const struct file_operations bpf_prog_fops = {
.release = bpf_prog_release,
};
-static struct bpf_prog *get_prog(struct fd f)
+int bpf_prog_new_fd(struct bpf_prog *prog)
{
- struct bpf_prog *prog;
+ return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
+ O_RDWR | O_CLOEXEC);
+}
+static struct bpf_prog *__bpf_prog_get(struct fd f)
+{
if (!f.file)
return ERR_PTR(-EBADF);
-
if (f.file->f_op != &bpf_prog_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- prog = f.file->private_data;
-
- return prog;
+ return f.file->private_data;
}
/* called by sockets/tracing/seccomp before attaching program to an event
@@ -495,13 +574,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
struct fd f = fdget(ufd);
struct bpf_prog *prog;
- prog = get_prog(f);
-
+ prog = __bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
atomic_inc(&prog->aux->refcnt);
fdput(f);
+
return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get);
@@ -536,11 +615,18 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_nouncharge;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -549,10 +635,10 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;
prog->orig_prog = NULL;
- prog->jited = false;
+ prog->jited = 0;
atomic_set(&prog->aux->refcnt, 1);
- prog->gpl_compatible = is_gpl;
+ prog->gpl_compatible = is_gpl ? 1 : 0;
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
@@ -572,7 +658,7 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+ err = bpf_prog_new_fd(prog);
if (err < 0)
/* failed to allocate fd */
goto free_used_maps;
@@ -582,20 +668,36 @@ static int bpf_prog_load(union bpf_attr *attr)
free_used_maps:
free_used_maps(prog->aux);
free_prog:
+ bpf_prog_uncharge_memlock(prog);
+free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
+#define BPF_OBJ_LAST_FIELD bpf_fd
+
+/*
+ * BPF_OBJ_PIN command: hand attr->bpf_fd and the user-space pathname to
+ * bpf_obj_pin_user().  Returns -EINVAL when CHECK_ATTR(BPF_OBJ) rejects
+ * @attr, otherwise whatever bpf_obj_pin_user() returns.
+ */
+static int bpf_obj_pin(const union bpf_attr *attr)
+{
+	return CHECK_ATTR(BPF_OBJ) ?
+		-EINVAL :
+		bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
+}
+
+/*
+ * BPF_OBJ_GET command: look up the object at the user-space pathname via
+ * bpf_obj_get_user().  Returns -EINVAL when CHECK_ATTR(BPF_OBJ) rejects
+ * @attr or when attr->bpf_fd is non-zero.
+ */
+static int bpf_obj_get(const union bpf_attr *attr)
+{
+	if (CHECK_ATTR(BPF_OBJ))
+		return -EINVAL;
+
+	/* bpf_fd is not an input of this command and must be left zero */
+	if (attr->bpf_fd != 0)
+		return -EINVAL;
+
+	return bpf_obj_get_user(u64_to_ptr(attr->pathname));
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
int err;
- /* the syscall is limited to root temporarily. This restriction will be
- * lifted when security audit is clean. Note that eBPF+tracing must have
- * this restriction, since it may pass kernel data to user space
- */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
return -EPERM;
if (!access_ok(VERIFY_READ, uattr, 1))
@@ -650,6 +752,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_LOAD:
err = bpf_prog_load(&attr);
break;
+ case BPF_OBJ_PIN:
+ err = bpf_obj_pin(&attr);
+ break;
+ case BPF_OBJ_GET:
+ err = bpf_obj_get(&attr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866fd36a..c6073056badf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
struct verifier_state_list **explored_states; /* search pruning optimization */
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
u32 used_map_cnt; /* number of used maps */
+ bool allow_ptr_leaks;
};
/* verbose verifier prints what it's seeing
@@ -213,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock);
* verbose() is used to dump the verification trace to the log, so the user
* can figure out what's wrong with the program
*/
-static void verbose(const char *fmt, ...)
+static __printf(1, 2) void verbose(const char *fmt, ...)
{
va_list args;
@@ -238,6 +239,15 @@ static const char * const reg_type_str[] = {
[CONST_IMM] = "imm",
};
+/* (map type, helper id) pairs consulted by check_map_func_compatibility() */
+static const struct {
+	int map_type;
+	int func_id;
+} func_limit[] = {
+	{
+		.map_type = BPF_MAP_TYPE_PROG_ARRAY,
+		.func_id  = BPF_FUNC_tail_call,
+	},
+	{
+		.map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+		.func_id  = BPF_FUNC_perf_event_read,
+	},
+	{
+		.map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+		.func_id  = BPF_FUNC_perf_event_output,
+	},
+};
+
static void print_verifier_state(struct verifier_env *env)
{
enum bpf_reg_type t;
@@ -275,7 +285,7 @@ static const char *const bpf_class_string[] = {
[BPF_ALU64] = "alu64",
};
-static const char *const bpf_alu_string[] = {
+static const char *const bpf_alu_string[16] = {
[BPF_ADD >> 4] = "+=",
[BPF_SUB >> 4] = "-=",
[BPF_MUL >> 4] = "*=",
@@ -299,7 +309,7 @@ static const char *const bpf_ldst_string[] = {
[BPF_DW >> 3] = "u64",
};
-static const char *const bpf_jmp_string[] = {
+static const char *const bpf_jmp_string[16] = {
[BPF_JA >> 4] = "jmp",
[BPF_JEQ >> 4] = "==",
[BPF_JGT >> 4] = ">",
@@ -530,6 +540,21 @@ static int bpf_size_to_bytes(int bpf_size)
return -EINVAL;
}
+/*
+ * Tell whether a register of @type is one of the pointer types that
+ * check_stack_write() treats as a register spill.
+ */
+static bool is_spillable_regtype(enum bpf_reg_type type)
+{
+	return type == PTR_TO_MAP_VALUE ||
+	       type == PTR_TO_MAP_VALUE_OR_NULL ||
+	       type == PTR_TO_STACK ||
+	       type == PTR_TO_CTX ||
+	       type == FRAME_PTR ||
+	       type == CONST_PTR_TO_MAP;
+}
+
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -542,9 +567,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
*/
if (value_regno >= 0 &&
- (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
- state->regs[value_regno].type == PTR_TO_STACK ||
- state->regs[value_regno].type == PTR_TO_CTX)) {
+ is_spillable_regtype(state->regs[value_regno].type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -635,6 +658,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
return -EACCES;
}
+/*
+ * Tell whether register @regno of the current state must be treated as
+ * holding a pointer.  Always false when env->allow_ptr_leaks is set;
+ * otherwise every register type except UNKNOWN_VALUE and CONST_IMM counts
+ * as a pointer.
+ */
+static bool is_pointer_value(struct verifier_env *env, int regno)
+{
+	const struct reg_state *reg;
+
+	if (env->allow_ptr_leaks)
+		return false;
+
+	reg = &env->cur_state.regs[regno];
+	return reg->type != UNKNOWN_VALUE && reg->type != CONST_IMM;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -648,6 +685,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
struct verifier_state *state = &env->cur_state;
int size, err = 0;
+ if (state->regs[regno].type == PTR_TO_STACK)
+ off += state->regs[regno].imm;
+
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
@@ -658,24 +698,42 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
}
if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into map\n", value_regno);
+ return -EACCES;
+ }
err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
} else if (state->regs[regno].type == PTR_TO_CTX) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into ctx\n", value_regno);
+ return -EACCES;
+ }
err = check_ctx_access(env, off, size, t);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
- } else if (state->regs[regno].type == FRAME_PTR) {
+ } else if (state->regs[regno].type == FRAME_PTR ||
+ state->regs[regno].type == PTR_TO_STACK) {
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
}
- if (t == BPF_WRITE)
+ if (t == BPF_WRITE) {
+ if (!env->allow_ptr_leaks &&
+ state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
+ size != BPF_REG_SIZE) {
+ verbose("attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
err = check_stack_write(state, off, size, value_regno);
- else
+ } else {
err = check_stack_read(state, off, size, value_regno);
+ }
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[state->regs[regno].type]);
@@ -763,8 +821,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return -EACCES;
}
- if (arg_type == ARG_ANYTHING)
+ if (arg_type == ARG_ANYTHING) {
+ if (is_pointer_value(env, regno)) {
+ verbose("R%d leaks addr into helper function\n", regno);
+ return -EACCES;
+ }
return 0;
+ }
if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -833,6 +896,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return err;
}
+/*
+ * Verify that @map may be used together with helper @func_id.
+ *
+ * func_limit[] lists the only accepted (map type, helper) pairings for the
+ * special map types and special helpers it mentions.  The check has to hold
+ * in both directions:
+ *
+ *  - a helper listed in the table may only be given a map type it is paired
+ *    with, and
+ *  - a map type listed in the table may only be passed to a helper it is
+ *    paired with.
+ *
+ * Testing only the helper side would let a listed map type (e.g. a prog
+ * array) be handed to any unlisted helper.
+ *
+ * Map types and helpers that do not appear in the table are unrestricted.
+ *
+ * Returns 0 if the combination is allowed or @map is NULL, -EINVAL otherwise.
+ */
+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+	bool map_limited = false, func_limited = false;
+	int i;
+
+	if (!map)
+		return 0;
+
+	for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
+		bool bool_map = (map->map_type == func_limit[i].map_type);
+		bool bool_func = (func_id == func_limit[i].func_id);
+
+		/* an exact map & func pair is always acceptable */
+		if (bool_map && bool_func)
+			return 0;
+
+		map_limited |= bool_map;
+		func_limited |= bool_func;
+	}
+
+	/*
+	 * No exact pair was found.  If either side is restricted by the
+	 * table, the combination is not one of the permitted ones.
+	 */
+	if (map_limited || func_limited)
+		return -EINVAL;
+
+	return 0;
+}
+
static int check_call(struct verifier_env *env, int func_id)
{
struct verifier_state *state = &env->cur_state;
@@ -908,28 +993,17 @@ static int check_call(struct verifier_env *env, int func_id)
return -EINVAL;
}
- if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY &&
- func_id != BPF_FUNC_tail_call)
- /* prog_array map type needs extra care:
- * only allow to pass it into bpf_tail_call() for now.
- * bpf_map_delete_elem() can be allowed in the future,
- * while bpf_map_update_elem() must only be done via syscall
- */
- return -EINVAL;
-
- if (func_id == BPF_FUNC_tail_call &&
- map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
- /* don't allow any other map type to be passed into
- * bpf_tail_call()
- */
- return -EINVAL;
+ err = check_map_func_compatibility(map, func_id);
+ if (err)
+ return err;
return 0;
}
/* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
{
+ struct reg_state *regs = env->cur_state.regs;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -954,6 +1028,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check dest operand */
err = check_reg_arg(regs, insn->dst_reg, DST_OP);
if (err)
@@ -990,6 +1070,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
*/
regs[insn->dst_reg] = regs[insn->src_reg];
} else {
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d partial copy of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ }
regs[insn->dst_reg].type = UNKNOWN_VALUE;
regs[insn->dst_reg].map_ptr = NULL;
}
@@ -1039,8 +1124,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
regs[insn->dst_reg].type == FRAME_PTR &&
- BPF_SRC(insn->code) == BPF_K)
+ BPF_SRC(insn->code) == BPF_K) {
stack_relative = true;
+ } else if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ } else if (BPF_SRC(insn->code) == BPF_X &&
+ is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->src_reg);
+ return -EACCES;
+ }
/* check dest operand */
err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1079,6 +1174,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
return err;
+
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d pointer comparison prohibited\n",
+ insn->src_reg);
+ return -EACCES;
+ }
} else {
if (insn->src_reg != BPF_REG_0) {
verbose("BPF_JMP uses reserved fields\n");
@@ -1133,6 +1234,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = 0;
}
+ } else if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ return -EACCES;
} else if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE)) {
@@ -1636,7 +1740,7 @@ static int do_check(struct verifier_env *env)
}
if (class == BPF_ALU || class == BPF_ALU64) {
- err = check_alu_op(regs, insn);
+ err = check_alu_op(env, insn);
if (err)
return err;
@@ -1794,6 +1898,11 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose("R0 leaks addr as return value\n");
+ return -EACCES;
+ }
+
process_bpf_exit:
insn_idx = pop_stack(env, &prev_insn_idx);
if (insn_idx < 0) {
@@ -1880,8 +1989,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
}
f = fdget(insn->imm);
-
- map = bpf_map_get(f);
+ map = __bpf_map_get(f);
if (IS_ERR(map)) {
verbose("fd %d is not pointing to valid bpf_map\n",
insn->imm);
@@ -2002,7 +2110,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
cnt = env->prog->aux->ops->
convert_ctx_access(type, insn->dst_reg, insn->src_reg,
- insn->off, insn_buf);
+ insn->off, insn_buf, env->prog);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
@@ -2122,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;
+ env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
ret = do_check(env);
skip_full_check:
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..f1603c153890 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,7 +45,6 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/rwsem.h>
#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
@@ -76,7 +75,7 @@
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
- * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * css_set_lock protects task->cgroups pointer, the list of css_set
* objects, and the chain of tasks off each css_set.
*
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
@@ -84,12 +83,12 @@
*/
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
-DECLARE_RWSEM(css_set_rwsem);
+DEFINE_SPINLOCK(css_set_lock);
EXPORT_SYMBOL_GPL(cgroup_mutex);
-EXPORT_SYMBOL_GPL(css_set_rwsem);
+EXPORT_SYMBOL_GPL(css_set_lock);
#else
static DEFINE_MUTEX(cgroup_mutex);
-static DECLARE_RWSEM(css_set_rwsem);
+static DEFINE_SPINLOCK(css_set_lock);
#endif
/*
@@ -107,8 +106,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock);
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
#define cgroup_assert_mutex_or_rcu_locked() \
- rcu_lockdep_assert(rcu_read_lock_held() || \
- lockdep_is_held(&cgroup_mutex), \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
+ !lockdep_is_held(&cgroup_mutex), \
"cgroup_mutex or RCU read lock required");
/*
@@ -139,12 +138,34 @@ static const char *cgroup_subsys_name[] = {
};
#undef SUBSYS
+/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
+#define SUBSYS(_x) \
+ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
+ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
+ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
+ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
+static struct static_key_true *cgroup_subsys_enabled_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
+static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
/*
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
*/
struct cgroup_root cgrp_dfl_root;
+EXPORT_SYMBOL_GPL(cgrp_dfl_root);
/*
* The default hierarchy always exists but is hidden until mounted for the
@@ -152,12 +173,6 @@ struct cgroup_root cgrp_dfl_root;
*/
static bool cgrp_dfl_root_visible;
-/*
- * Set by the boot param of the same name and makes subsystems with NULL
- * ->dfl_files to use ->legacy_files on the default hierarchy.
- */
-static bool cgroup_legacy_files_on_dfl;
-
/* some controllers are not supported in the default hierarchy */
static unsigned long cgrp_dfl_root_inhibit_ss_mask;
@@ -185,20 +200,97 @@ static u64 css_serial_nr_next = 1;
*/
static unsigned long have_fork_callback __read_mostly;
static unsigned long have_exit_callback __read_mostly;
+static unsigned long have_free_callback __read_mostly;
+
+/* Ditto for the can_fork callback. */
+static unsigned long have_canfork_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned long ss_mask);
+static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
bool visible);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+/**
+ * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
+ * @ssid: subsys ID of interest
+ *
+ * cgroup_subsys_enabled() can only be used with literal subsys names, which
+ * is fine for individual subsystems but unsuitable for cgroup core.  This is
+ * a slower, static_key_enabled() based test which looks the key up in
+ * cgroup_subsys_enabled_key[] by @ssid.
+ */
+static bool cgroup_ssid_enabled(int ssid)
+{
+	struct static_key_true *key = cgroup_subsys_enabled_key[ssid];
+
+	return static_key_enabled(key);
+}
+
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differently depending on the
+ * interface version.
+ *
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ * and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - Remount is disallowed.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed. Everything should be at process granularity. Use
+ * "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted. pids will be unique unless they got
+ * recycled in between reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed. Replacement
+ * notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
+ * and its descendants contain no task; otherwise, 1. The file also
+ * generates kernfs notification which can be monitored through poll and
+ * [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ * take masks of ancestors with non-empty cpus/mems, instead of being
+ * moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ * masks of ancestors.
+ *
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ * is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
+ */
+static bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+	const struct cgroup_root *root = cgrp->root;
+
+	/* on the default hierarchy iff the root is cgrp_dfl_root itself */
+	return root == &cgrp_dfl_root;
+}
+
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
@@ -207,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
idr_preload(gfp_mask);
spin_lock_bh(&cgroup_idr_lock);
- ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
spin_unlock_bh(&cgroup_idr_lock);
idr_preload_end();
return ret;
@@ -331,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
return !(cgrp->self.flags & CSS_ONLINE);
}
+/* css_get() on @cgrp's self css; warns once if @cgrp is already dead */
+static void cgroup_get(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *self = &cgrp->self;
+
+	WARN_ON_ONCE(cgroup_is_dead(cgrp));
+	css_get(self);
+}
+
+/* css_tryget() on @cgrp's self css; returns what css_tryget() reports */
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *self = &cgrp->self;
+
+	return css_tryget(self);
+}
+
+/* css_put() on @cgrp's self css */
+static void cgroup_put(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *self = &cgrp->self;
+
+	css_put(self);
+}
+
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -480,19 +588,31 @@ struct css_set init_css_set = {
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
};
static int css_set_count = 1; /* 1 for init_css_set */
/**
+ * css_set_populated - does a css_set contain any tasks?
+ * @cset: target css_set
+ */
+static bool css_set_populated(struct css_set *cset)
+{
+	lockdep_assert_held(&css_set_lock);
+
+	/* populated if either the regular or the migration task list has entries */
+	if (!list_empty(&cset->tasks))
+		return true;
+
+	return !list_empty(&cset->mg_tasks);
+}
+
+/**
* cgroup_update_populated - updated populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
- * @cgrp is either getting the first task (css_set) or losing the last.
- * Update @cgrp->populated_cnt accordingly. The count is propagated
- * towards root so that a given cgroup's populated_cnt is zero iff the
- * cgroup and all its descendants are empty.
+ * One of the css_sets associated with @cgrp is either getting its first
+ * task or losing the last. Update @cgrp->populated_cnt accordingly. The
+ * count is propagated towards root so that a given cgroup's populated_cnt
+ * is zero iff the cgroup and all its descendants don't contain any tasks.
*
* @cgrp's interface file "cgroup.populated" is zero if
* @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
@@ -502,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
- lockdep_assert_held(&css_set_rwsem);
+ lockdep_assert_held(&css_set_lock);
do {
bool trigger;
@@ -515,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
if (!trigger)
break;
- if (cgrp->populated_kn)
- kernfs_notify(cgrp->populated_kn);
+ check_for_release(cgrp);
+ cgroup_file_notify(&cgrp->events_file);
+
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
+/**
+ * css_set_update_populated - update populated state of a css_set
+ * @cset: target css_set
+ * @populated: whether @cset is populated or depopulated
+ *
+ * Called when @cset gets its first task or loses its last one.  Walks
+ * @cset->cgrp_links and forwards @populated to cgroup_update_populated()
+ * for the cgroup of every link.  Requires css_set_lock to be held.
+ */
+static void css_set_update_populated(struct css_set *cset, bool populated)
+{
+	struct cgrp_cset_link *lnk;
+
+	lockdep_assert_held(&css_set_lock);
+
+	list_for_each_entry(lnk, &cset->cgrp_links, cgrp_link) {
+		struct cgroup *cgrp = lnk->cgrp;
+
+		cgroup_update_populated(cgrp, populated);
+	}
+}
+
+/**
+ * css_set_move_task - move a task from one css_set to another
+ * @task: task being moved
+ * @from_cset: css_set @task currently belongs to (may be NULL)
+ * @to_cset: new css_set @task is being moved to (may be NULL)
+ * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ *
+ * Move @task from @from_cset to @to_cset. If @task didn't belong to any
+ * css_set, @from_cset can be NULL. If @task is being disassociated
+ * instead of moved, @to_cset can be NULL.
+ *
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
+ *
+ * Must be called with css_set_lock held. Warns once if @task's cg_list
+ * linkage disagrees with @from_cset (unlinked although @from_cset is
+ * given, or still linked although it is NULL), or if @task has
+ * PF_EXITING set while @to_cset is given.
+ */
+static void css_set_move_task(struct task_struct *task,
+ struct css_set *from_cset, struct css_set *to_cset,
+ bool use_mg_tasks)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ if (from_cset) {
+ struct css_task_iter *it, *pos;
+
+ WARN_ON_ONCE(list_empty(&task->cg_list));
+
+ /*
+ * @task is leaving, advance task iterators which are
+ * pointing to it so that they can resume at the next
+ * position. Advancing an iterator might remove it from
+ * the list, use safe walk. See css_task_iter_advance*()
+ * for details.
+ *
+ * This has to happen before @task is unlinked below.
+ */
+ list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+ iters_node)
+ if (it->task_pos == &task->cg_list)
+ css_task_iter_advance(it);
+
+ list_del_init(&task->cg_list);
+ if (!css_set_populated(from_cset)) /* @task was the last one */
+ css_set_update_populated(from_cset, false);
+ } else {
+ WARN_ON_ONCE(!list_empty(&task->cg_list));
+ }
+
+ if (to_cset) {
+ /*
+ * We are synchronized through cgroup_threadgroup_rwsem
+ * against PF_EXITING setting such that we can't race
+ * against cgroup_exit() changing the css_set to
+ * init_css_set and dropping the old one.
+ */
+ WARN_ON_ONCE(task->flags & PF_EXITING);
+
+ if (!css_set_populated(to_cset)) /* @task will be the first one */
+ css_set_update_populated(to_cset, true);
+ rcu_assign_pointer(task->cgroups, to_cset);
+ list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
+ &to_cset->tasks);
+ }
+}
+
/*
* hash table for cgroup groups. This improves the performance to find
* an existing css_set. This hash doesn't (currently) take into
@@ -548,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset)
struct cgroup_subsys *ss;
int ssid;
- lockdep_assert_held(&css_set_rwsem);
+ lockdep_assert_held(&css_set_lock);
if (!atomic_dec_and_test(&cset->refcount))
return;
@@ -560,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset)
css_set_count--;
list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *cgrp = link->cgrp;
-
list_del(&link->cset_link);
list_del(&link->cgrp_link);
-
- /* @cgrp can't go away while we're holding css_set_rwsem */
- if (list_empty(&cgrp->cset_links)) {
- cgroup_update_populated(cgrp, false);
- check_for_release(cgrp);
- }
-
+ if (cgroup_parent(link->cgrp))
+ cgroup_put(link->cgrp);
kfree(link);
}
@@ -587,9 +781,9 @@ static void put_css_set(struct css_set *cset)
if (atomic_add_unless(&cset->refcount, -1, 1))
return;
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
put_css_set_locked(cset);
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
}
/*
@@ -778,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
link->cset = cset;
link->cgrp = cgrp;
- if (list_empty(&cgrp->cset_links))
- cgroup_update_populated(cgrp, true);
- list_move(&link->cset_link, &cgrp->cset_links);
-
/*
- * Always add links to the tail of the list so that the list
- * is sorted by order of hierarchy creation
+ * Always add links to the tail of the lists so that the lists are
+ * in chronological order.
*/
+ list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
+
+ if (cgroup_parent(cgrp))
+ cgroup_get(cgrp);
}
/**
@@ -812,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
/* First see if we already have a cgroup group that matches
* the desired set */
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
if (cset)
return cset;
@@ -837,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->mg_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
+ INIT_LIST_HEAD(&cset->task_iters);
INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -865,7 +1060,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
list_add_tail(&cset->e_cset_node[ssid],
&cset->subsys[ssid]->cgroup->e_csets[ssid]);
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
return cset;
}
@@ -929,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root)
* Release all the links from cset_links to this hierarchy's
* root cgroup
*/
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
kfree(link);
}
- up_write(&css_set_rwsem);
+
+ spin_unlock_bh(&css_set_lock);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
@@ -958,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup *res = NULL;
lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&css_set_rwsem);
+ lockdep_assert_held(&css_set_lock);
if (cset == &init_css_set) {
res = &root->cgrp;
@@ -981,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
/*
* Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_rwsem held.
+ * called with cgroup_mutex and css_set_lock held.
*/
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
@@ -1020,17 +1216,19 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
{
+ struct cgroup_subsys *ss = cft->ss;
+
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cft->ss->name, cft->name);
+ cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ cft->name);
else
strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
@@ -1040,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
+ * S_IRUGO for read, S_IWUSR for write.
*/
static umode_t cgroup_file_mode(const struct cftype *cft)
{
umode_t mode = 0;
- if (cft->mode)
- return cft->mode;
-
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
- if (cft->write_u64 || cft->write_s64 || cft->write)
- mode |= S_IWUSR;
+ if (cft->write_u64 || cft->write_s64 || cft->write) {
+ if (cft->flags & CFTYPE_WORLD_WRITABLE)
+ mode |= S_IWUGO;
+ else
+ mode |= S_IWUSR;
+ }
return mode;
}
-static void cgroup_get(struct cgroup *cgrp)
-{
- WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
-}
-
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
/**
* cgroup_calc_child_subsys_mask - calculate child_subsys_mask
* @cgrp: the target cgroup
@@ -1217,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
}
/**
- * cgroup_clear_dir - remove subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be removed
+ * css_clear_dir - remove subsys files in a cgroup directory
+ * @css: target css
+ * @cgrp_override: specify if target cgroup is different from css->cgroup
*/
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static void css_clear_dir(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp_override)
{
- struct cgroup_subsys *ss;
- int i;
+ struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cftype *cfts;
- for_each_subsys(ss, i) {
- struct cftype *cfts;
+ list_for_each_entry(cfts, &css->ss->cfts, node)
+ cgroup_addrm_files(css, cgrp, cfts, false);
+}
- if (!(subsys_mask & (1 << i)))
- continue;
- list_for_each_entry(cfts, &ss->cfts, node)
- cgroup_addrm_files(cgrp, cfts, false);
+/**
+ * css_populate_dir - create subsys files in a cgroup directory
+ * @css: target css
+ * @cgrp_override: specify if target cgroup is different from css->cgroup
+ *
+ * On failure, no file is added.
+ */
+static int css_populate_dir(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp_override)
+{
+ struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cftype *cfts, *failed_cfts;
+ int ret;
+
+ if (!css->ss) {
+ if (cgroup_on_dfl(cgrp))
+ cfts = cgroup_dfl_base_files;
+ else
+ cfts = cgroup_legacy_base_files;
+
+ return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
+ }
+
+ list_for_each_entry(cfts, &css->ss->cfts, node) {
+ ret = cgroup_addrm_files(css, cgrp, cfts, true);
+ if (ret < 0) {
+ failed_cfts = cfts;
+ goto err;
+ }
}
+ return 0;
+err:
+ list_for_each_entry(cfts, &css->ss->cfts, node) {
+ if (cfts == failed_cfts)
+ break;
+ cgroup_addrm_files(css, cgrp, cfts, false);
+ }
+ return ret;
}
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned long ss_mask)
{
+ struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
unsigned long tmp_ss_mask;
int ssid, i, ret;
@@ -1260,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
if (dst_root == &cgrp_dfl_root)
tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
- ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
- if (ret) {
- if (dst_root != &cgrp_dfl_root)
- return ret;
+ for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+ struct cgroup *scgrp = &ss->root->cgrp;
+ int tssid;
+
+ ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
+ if (!ret)
+ continue;
/*
* Rebinding back to the default root is not allowed to
@@ -1271,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
* be rare. Moving subsystems back and forth even more so.
* Just warn about it and continue.
*/
- if (cgrp_dfl_root_visible) {
- pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
- ret, ss_mask);
- pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+ if (dst_root == &cgrp_dfl_root) {
+ if (cgrp_dfl_root_visible) {
+ pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
+ ret, ss_mask);
+ pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+ }
+ continue;
+ }
+
+ for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
+ if (tssid == ssid)
+ break;
+ css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
}
+ return ret;
}
/*
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
- for_each_subsys_which(ss, ssid, &ss_mask)
- cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
-
for_each_subsys_which(ss, ssid, &ss_mask) {
- struct cgroup_root *src_root;
- struct cgroup_subsys_state *css;
+ struct cgroup_root *src_root = ss->root;
+ struct cgroup *scgrp = &src_root->cgrp;
+ struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
struct css_set *cset;
- src_root = ss->root;
- css = cgroup_css(&src_root->cgrp, ss);
+ WARN_ON(!css || cgroup_css(dcgrp, ss));
- WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
+ css_clear_dir(css, NULL);
- RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
- rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
+ rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
- css->cgroup = &dst_root->cgrp;
+ css->cgroup = dcgrp;
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist)
list_move_tail(&cset->e_cset_node[ss->id],
- &dst_root->cgrp.e_csets[ss->id]);
- up_write(&css_set_rwsem);
+ &dcgrp->e_csets[ss->id]);
+ spin_unlock_bh(&css_set_lock);
src_root->subsys_mask &= ~(1 << ssid);
- src_root->cgrp.subtree_control &= ~(1 << ssid);
- cgroup_refresh_child_subsys_mask(&src_root->cgrp);
+ scgrp->subtree_control &= ~(1 << ssid);
+ cgroup_refresh_child_subsys_mask(scgrp);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
- if (dst_root != &cgrp_dfl_root) {
- dst_root->cgrp.subtree_control |= 1 << ssid;
- cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+ if (dst_root == &cgrp_dfl_root) {
+ static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
+ } else {
+ dcgrp->subtree_control |= 1 << ssid;
+ cgroup_refresh_child_subsys_mask(dcgrp);
+ static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
}
if (ss->bind)
ss->bind(css);
}
- kernfs_activate(dst_root->cgrp.kn);
+ kernfs_activate(dcgrp->kn);
return 0;
}
@@ -1332,9 +1561,10 @@ static int cgroup_show_options(struct seq_file *seq,
struct cgroup_subsys *ss;
int ssid;
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(seq, ",%s", ss->name);
+ if (root != &cgrp_dfl_root)
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_show_option(seq, ss->legacy_name, NULL);
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1342,13 +1572,14 @@ static int cgroup_show_options(struct seq_file *seq,
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
- seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ seq_show_option(seq, "release_agent",
+ root->release_agent_path);
spin_unlock(&release_agent_path_lock);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
- seq_printf(seq, ",name=%s", root->name);
+ seq_show_option(seq, "name", root->name);
return 0;
}
@@ -1447,9 +1678,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
}
for_each_subsys(ss, i) {
- if (strcmp(token, ss->name))
+ if (strcmp(token, ss->legacy_name))
continue;
- if (ss->disabled)
+ if (!cgroup_ssid_enabled(i))
continue;
/* Mutually exclusive option 'all' + subsystem name */
@@ -1480,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
*/
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
- if (!ss->disabled)
+ if (cgroup_ssid_enabled(i))
opts->subsys_mask |= (1 << i);
/*
@@ -1576,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
if (use_task_css_set_links)
goto out_unlock;
@@ -1606,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void)
if (!(p->flags & PF_EXITING)) {
struct css_set *cset = task_css_set(p);
- list_add(&p->cg_list, &cset->tasks);
+ if (!css_set_populated(cset))
+ css_set_update_populated(cset, true);
+ list_add_tail(&p->cg_list, &cset->tasks);
get_css_set(cset);
}
spin_unlock_irq(&p->sighand->siglock);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
out_unlock:
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1623,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
+ INIT_LIST_HEAD(&cgrp->self.files);
INIT_LIST_HEAD(&cgrp->cset_links);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
@@ -1660,13 +1894,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
- struct cftype *base_files;
struct css_set *cset;
int i, ret;
lockdep_assert_held(&cgroup_mutex);
- ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+ ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
if (ret < 0)
goto out;
root_cgrp->id = ret;
@@ -1677,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
goto out;
/*
- * We're accessing css_set_count without locking css_set_rwsem here,
+ * We're accessing css_set_count without locking css_set_lock here,
* but that's OK - it can only be increased by someone holding
* cgroup_lock, and that's us. The worst that can happen is that we
* have some link structures left over
@@ -1699,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
}
root_cgrp->kn = root->kf_root->kn;
- if (root == &cgrp_dfl_root)
- base_files = cgroup_dfl_base_files;
- else
- base_files = cgroup_legacy_base_files;
-
- ret = cgroup_addrm_files(root_cgrp, base_files, true);
+ ret = css_populate_dir(&root_cgrp->self, NULL);
if (ret)
goto destroy_root;
@@ -1724,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
* Link the root cgroup in this hierarchy into all the css_set
* objects.
*/
- down_write(&css_set_rwsem);
- hash_for_each(css_set_table, i, cset, hlist)
+ spin_lock_bh(&css_set_lock);
+ hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp);
- up_write(&css_set_rwsem);
+ if (css_set_populated(cset))
+ cgroup_update_populated(root_cgrp, true);
+ }
+ spin_unlock_bh(&css_set_lock);
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -1960,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
char *path = NULL;
mutex_lock(&cgroup_mutex);
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
@@ -1973,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
path = buf;
}
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return path;
}
@@ -2001,6 +2232,49 @@ struct cgroup_taskset {
struct task_struct *cur_task;
};
+#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
+ .src_csets = LIST_HEAD_INIT(tset.src_csets), \
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
+ .csets = &tset.src_csets, \
+}
+
+/**
+ * cgroup_taskset_add - try to add a migration target task to a taskset
+ * @task: target task
+ * @tset: target taskset
+ *
+ * Add @task, which is a migration target, to @tset. This function becomes
+ * noop if @task doesn't need to be migrated. @task's css_set should have
+ * been added as a migration source and @task->cg_list will be moved from
+ * the css_set's tasks list to mg_tasks one.
+ */
+static void cgroup_taskset_add(struct task_struct *task,
+ struct cgroup_taskset *tset)
+{
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
+ return;
+
+ /* leave @task alone if post_fork() hasn't linked it yet */
+ if (list_empty(&task->cg_list))
+ return;
+
+ cset = task_css_set(task);
+ if (!cset->mg_src_cgrp)
+ return;
+
+ list_move_tail(&task->cg_list, &cset->mg_tasks);
+ if (list_empty(&cset->mg_node))
+ list_add_tail(&cset->mg_node, &tset->src_csets);
+ if (list_empty(&cset->mg_dst_cset->mg_node))
+ list_move_tail(&cset->mg_dst_cset->mg_node,
+ &tset->dst_csets);
+}
+
/**
* cgroup_taskset_first - reset taskset and return the first task
* @tset: taskset of interest
@@ -2048,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
}
/**
- * cgroup_task_migrate - move a task from one cgroup to another.
- * @old_cgrp: the cgroup @tsk is being migrated from
- * @tsk: the task being migrated
- * @new_cset: the new css_set @tsk is being attached to
+ * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * @tset: target taskset
+ * @dst_cgrp: destination cgroup
*
- * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
+ * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
+ * ->can_attach callbacks fails and guarantees that either all or none of
+ * the tasks in @tset are migrated. @tset is consumed regardless of
+ * success.
*/
-static void cgroup_task_migrate(struct cgroup *old_cgrp,
- struct task_struct *tsk,
- struct css_set *new_cset)
+static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
+ struct cgroup *dst_cgrp)
{
- struct css_set *old_cset;
-
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&css_set_rwsem);
+ struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct task_struct *task, *tmp_task;
+ struct css_set *cset, *tmp_cset;
+ int i, ret;
- /*
- * We are synchronized through cgroup_threadgroup_rwsem against
- * PF_EXITING setting such that we can't race against cgroup_exit()
- * changing the css_set to init_css_set and dropping the old one.
- */
- WARN_ON_ONCE(tsk->flags & PF_EXITING);
- old_cset = task_css_set(tsk);
+ /* methods shouldn't be called if no task is actually migrating */
+ if (list_empty(&tset->src_csets))
+ return 0;
- get_css_set(new_cset);
- rcu_assign_pointer(tsk->cgroups, new_cset);
+ /* check that we can legitimately attach to the cgroup */
+ for_each_e_css(css, i, dst_cgrp) {
+ if (css->ss->can_attach) {
+ ret = css->ss->can_attach(css, tset);
+ if (ret) {
+ failed_css = css;
+ goto out_cancel_attach;
+ }
+ }
+ }
/*
- * Use move_tail so that cgroup_taskset_first() still returns the
- * leader after migration. This works because cgroup_migrate()
- * ensures that the dst_cset of the leader is the first on the
- * tset's dst_csets list.
+ * Now that we're guaranteed success, proceed to move all tasks to
+ * the new cgroup. There are no failure cases after here, so this
+ * is the commit point.
*/
- list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
+ spin_lock_bh(&css_set_lock);
+ list_for_each_entry(cset, &tset->src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
+ struct css_set *from_cset = task_css_set(task);
+ struct css_set *to_cset = cset->mg_dst_cset;
+
+ get_css_set(to_cset);
+ css_set_move_task(task, from_cset, to_cset, true);
+ put_css_set_locked(from_cset);
+ }
+ }
+ spin_unlock_bh(&css_set_lock);
/*
- * We just gained a reference on old_cset by taking it from the
- * task. As trading it for new_cset is protected by cgroup_mutex,
- * we're safe to drop it here; it will be freed under RCU.
+ * Migration is committed, all target tasks are now on dst_csets.
+ * Nothing is sensitive to fork() after this point. Notify
+ * controllers that migration is complete.
*/
- put_css_set_locked(old_cset);
+ tset->csets = &tset->dst_csets;
+
+ for_each_e_css(css, i, dst_cgrp)
+ if (css->ss->attach)
+ css->ss->attach(css, tset);
+
+ ret = 0;
+ goto out_release_tset;
+
+out_cancel_attach:
+ for_each_e_css(css, i, dst_cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, tset);
+ }
+out_release_tset:
+ spin_lock_bh(&css_set_lock);
+ list_splice_init(&tset->dst_csets, &tset->src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
+ list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+ list_del_init(&cset->mg_node);
+ }
+ spin_unlock_bh(&css_set_lock);
+ return ret;
}
/**
@@ -2104,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
lockdep_assert_held(&cgroup_mutex);
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
}
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
}
/**
@@ -2137,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
struct cgroup *src_cgrp;
lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&css_set_rwsem);
+ lockdep_assert_held(&css_set_lock);
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
@@ -2226,9 +2539,9 @@ err:
/**
* cgroup_migrate - migrate a process or task to a cgroup
- * @cgrp: the destination cgroup
* @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task
+ * @cgrp: the destination cgroup
*
* Migrate a process or task denoted by @leader to @cgrp. If migrating a
* process, the caller must be holding cgroup_threadgroup_rwsem. The
@@ -2242,115 +2555,29 @@ err:
* decided for all targets by invoking group_migrate_prepare_dst() before
* actually starting migrating.
*/
-static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
- bool threadgroup)
-{
- struct cgroup_taskset tset = {
- .src_csets = LIST_HEAD_INIT(tset.src_csets),
- .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
- .csets = &tset.src_csets,
- };
- struct cgroup_subsys_state *css, *failed_css = NULL;
- struct css_set *cset, *tmp_cset;
- struct task_struct *task, *tmp_task;
- int i, ret;
+static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup *cgrp)
+{
+ struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+ struct task_struct *task;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
rcu_read_lock();
task = leader;
do {
- /* @task either already exited or can't exit until the end */
- if (task->flags & PF_EXITING)
- goto next;
-
- /* leave @task alone if post_fork() hasn't linked it yet */
- if (list_empty(&task->cg_list))
- goto next;
-
- cset = task_css_set(task);
- if (!cset->mg_src_cgrp)
- goto next;
-
- /*
- * cgroup_taskset_first() must always return the leader.
- * Take care to avoid disturbing the ordering.
- */
- list_move_tail(&task->cg_list, &cset->mg_tasks);
- if (list_empty(&cset->mg_node))
- list_add_tail(&cset->mg_node, &tset.src_csets);
- if (list_empty(&cset->mg_dst_cset->mg_node))
- list_move_tail(&cset->mg_dst_cset->mg_node,
- &tset.dst_csets);
- next:
+ cgroup_taskset_add(task, &tset);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- up_write(&css_set_rwsem);
-
- /* methods shouldn't be called if no task is actually migrating */
- if (list_empty(&tset.src_csets))
- return 0;
-
- /* check that we can legitimately attach to the cgroup */
- for_each_e_css(css, i, cgrp) {
- if (css->ss->can_attach) {
- ret = css->ss->can_attach(css, &tset);
- if (ret) {
- failed_css = css;
- goto out_cancel_attach;
- }
- }
- }
-
- /*
- * Now that we're guaranteed success, proceed to move all tasks to
- * the new cgroup. There are no failure cases after here, so this
- * is the commit point.
- */
- down_write(&css_set_rwsem);
- list_for_each_entry(cset, &tset.src_csets, mg_node) {
- list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
- cgroup_task_migrate(cset->mg_src_cgrp, task,
- cset->mg_dst_cset);
- }
- up_write(&css_set_rwsem);
-
- /*
- * Migration is committed, all target tasks are now on dst_csets.
- * Nothing is sensitive to fork() after this point. Notify
- * controllers that migration is complete.
- */
- tset.csets = &tset.dst_csets;
-
- for_each_e_css(css, i, cgrp)
- if (css->ss->attach)
- css->ss->attach(css, &tset);
-
- ret = 0;
- goto out_release_tset;
+ spin_unlock_bh(&css_set_lock);
-out_cancel_attach:
- for_each_e_css(css, i, cgrp) {
- if (css == failed_css)
- break;
- if (css->ss->cancel_attach)
- css->ss->cancel_attach(css, &tset);
- }
-out_release_tset:
- down_write(&css_set_rwsem);
- list_splice_init(&tset.dst_csets, &tset.src_csets);
- list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
- list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
- list_del_init(&cset->mg_node);
- }
- up_write(&css_set_rwsem);
- return ret;
+ return cgroup_taskset_migrate(&tset, cgrp);
}
/**
@@ -2369,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
int ret;
/* look up all src csets */
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
rcu_read_lock();
task = leader;
do {
@@ -2379,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
/* prepare dst csets and commit */
ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
if (!ret)
- ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+ ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
cgroup_migrate_finish(&preloaded_csets);
return ret;
@@ -2412,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *cgrp;
struct inode *inode;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
while (!cgroup_is_descendant(dst_cgrp, cgrp))
cgrp = cgroup_parent(cgrp);
ret = -ENOMEM;
- inode = kernfs_get_inode(sb, cgrp->procs_kn);
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
if (inode) {
ret = inode_permission(inode, MAY_WRITE);
iput(inode);
@@ -2511,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (root == &cgrp_dfl_root)
continue;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval)
@@ -2628,6 +2855,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
LIST_HEAD(preloaded_csets);
+ struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
struct cgroup_subsys_state *css;
struct css_set *src_cset;
int ret;
@@ -2637,7 +2865,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
percpu_down_write(&cgroup_threadgroup_rwsem);
/* look up all csses currently attached to @cgrp's subtree */
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
struct cgrp_cset_link *link;
@@ -2649,57 +2877,28 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
cgroup_migrate_add_src(link->cset, cgrp,
&preloaded_csets);
}
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
if (ret)
goto out_finish;
+ spin_lock_bh(&css_set_lock);
list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
- struct task_struct *last_task = NULL, *task;
+ struct task_struct *task, *ntask;
/* src_csets precede dst_csets, break on the first dst_cset */
if (!src_cset->mg_src_cgrp)
break;
- /*
- * All tasks in src_cset need to be migrated to the
- * matching dst_cset. Empty it process by process. We
- * walk tasks but migrate processes. The leader might even
- * belong to a different cset but such src_cset would also
- * be among the target src_csets because the default
- * hierarchy enforces per-process membership.
- */
- while (true) {
- down_read(&css_set_rwsem);
- task = list_first_entry_or_null(&src_cset->tasks,
- struct task_struct, cg_list);
- if (task) {
- task = task->group_leader;
- WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
- get_task_struct(task);
- }
- up_read(&css_set_rwsem);
-
- if (!task)
- break;
-
- /* guard against possible infinite loop */
- if (WARN(last_task == task,
- "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
- goto out_finish;
- last_task = task;
-
- ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
-
- put_task_struct(task);
-
- if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
- goto out_finish;
- }
+ /* all tasks in src_csets need to be migrated */
+ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
+ cgroup_taskset_add(task, &tset);
}
+ spin_unlock_bh(&css_set_lock);
+ ret = cgroup_taskset_migrate(&tset, cgrp);
out_finish:
cgroup_migrate_finish(&preloaded_csets);
percpu_up_write(&cgroup_threadgroup_rwsem);
@@ -2729,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
if (tok[0] == '\0')
continue;
for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
- if (ss->disabled || strcmp(tok + 1, ss->name))
+ if (!cgroup_ssid_enabled(ssid) ||
+ strcmp(tok + 1, ss->name))
continue;
if (*tok == '+') {
@@ -2853,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
ret = create_css(child, ss,
cgrp->subtree_control & (1 << ssid));
else
- ret = cgroup_populate_dir(child, 1 << ssid);
+ ret = css_populate_dir(cgroup_css(child, ss),
+ NULL);
if (ret)
goto err_undo_css;
}
@@ -2886,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
if (css_disable & (1 << ssid)) {
kill_css(css);
} else {
- cgroup_clear_dir(child, 1 << ssid);
+ css_clear_dir(css, NULL);
if (ss->css_reset)
ss->css_reset(css);
}
@@ -2934,15 +3135,16 @@ err_undo_css:
if (css_enable & (1 << ssid))
kill_css(css);
else
- cgroup_clear_dir(child, 1 << ssid);
+ css_clear_dir(css, NULL);
}
}
goto out_unlock;
}
-static int cgroup_populated_show(struct seq_file *seq, void *v)
+static int cgroup_events_show(struct seq_file *seq, void *v)
{
- seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+ seq_printf(seq, "populated %d\n",
+ cgroup_is_populated(seq_css(seq)->cgroup));
return 0;
}
@@ -3085,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
return kernfs_setattr(kn, &iattr);
}
-static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
+static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
+ struct cftype *cft)
{
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
@@ -3107,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
return ret;
}
- if (cft->write == cgroup_procs_write)
- cgrp->procs_kn = kn;
- else if (cft->seq_show == cgroup_populated_show)
- cgrp->populated_kn = kn;
+ if (cft->file_offset) {
+ struct cgroup_file *cfile = (void *)css + cft->file_offset;
+
+ kernfs_get(kn);
+ cfile->kn = kn;
+ list_add(&cfile->node, &css->files);
+ }
+
return 0;
}
/**
* cgroup_addrm_files - add or remove files to a cgroup directory
- * @cgrp: the target cgroup
+ * @css: the target css
+ * @cgrp: the target cgroup (usually css->cgroup)
* @cfts: array of cftypes to be added
* @is_add: whether to add or remove
*
* Depending on @is_add, add or remove files defined by @cfts on @cgrp.
- * For removals, this function never fails. If addition fails, this
- * function doesn't remove files already added. The caller is responsible
- * for cleaning up.
+ * For removals, this function never fails.
*/
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp, struct cftype cfts[],
bool is_add)
{
- struct cftype *cft;
+ struct cftype *cft, *cft_end = NULL;
int ret;
lockdep_assert_held(&cgroup_mutex);
- for (cft = cfts; cft->name[0] != '\0'; cft++) {
+restart:
+ for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
@@ -3145,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
continue;
if (is_add) {
- ret = cgroup_add_file(cgrp, cft);
+ ret = cgroup_add_file(css, cgrp, cft);
if (ret) {
pr_warn("%s: failed to add %s, err=%d\n",
__func__, cft->name, ret);
- return ret;
+ cft_end = cft;
+ is_add = false;
+ goto restart;
}
} else {
cgroup_rm_file(cgrp, cft);
@@ -3175,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
if (cgroup_is_dead(cgrp))
continue;
- ret = cgroup_addrm_files(cgrp, cfts, is_add);
+ ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
if (ret)
break;
}
@@ -3287,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
- if (ss->disabled)
+ if (!cgroup_ssid_enabled(ss->id))
return 0;
if (!cfts || cfts[0].name[0] == '\0')
@@ -3337,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
- /*
- * If legacy_flies_on_dfl, we want to show the legacy files on the
- * dfl hierarchy but iff the target subsystem hasn't been updated
- * for the dfl hierarchy yet.
- */
- if (!cgroup_legacy_files_on_dfl ||
- ss->dfl_cftypes != ss->legacy_cftypes) {
- for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
- cft->flags |= __CFTYPE_NOT_ON_DFL;
- }
-
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
@@ -3362,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
int count = 0;
struct cgrp_cset_link *link;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
return count;
}
@@ -3597,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
}
/**
- * css_advance_task_iter - advance a task itererator to the next css_set
+ * css_task_iter_advance_css_set - advance a task iterator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
*/
-static void css_advance_task_iter(struct css_task_iter *it)
+static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
struct list_head *l = it->cset_pos;
struct cgrp_cset_link *link;
struct css_set *cset;
+ lockdep_assert_held(&css_set_lock);
+
/* Advance to the next non-empty css_set */
do {
l = l->next;
if (l == it->cset_head) {
it->cset_pos = NULL;
+ it->task_pos = NULL;
return;
}
@@ -3623,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it)
link = list_entry(l, struct cgrp_cset_link, cset_link);
cset = link->cset;
}
- } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+ } while (!css_set_populated(cset));
it->cset_pos = l;
@@ -3634,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it)
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
+
+ /*
+ * We don't keep css_sets locked across iteration steps and thus
+ * need to take steps to ensure that iteration can be resumed after
+ * the lock is re-acquired. Iteration is performed at two levels -
+ * css_sets and tasks in them.
+ *
+ * Once created, a css_set never leaves its cgroup lists, so a
+ * pinned css_set is guaranteed to stay put and we can resume
+ * iteration afterwards.
+ *
+ * Tasks may leave @cset across iteration steps. This is resolved
+ * by registering each iterator with the css_set currently being
+ * walked and making css_set_move_task() advance iterators whose
+ * next task is leaving.
+ */
+ if (it->cur_cset) {
+ list_del(&it->iters_node);
+ put_css_set_locked(it->cur_cset);
+ }
+ get_css_set(cset);
+ it->cur_cset = cset;
+ list_add(&it->iters_node, &cset->task_iters);
+}
+
+static void css_task_iter_advance(struct css_task_iter *it)
+{
+ struct list_head *l = it->task_pos;
+
+ lockdep_assert_held(&css_set_lock);
+ WARN_ON_ONCE(!l);
+
+ /*
+ * Advance iterator to find next entry. cset->tasks is consumed
+ * first and then ->mg_tasks. After ->mg_tasks, we move onto the
+ * next cset.
+ */
+ l = l->next;
+
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
+
+ if (l == it->mg_tasks_head)
+ css_task_iter_advance_css_set(it);
+ else
+ it->task_pos = l;
}
/**
@@ -3645,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it)
* css_task_iter_next() to walk through the tasks until the function
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
- *
- * Note that this function acquires a lock which is released when the
- * iteration finishes. The caller can't sleep while iteration is in
- * progress.
*/
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
- __acquires(css_set_rwsem)
{
/* no one should try to iterate before mounting cgroups */
WARN_ON_ONCE(!use_task_css_set_links);
- down_read(&css_set_rwsem);
+ memset(it, 0, sizeof(*it));
+
+ spin_lock_bh(&css_set_lock);
it->ss = css->ss;
@@ -3668,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
it->cset_head = it->cset_pos;
- css_advance_task_iter(it);
+ css_task_iter_advance_css_set(it);
+
+ spin_unlock_bh(&css_set_lock);
}
/**
@@ -3681,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
*/
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
- struct task_struct *res;
- struct list_head *l = it->task_pos;
-
- /* If the iterator cg is NULL, we have no tasks */
- if (!it->cset_pos)
- return NULL;
- res = list_entry(l, struct task_struct, cg_list);
+ if (it->cur_task) {
+ put_task_struct(it->cur_task);
+ it->cur_task = NULL;
+ }
- /*
- * Advance iterator to find next entry. cset->tasks is consumed
- * first and then ->mg_tasks. After ->mg_tasks, we move onto the
- * next cset.
- */
- l = l->next;
+ spin_lock_bh(&css_set_lock);
- if (l == it->tasks_head)
- l = it->mg_tasks_head->next;
+ if (it->task_pos) {
+ it->cur_task = list_entry(it->task_pos, struct task_struct,
+ cg_list);
+ get_task_struct(it->cur_task);
+ css_task_iter_advance(it);
+ }
- if (l == it->mg_tasks_head)
- css_advance_task_iter(it);
- else
- it->task_pos = l;
+ spin_unlock_bh(&css_set_lock);
- return res;
+ return it->cur_task;
}
/**
@@ -3714,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
* Finish task iteration started by css_task_iter_start().
*/
void css_task_iter_end(struct css_task_iter *it)
- __releases(css_set_rwsem)
{
- up_read(&css_set_rwsem);
+ if (it->cur_cset) {
+ spin_lock_bh(&css_set_lock);
+ list_del(&it->iters_node);
+ put_css_set_locked(it->cur_cset);
+ spin_unlock_bh(&css_set_lock);
+ }
+
+ if (it->cur_task)
+ put_task_struct(it->cur_task);
}
/**
@@ -3741,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
mutex_lock(&cgroup_mutex);
/* all tasks in @from are being moved, all csets are source */
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry(link, &from->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
if (ret)
@@ -3762,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
css_task_iter_end(&it);
if (task) {
- ret = cgroup_migrate(to, task, false);
+ ret = cgroup_migrate(task, false, to);
put_task_struct(task);
}
} while (task && !ret);
@@ -4259,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
static struct cftype cgroup_dfl_base_files[] = {
{
.name = "cgroup.procs",
+ .file_offset = offsetof(struct cgroup, procs_file),
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
.write = cgroup_procs_write,
- .mode = S_IRUGO | S_IWUSR,
},
{
.name = "cgroup.controllers",
@@ -4283,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = {
.write = cgroup_subtree_control_write,
},
{
- .name = "cgroup.populated",
+ .name = "cgroup.events",
.flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = cgroup_populated_show,
+ .file_offset = offsetof(struct cgroup, events_file),
+ .seq_show = cgroup_events_show,
},
{ } /* terminate */
};
@@ -4300,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = {
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
.write = cgroup_procs_write,
- .mode = S_IRUGO | S_IWUSR,
},
{
.name = "cgroup.clone_children",
@@ -4320,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = {
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
.write = cgroup_tasks_write,
- .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
@@ -4337,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = {
{ } /* terminate */
};
-/**
- * cgroup_populate_dir - create subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be added
- *
- * On failure, no file is added.
- */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
-{
- struct cgroup_subsys *ss;
- int i, ret = 0;
-
- /* process cftsets of each subsystem */
- for_each_subsys(ss, i) {
- struct cftype *cfts;
-
- if (!(subsys_mask & (1 << i)))
- continue;
-
- list_for_each_entry(cfts, &ss->cfts, node) {
- ret = cgroup_addrm_files(cgrp, cfts, true);
- if (ret < 0)
- goto err;
- }
- }
- return 0;
-err:
- cgroup_clear_dir(cgrp, subsys_mask);
- return ret;
-}
-
/*
* css destruction is four-stage process.
*
@@ -4396,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
+ struct cgroup_file *cfile;
percpu_ref_exit(&css->refcnt);
+ list_for_each_entry(cfile, &css->files, node)
+ kernfs_put(cfile->kn);
+
if (ss) {
/* css free path */
int id = css->id;
@@ -4503,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->ss = ss;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
+ INIT_LIST_HEAD(&css->files);
css->serial_nr = css_serial_nr_next++;
if (cgroup_parent(cgrp)) {
@@ -4579,13 +4801,13 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
if (err)
goto err_free_css;
- err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+ err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
if (err < 0)
goto err_free_percpu_ref;
css->id = err;
if (visible) {
- err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ err = css_populate_dir(css, NULL);
if (err)
goto err_free_id;
}
@@ -4611,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
err_list_del:
list_del_rcu(&css->sibling);
- cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+ css_clear_dir(css, NULL);
err_free_id:
cgroup_idr_remove(&ss->css_idr, css->id);
err_free_percpu_ref:
@@ -4628,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct kernfs_node *kn;
- struct cftype *base_files;
int ssid, ret;
/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
@@ -4656,7 +4877,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
- cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+ cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
if (cgrp->id < 0) {
ret = -ENOMEM;
goto out_cancel_ref;
@@ -4704,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
- if (cgroup_on_dfl(cgrp))
- base_files = cgroup_dfl_base_files;
- else
- base_files = cgroup_legacy_base_files;
-
- ret = cgroup_addrm_files(cgrp, base_files, true);
+ ret = css_populate_dir(&cgrp->self, NULL);
if (ret)
goto out_destroy;
@@ -4796,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css)
* This must happen before css is disassociated with its cgroup.
* See seq_css() for details.
*/
- cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+ css_clear_dir(css, NULL);
/*
* Killing would put the base ref, but we need to keep it alive
@@ -4845,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup_subsys_state *css;
- bool empty;
int ssid;
lockdep_assert_held(&cgroup_mutex);
/*
- * css_set_rwsem synchronizes access to ->cset_links and prevents
- * @cgrp from being removed while put_css_set() is in progress.
+ * Only migration can raise populated from zero and we're already
+ * holding cgroup_mutex.
*/
- down_read(&css_set_rwsem);
- empty = list_empty(&cgrp->cset_links);
- up_read(&css_set_rwsem);
- if (!empty)
+ if (cgroup_is_populated(cgrp))
return -EBUSY;
/*
@@ -4955,6 +5167,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
+ have_free_callback |= (bool)ss->free << ss->id;
+ have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
@@ -4993,6 +5207,8 @@ int __init cgroup_init_early(void)
ss->id = i;
ss->name = cgroup_subsys_name[i];
+ if (!ss->legacy_name)
+ ss->legacy_name = cgroup_subsys_name[i];
if (ss->early_init)
cgroup_init_subsys(ss, true);
@@ -5000,6 +5216,8 @@ int __init cgroup_init_early(void)
return 0;
}
+static unsigned long cgroup_disable_mask __initdata;
+
/**
* cgroup_init - cgroup initialization
*
@@ -5010,7 +5228,7 @@ int __init cgroup_init(void)
{
struct cgroup_subsys *ss;
unsigned long key;
- int ssid, err;
+ int ssid;
BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
@@ -5046,14 +5264,15 @@ int __init cgroup_init(void)
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (ss->disabled)
+ if (cgroup_disable_mask & (1 << ssid)) {
+ static_branch_disable(cgroup_subsys_enabled_key[ssid]);
+ printk(KERN_INFO "Disabling %s control group subsystem\n",
+ ss->name);
continue;
+ }
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
- if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
- ss->dfl_cftypes = ss->legacy_cftypes;
-
if (!ss->dfl_cftypes)
cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
@@ -5068,17 +5287,10 @@ int __init cgroup_init(void)
ss->bind(init_css_set.subsys[ssid]);
}
- err = sysfs_create_mount_point(fs_kobj, "cgroup");
- if (err)
- return err;
-
- err = register_filesystem(&cgroup_fs_type);
- if (err < 0) {
- sysfs_remove_mount_point(fs_kobj, "cgroup");
- return err;
- }
+ WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
+ WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
- proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
return 0;
}
@@ -5125,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
goto out;
mutex_lock(&cgroup_mutex);
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
for_each_root(root) {
struct cgroup_subsys *ss;
@@ -5136,26 +5348,48 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
continue;
seq_printf(m, "%d:", root->hierarchy_id);
- for_each_subsys(ss, ssid)
- if (root->subsys_mask & (1 << ssid))
- seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+ if (root != &cgrp_dfl_root)
+ for_each_subsys(ss, ssid)
+ if (root->subsys_mask & (1 << ssid))
+ seq_printf(m, "%s%s", count++ ? "," : "",
+ ss->legacy_name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
+
cgrp = task_cgroup_from_root(tsk, root);
- path = cgroup_path(cgrp, buf, PATH_MAX);
- if (!path) {
- retval = -ENAMETOOLONG;
- goto out_unlock;
+
+ /*
+ * On traditional hierarchies, all zombie tasks show up as
+ * belonging to the root cgroup. On the default hierarchy,
+ * while a zombie doesn't show up in "cgroup.procs" and
+ * thus can't be migrated, its /proc/PID/cgroup keeps
+ * reporting the cgroup it belonged to before exiting. If
+ * the cgroup is removed before the zombie is reaped,
+ * " (deleted)" is appended to the cgroup path.
+ */
+ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
+ goto out_unlock;
+ }
+ } else {
+ path = "/";
}
+
seq_puts(m, path);
- seq_putc(m, '\n');
+
+ if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
+ seq_puts(m, " (deleted)\n");
+ else
+ seq_putc(m, '\n');
}
retval = 0;
out_unlock:
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
mutex_unlock(&cgroup_mutex);
kfree(buf);
out:
@@ -5178,8 +5412,9 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
- ss->name, ss->root->hierarchy_id,
- atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+ ss->legacy_name, ss->root->hierarchy_id,
+ atomic_read(&ss->root->nr_cgrps),
+ cgroup_ssid_enabled(i));
mutex_unlock(&cgroup_mutex);
return 0;
@@ -5197,6 +5432,19 @@ static const struct file_operations proc_cgroupstats_operations = {
.release = single_release,
};
+static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+ return &ss_priv[i - CGROUP_CANFORK_START];
+ return NULL;
+}
+
+static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+ void **private = subsys_canfork_priv_p(ss_priv, i);
+ return private ? *private : NULL;
+}
+
/**
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
@@ -5212,6 +5460,57 @@ void cgroup_fork(struct task_struct *child)
}
/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i, j, ret;
+
+ for_each_subsys_which(ss, i, &have_canfork_callback) {
+ ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+ if (ret)
+ goto out_revert;
+ }
+
+ return 0;
+
+out_revert:
+ for_each_subsys(ss, j) {
+ if (j >= i)
+ break;
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+ }
+
+ return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+ void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+ struct cgroup_subsys *ss;
+ int i;
+
+ for_each_subsys(ss, i)
+ if (ss->cancel_fork)
+ ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+}
+
+/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
@@ -5221,7 +5520,8 @@ void cgroup_fork(struct task_struct *child)
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
* list.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ void *old_ss_priv[CGROUP_CANFORK_COUNT])
{
struct cgroup_subsys *ss;
int i;
@@ -5235,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child)
* @child during its iteration.
*
* If we won the race, @child is associated with %current's
- * css_set. Grabbing css_set_rwsem guarantees both that the
+ * css_set. Grabbing css_set_lock guarantees both that the
* association is stable, and, on completion of the parent's
* migration, @child is visible in the source of migration or
* already in the destination cgroup. This guarantee is necessary
@@ -5250,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child)
if (use_task_css_set_links) {
struct css_set *cset;
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
- rcu_assign_pointer(child->cgroups, cset);
- list_add(&child->cg_list, &cset->tasks);
get_css_set(cset);
+ css_set_move_task(child, NULL, cset, false);
}
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
}
/*
@@ -5266,7 +5565,7 @@ void cgroup_post_fork(struct task_struct *child)
* and addition to css_set.
*/
for_each_subsys_which(ss, i, &have_fork_callback)
- ss->fork(child);
+ ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
}
/**
@@ -5292,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
struct css_set *cset;
- bool put_cset = false;
int i;
/*
* Unlink from @tsk from its css_set. As migration path can't race
- * with us, we can check cg_list without grabbing css_set_rwsem.
+ * with us, we can check css_set and cg_list without synchronization.
*/
+ cset = task_css_set(tsk);
+
if (!list_empty(&tsk->cg_list)) {
- down_write(&css_set_rwsem);
- list_del_init(&tsk->cg_list);
- up_write(&css_set_rwsem);
- put_cset = true;
+ spin_lock_bh(&css_set_lock);
+ css_set_move_task(tsk, cset, NULL, false);
+ spin_unlock_bh(&css_set_lock);
+ } else {
+ get_css_set(cset);
}
- /* Reassign the task to the init_css_set. */
- cset = task_css_set(tsk);
- RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
-
/* see cgroup_post_fork() for details */
- for_each_subsys_which(ss, i, &have_exit_callback) {
- struct cgroup_subsys_state *old_css = cset->subsys[i];
- struct cgroup_subsys_state *css = task_css(tsk, i);
+ for_each_subsys_which(ss, i, &have_exit_callback)
+ ss->exit(tsk);
+}
- ss->exit(css, old_css, tsk);
- }
+void cgroup_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ for_each_subsys_which(ss, ssid, &have_free_callback)
+ ss->free(task);
- if (put_cset)
- put_css_set(cset);
+ put_css_set(cset);
}
static void check_for_release(struct cgroup *cgrp)
{
- if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+ if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
schedule_work(&cgrp->release_agent_work);
}
@@ -5400,26 +5702,16 @@ static int __init cgroup_disable(char *str)
continue;
for_each_subsys(ss, i) {
- if (!strcmp(token, ss->name)) {
- ss->disabled = 1;
- printk(KERN_INFO "Disabling %s control group"
- " subsystem\n", ss->name);
- break;
- }
+ if (strcmp(token, ss->name) &&
+ strcmp(token, ss->legacy_name))
+ continue;
+ cgroup_disable_mask |= 1 << i;
}
}
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
-static int __init cgroup_set_legacy_files_on_dfl(char *str)
-{
- printk("cgroup: using legacy files on the default hierarchy\n");
- cgroup_legacy_files_on_dfl = true;
- return 0;
-}
-__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
-
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5523,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
if (!name_buf)
return -ENOMEM;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
rcu_read_lock();
cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5534,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
c->root->hierarchy_id, name_buf);
}
rcu_read_unlock();
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
kfree(name_buf);
return 0;
}
@@ -5545,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
@@ -5568,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
overflow:
seq_puts(seq, " ...\n");
}
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
return 0;
}
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return (!cgroup_has_tasks(css->cgroup) &&
+ return (!cgroup_is_populated(css->cgroup) &&
!css_has_online_children(&css->cgroup->self));
}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
* to do anything as freezer_attach() will put @task into the appropriate
* state.
*/
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
{
struct freezer *freezer;
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..cdd8df4e991c
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,353 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+ struct cgroup_subsys_state css;
+
+ /*
+ * Use 64-bit types so that we can safely represent "max" as
+ * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+ */
+ atomic64_t counter;
+ int64_t limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+ return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+ struct pids_cgroup *pids;
+
+ pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+ if (!pids)
+ return ERR_PTR(-ENOMEM);
+
+ pids->limit = PIDS_MAX;
+ atomic64_set(&pids->counter, 0);
+ return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+ /*
+ * A negative count (or overflow for that matter) is invalid,
+ * and indicates a bug in the `pids` controller proper.
+ */
+ WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p;
+
+ for (p = pids; p; p = parent_pids(p))
+ atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+ struct pids_cgroup *p, *q;
+
+ for (p = pids; p; p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ /*
+ * Since new is capped to the maximum number of pid_t, if
+ * p->limit is %PIDS_MAX then we know that this test will never
+ * fail.
+ */
+ if (new > p->limit)
+ goto revert;
+ }
+
+ return 0;
+
+revert:
+ for (q = pids; q != p; q = parent_pids(q))
+ pids_cancel(q, num);
+ pids_cancel(p, num);
+
+ return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ /*
+ * No need to pin @old_css between here and cancel_attach()
+ * because cgroup core protects it from being freed before
+ * the migration completes or fails.
+ */
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(pids, 1);
+ pids_uncharge(old_pids, 1);
+ }
+
+ return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+ struct cgroup_taskset *tset)
+{
+ struct pids_cgroup *pids = css_pids(css);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, tset) {
+ struct cgroup_subsys_state *old_css;
+ struct pids_cgroup *old_pids;
+
+ old_css = task_css(task, pids_cgrp_id);
+ old_pids = css_pids(old_css);
+
+ pids_charge(old_pids, 1);
+ pids_uncharge(pids, 1);
+ }
+}
+
+static int pids_can_fork(struct task_struct *task, void **priv_p)
+{
+ struct cgroup_subsys_state *css;
+ struct pids_cgroup *pids;
+ int err;
+
+ /*
+ * Use the "current" task_css for the pids subsystem as the tentative
+ * css. It is possible we will charge the wrong hierarchy, in which
+ * case we will forcefully revert/reapply the charge on the right
+ * hierarchy after it is committed to the task proper.
+ */
+ css = task_get_css(current, pids_cgrp_id);
+ pids = css_pids(css);
+
+ err = pids_try_charge(pids, 1);
+ if (err)
+ goto err_css_put;
+
+ *priv_p = css;
+ return 0;
+
+err_css_put:
+ css_put(css);
+ return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css = priv;
+ struct pids_cgroup *pids = css_pids(css);
+
+ pids_uncharge(pids, 1);
+ css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *priv)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup_subsys_state *old_css = priv;
+ struct pids_cgroup *pids;
+ struct pids_cgroup *old_pids = css_pids(old_css);
+
+ css = task_get_css(task, pids_cgrp_id);
+ pids = css_pids(css);
+
+ /*
+ * If the association has changed, we have to revert and reapply the
+ * charge/uncharge on the wrong hierarchy to the current one. Since
+ * the association can only change due to an organisation event, it's
+ * okay for us to ignore the limit in this case.
+ */
+ if (pids != old_pids) {
+ pids_uncharge(old_pids, 1);
+ pids_charge(pids, 1);
+ }
+
+ css_put(css);
+ css_put(old_css);
+}
+
+static void pids_free(struct task_struct *task)
+{
+ struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
+
+ pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cgroup_subsys_state *css = of_css(of);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit;
+ int err;
+
+ buf = strstrip(buf);
+ if (!strcmp(buf, PIDS_MAX_STR)) {
+ limit = PIDS_MAX;
+ goto set_limit;
+ }
+
+ err = kstrtoll(buf, 0, &limit);
+ if (err)
+ return err;
+
+ if (limit < 0 || limit >= PIDS_MAX)
+ return -EINVAL;
+
+set_limit:
+ /*
+ * Limit updates don't need to be mutex'd, since it isn't
+ * critical that any racing fork()s follow the new limit.
+ */
+ pids->limit = limit;
+ return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(sf);
+ struct pids_cgroup *pids = css_pids(css);
+ int64_t limit = pids->limit;
+
+ if (limit >= PIDS_MAX)
+ seq_printf(sf, "%s\n", PIDS_MAX_STR);
+ else
+ seq_printf(sf, "%lld\n", limit);
+
+ return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return atomic64_read(&pids->counter);
+}
+
+static struct cftype pids_files[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+ .css_alloc = pids_css_alloc,
+ .css_free = pids_css_free,
+ .can_attach = pids_can_attach,
+ .cancel_attach = pids_cancel_attach,
+ .can_fork = pids_can_fork,
+ .cancel_fork = pids_cancel_fork,
+ .fork = pids_fork,
+ .free = pids_free,
+ .legacy_cftypes = pids_files,
+ .dfl_cftypes = pids_files,
+};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5644ec5582b9..85ff5e26e23b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -102,19 +102,6 @@ void get_online_cpus(void)
}
EXPORT_SYMBOL_GPL(get_online_cpus);
-bool try_get_online_cpus(void)
-{
- if (cpu_hotplug.active_writer == current)
- return true;
- if (!mutex_trylock(&cpu_hotplug.lock))
- return false;
- cpuhp_lock_acquire_tryread();
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
- return true;
-}
-EXPORT_SYMBOL_GPL(try_get_online_cpus);
-
void put_online_cpus(void)
{
int refcount;
@@ -191,21 +178,22 @@ void cpu_hotplug_done(void)
void cpu_hotplug_disable(void)
{
cpu_maps_update_begin();
- cpu_hotplug_disabled = 1;
+ cpu_hotplug_disabled++;
cpu_maps_update_done();
}
+EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
void cpu_hotplug_enable(void)
{
cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
+ WARN_ON(--cpu_hotplug_disabled < 0);
cpu_maps_update_done();
}
-
+EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
-int __ref register_cpu_notifier(struct notifier_block *nb)
+int register_cpu_notifier(struct notifier_block *nb)
{
int ret;
cpu_maps_update_begin();
@@ -214,7 +202,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
return ret;
}
-int __ref __register_cpu_notifier(struct notifier_block *nb)
+int __register_cpu_notifier(struct notifier_block *nb)
{
return raw_notifier_chain_register(&cpu_chain, nb);
}
@@ -244,7 +232,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
EXPORT_SYMBOL(register_cpu_notifier);
EXPORT_SYMBOL(__register_cpu_notifier);
-void __ref unregister_cpu_notifier(struct notifier_block *nb)
+void unregister_cpu_notifier(struct notifier_block *nb)
{
cpu_maps_update_begin();
raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -252,7 +240,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_cpu_notifier);
-void __ref __unregister_cpu_notifier(struct notifier_block *nb)
+void __unregister_cpu_notifier(struct notifier_block *nb)
{
raw_notifier_chain_unregister(&cpu_chain, nb);
}
@@ -303,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
{
struct task_struct *g, *p;
- read_lock_irq(&tasklist_lock);
- do_each_thread(g, p) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
if (!p->on_rq)
continue;
/*
@@ -319,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
- } while_each_thread(g, p);
- read_unlock_irq(&tasklist_lock);
+ }
+ read_unlock(&tasklist_lock);
}
struct take_cpu_down_param {
@@ -329,7 +317,7 @@ struct take_cpu_down_param {
};
/* Take this CPU down. */
-static int __ref take_cpu_down(void *_param)
+static int take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
int err;
@@ -343,12 +331,12 @@ static int __ref take_cpu_down(void *_param)
/* Give up timekeeping duties */
tick_handover_do_timer();
/* Park the stopper thread */
- kthread_park(current);
+ stop_machine_park((long)param->hcpu);
return 0;
}
/* Requires cpu_add_remove_lock to be held */
-static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
+static int _cpu_down(unsigned int cpu, int tasks_frozen)
{
int err, nr_calls = 0;
void *hcpu = (void *)(long)cpu;
@@ -381,14 +369,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
* will observe it.
*
* For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so explicitly call both.
+ * not imply sync_sched(), so wait for both.
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
-#ifdef CONFIG_PREEMPT
- synchronize_sched();
-#endif
- synchronize_rcu();
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
smpboot_park_threads(cpu);
@@ -401,7 +389,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
- err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+ err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
@@ -442,7 +430,7 @@ out_release:
return err;
}
-int __ref cpu_down(unsigned int cpu)
+int cpu_down(unsigned int cpu)
{
int err;
@@ -608,13 +596,18 @@ int disable_nonboot_cpus(void)
}
}
- if (!error) {
+ if (!error)
BUG_ON(num_online_cpus() > 1);
- /* Make sure the CPUs won't be enabled by someone else */
- cpu_hotplug_disabled = 1;
- } else {
+ else
pr_err("Non-boot CPUs are not disabled\n");
- }
+
+ /*
+ * Make sure the CPUs won't be enabled by someone else. We need to do
+ * this even in case of failure as all disable_nonboot_cpus() users are
+ * supposed to do enable_nonboot_cpus() on the failure path.
+ */
+ cpu_hotplug_disabled++;
+
cpu_maps_update_done();
return error;
}
@@ -627,13 +620,13 @@ void __weak arch_enable_nonboot_cpus_end(void)
{
}
-void __ref enable_nonboot_cpus(void)
+void enable_nonboot_cpus(void)
{
int cpu, error;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
+ WARN_ON(--cpu_hotplug_disabled < 0);
if (cpumask_empty(frozen_cpus))
goto out;
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 9656a3c36503..009cc9a17d95 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
* low power state that may have caused some blocks in the same power domain
* to reset.
*
- * Must be called after cpu_pm_exit has been called on all cpus in the power
+ * Must be called after cpu_cluster_pm_enter has been called for the power
* domain, and before cpu_pm_exit has been called on any cpu in the power
* domain. Notified drivers can include VFP co-processor, interrupt controller
* and its PM extensions, local CPU timers context save/restore which
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ee14e3a35a29..10ae73611d80 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_cpuset_subset(trial, par))
goto out;
/*
@@ -497,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* be changed to have empty cpus_allowed or mems_allowed.
*/
ret = -ENOSPC;
- if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+ if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
if (!cpumask_empty(cur->cpus_allowed) &&
cpumask_empty(trial->cpus_allowed))
goto out;
@@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpumask_copy(cp->effective_cpus, new_cpus);
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
@@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
@@ -1223,7 +1226,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
spin_unlock_irq(&callback_lock);
/* use trialcs->mems_allowed as a temp variable */
- update_nodemasks_hier(cs, &cs->mems_allowed);
+ update_nodemasks_hier(cs, &trialcs->mems_allowed);
done:
return retval;
}
@@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_on_dfl(css->cgroup) &&
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1484,9 +1487,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
{
/* static buf protected by cpuset_mutex */
static nodemask_t cpuset_attach_nodemask_to;
- struct mm_struct *mm;
struct task_struct *task;
- struct task_struct *leader = cgroup_taskset_first(tset);
+ struct task_struct *leader;
struct cpuset *cs = css_cs(css);
struct cpuset *oldcs = cpuset_attach_old_cs;
@@ -1512,26 +1514,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
}
/*
- * Change mm, possibly for multiple threads in a threadgroup. This is
- * expensive and may sleep.
+ * Change mm for all threadgroup leaders. This is expensive and may
+ * sleep and should be moved outside migration path proper.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
- mm = get_task_mm(leader);
- if (mm) {
- mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
-
- /*
- * old_mems_allowed is the same with mems_allowed here, except
- * if this task is being moved automatically due to hotplug.
- * In that case @mems_allowed has been updated and is empty,
- * so @old_mems_allowed is the right nodesets that we migrate
- * mm from.
- */
- if (is_memory_migrate(cs)) {
- cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
- &cpuset_attach_nodemask_to);
+ cgroup_taskset_for_each_leader(leader, tset) {
+ struct mm_struct *mm = get_task_mm(leader);
+
+ if (mm) {
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+
+ /*
+ * old_mems_allowed is the same with mems_allowed
+ * here, except if this task is being moved
+ * automatically due to hotplug. In that case
+ * @mems_allowed has been updated and is empty, so
+ * @old_mems_allowed is the right nodesets that we
+ * migrate mm from.
+ */
+ if (is_memory_migrate(cs)) {
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+ &cpuset_attach_nodemask_to);
+ }
+ mmput(mm);
}
- mmput(mm);
}
cs->old_mems_allowed = cpuset_attach_nodemask_to;
@@ -1594,9 +1600,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
case FILE_MEMORY_PRESSURE_ENABLED:
cpuset_memory_pressure_enabled = !!val;
break;
- case FILE_MEMORY_PRESSURE:
- retval = -EACCES;
- break;
case FILE_SPREAD_PAGE:
retval = update_flag(CS_SPREAD_PAGE, cs, val);
break;
@@ -1863,9 +1866,6 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE,
- .mode = S_IRUGO,
},
{
@@ -1952,7 +1952,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
spin_lock_irq(&callback_lock);
- if (cgroup_on_dfl(cs->css.cgroup)) {
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
@@ -2029,7 +2029,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
- if (cgroup_on_dfl(root_css->cgroup)) {
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
@@ -2210,7 +2210,7 @@ retry:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- if (cgroup_on_dfl(cs->css.cgroup))
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
@@ -2241,7 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
+ bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
mutex_lock(&cpuset_mutex);
@@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
}
/**
- * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @tsk: pointer to task_struct of some task.
+ * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
*
- * Description: Prints @task's name, cpuset name, and cached copy of its
+ * Description: Prints current's name, cpuset name, and cached copy of its
* mems_allowed to the kernel log.
*/
-void cpuset_print_task_mems_allowed(struct task_struct *tsk)
+void cpuset_print_current_mems_allowed(void)
{
struct cgroup *cgrp;
rcu_read_lock();
- cgrp = task_cs(tsk)->css.cgroup;
- pr_info("%s cpuset=", tsk->comm);
+ cgrp = task_cs(current)->css.cgroup;
+ pr_info("%s cpuset=", current->comm);
pr_cont_cgroup_name(cgrp);
- pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
+ pr_cont(" mems_allowed=%*pbl\n",
+ nodemask_pr_args(&current->mems_allowed));
rcu_read_unlock();
}
diff --git a/kernel/cred.c b/kernel/cred.c
index ec1c07667ec1..71179a09c1d6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -20,11 +20,16 @@
#include <linux/cn_proc.h>
#if 0
-#define kdebug(FMT, ...) \
- printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+ printk("[%-5.5s%5u] " FMT "\n", \
+ current->comm, current->pid, ##__VA_ARGS__)
#else
-#define kdebug(FMT, ...) \
- no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#define kdebug(FMT, ...) \
+do { \
+ if (0) \
+ no_printk("[%-5.5s%5u] " FMT "\n", \
+ current->comm, current->pid, ##__VA_ARGS__); \
+} while (0)
#endif
static struct kmem_cache *cred_jar;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..1a734e0adfa7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
+static atomic_t nr_switch_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -195,7 +196,7 @@ static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
static int perf_sample_allowed_ns __read_mostly =
DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
-void update_perf_cpu_limits(void)
+static void update_perf_cpu_limits(void)
{
u64 tmp = perf_sample_period_ns;
@@ -471,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
* mode SWOUT : schedule out everything
* mode SWIN : schedule in based on cgroup for next
*/
-void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task, int mode)
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
@@ -1242,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event)
PERF_EVENT_STATE_INACTIVE;
}
-/*
- * Called at perf_event creation and when events are attached/detached from a
- * group.
- */
-static void perf_event__read_size(struct perf_event *event)
+static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
{
int entry = sizeof(u64); /* value */
int size = 0;
@@ -1262,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event)
entry += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_GROUP) {
- nr += event->group_leader->nr_siblings;
+ nr += nr_siblings;
size += sizeof(u64);
}
@@ -1270,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event)
event->read_size = size;
}
-static void perf_event__header_size(struct perf_event *event)
+static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
struct perf_sample_data *data;
- u64 sample_type = event->attr.sample_type;
u16 size = 0;
- perf_event__read_size(event);
-
if (sample_type & PERF_SAMPLE_IP)
size += sizeof(data->ip);
@@ -1302,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event)
event->header_size = size;
}
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__header_size(struct perf_event *event)
+{
+ __perf_event_read_size(event,
+ event->group_leader->nr_siblings);
+ __perf_event_header_size(event, event->attr.sample_type);
+}
+
static void perf_event__id_header_size(struct perf_event *event)
{
struct perf_sample_data *data;
@@ -1329,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event)
event->id_header_size = size;
}
+static bool perf_event_validate_size(struct perf_event *event)
+{
+ /*
+ * The values computed here will be over-written when we actually
+ * attach the event.
+ */
+ __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
+ __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
+ perf_event__id_header_size(event);
+
+ /*
+ * Sum the lot; should not exceed the 64k limit we have on records.
+ * Conservative limit to allow for callchains and other variable fields.
+ */
+ if (event->read_size + event->header_size +
+ event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
+ return false;
+
+ return true;
+}
+
static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
@@ -1868,8 +1894,6 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
- event->tstamp_running += tstamp - event->tstamp_stopped;
-
perf_set_shadow_time(event, ctx, tstamp);
perf_log_itrace_start(event);
@@ -1881,6 +1905,8 @@ event_sched_in(struct perf_event *event,
goto out;
}
+ event->tstamp_running += tstamp - event->tstamp_stopped;
+
if (!is_software_event(event))
cpuctx->active_oncpu++;
if (!ctx->nr_active++)
@@ -1913,7 +1939,7 @@ group_sched_in(struct perf_event *group_event,
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
- pmu->start_txn(pmu);
+ pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
@@ -2619,6 +2645,9 @@ static void perf_pmu_sched_task(struct task_struct *prev,
local_irq_restore(flags);
}
+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool sched_in);
+
#define for_each_task_context_nr(ctxn) \
for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
@@ -2641,6 +2670,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, next, false);
+
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
@@ -2831,6 +2863,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
+ if (atomic_read(&nr_switch_events))
+ perf_event_switch(task, prev, true);
+
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(prev, task, true);
}
@@ -3174,14 +3209,22 @@ void perf_event_exec(void)
rcu_read_unlock();
}
+struct perf_read_data {
+ struct perf_event *event;
+ bool group;
+ int ret;
+};
+
/*
* Cross CPU call to read the hardware event
*/
static void __perf_event_read(void *info)
{
- struct perf_event *event = info;
+ struct perf_read_data *data = info;
+ struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct pmu *pmu = event->pmu;
/*
* If this is a task context, we need to check whether it is
@@ -3198,9 +3241,35 @@ static void __perf_event_read(void *info)
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
+
update_event_times(event);
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ goto unlock;
+
+ if (!data->group) {
+ pmu->read(event);
+ data->ret = 0;
+ goto unlock;
+ }
+
+ pmu->start_txn(pmu, PERF_PMU_TXN_READ);
+
+ pmu->read(event);
+
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ update_event_times(sub);
+ if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+ /*
+ * Use sibling's PMU rather than @event's since
+ * sibling could be on different (eg: software) PMU.
+ */
+ sub->pmu->read(sub);
+ }
+ }
+
+ data->ret = pmu->commit_txn(pmu);
+
+unlock:
raw_spin_unlock(&ctx->lock);
}
@@ -3212,15 +3281,76 @@ static inline u64 perf_event_count(struct perf_event *event)
return __perf_event_count(event);
}
-static u64 perf_event_read(struct perf_event *event)
+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ * - either for the current task, or for this CPU
+ * - does not have inherit set, for inherited task events
+ * will not be local and we cannot read them atomically
+ * - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
+{
+ unsigned long flags;
+ u64 val;
+
+ /*
+ * Disabling interrupts avoids all counter scheduling (context
+ * switches, timer based rotation and IPIs).
+ */
+ local_irq_save(flags);
+
+ /* If this is a per-task event, it must be for current */
+ WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current);
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id());
+
+ /*
+ * It must not be an event with inherit set, we cannot read
+ * all child counters from atomic context.
+ */
+ WARN_ON_ONCE(event->attr.inherit);
+
+ /*
+ * It must not have a pmu::count method, those are not
+ * NMI safe.
+ */
+ WARN_ON_ONCE(event->pmu->count);
+
+ /*
+ * If the event is currently on this CPU, its either a per-task event,
+ * or local to this CPU. Furthermore it means its ACTIVE (otherwise
+ * oncpu == -1).
+ */
+ if (event->oncpu == smp_processor_id())
+ event->pmu->read(event);
+
+ val = local64_read(&event->count);
+ local_irq_restore(flags);
+
+ return val;
+}
+
+static int perf_event_read(struct perf_event *event, bool group)
{
+ int ret = 0;
+
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ struct perf_read_data data = {
+ .event = event,
+ .group = group,
+ .ret = 0,
+ };
smp_call_function_single(event->oncpu,
- __perf_event_read, event, 1);
+ __perf_event_read, &data, 1);
+ ret = data.ret;
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
@@ -3235,11 +3365,14 @@ static u64 perf_event_read(struct perf_event *event)
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
- update_event_times(event);
+ if (group)
+ update_group_times(event);
+ else
+ update_event_times(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
- return perf_event_count(event);
+ return ret;
}
/*
@@ -3454,6 +3587,10 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_task_events);
if (event->attr.freq)
atomic_dec(&nr_freq_events);
+ if (event->attr.context_switch) {
+ static_key_slow_dec_deferred(&perf_sched_events);
+ atomic_dec(&nr_switch_events);
+ }
if (is_cgroup_event(event))
static_key_slow_dec_deferred(&perf_sched_events);
if (has_branch_stack(event))
@@ -3677,7 +3814,7 @@ static void put_event(struct perf_event *event)
* see the comment there.
*
* 2) there is a lock-inversion with mmap_sem through
- * perf_event_read_group(), which takes faults while
+ * perf_read_group(), which takes faults while
* holding ctx->mutex, however this is called after
* the last filedesc died, so there is no possibility
* to trigger the AB-BA case.
@@ -3751,14 +3888,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
*running = 0;
mutex_lock(&event->child_mutex);
- total += perf_event_read(event);
+
+ (void)perf_event_read(event, false);
+ total += perf_event_count(event);
+
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
*running += event->total_time_running +
atomic64_read(&event->child_total_time_running);
list_for_each_entry(child, &event->child_list, child_list) {
- total += perf_event_read(child);
+ (void)perf_event_read(child, false);
+ total += perf_event_count(child);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
@@ -3768,55 +3909,95 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
}
EXPORT_SYMBOL_GPL(perf_event_read_value);
-static int perf_event_read_group(struct perf_event *event,
- u64 read_format, char __user *buf)
+static int __perf_read_group_add(struct perf_event *leader,
+ u64 read_format, u64 *values)
{
- struct perf_event *leader = event->group_leader, *sub;
- struct perf_event_context *ctx = leader->ctx;
- int n = 0, size = 0, ret;
- u64 count, enabled, running;
- u64 values[5];
+ struct perf_event *sub;
+ int n = 1; /* skip @nr */
+ int ret;
- lockdep_assert_held(&ctx->mutex);
+ ret = perf_event_read(leader, true);
+ if (ret)
+ return ret;
- count = perf_event_read_value(leader, &enabled, &running);
+ /*
+ * Since we co-schedule groups, {enabled,running} times of siblings
+ * will be identical to those of the leader, so we only publish one
+ * set.
+ */
+ if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+ values[n++] += leader->total_time_enabled +
+ atomic64_read(&leader->child_total_time_enabled);
+ }
- values[n++] = 1 + leader->nr_siblings;
- if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- values[n++] = enabled;
- if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- values[n++] = running;
- values[n++] = count;
+ if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+ values[n++] += leader->total_time_running +
+ atomic64_read(&leader->child_total_time_running);
+ }
+
+ /*
+ * Write {count,id} tuples for every sibling.
+ */
+ values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
- size = n * sizeof(u64);
+ list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+ values[n++] += perf_event_count(sub);
+ if (read_format & PERF_FORMAT_ID)
+ values[n++] = primary_event_id(sub);
+ }
- if (copy_to_user(buf, values, size))
- return -EFAULT;
+ return 0;
+}
- ret = size;
+static int perf_read_group(struct perf_event *event,
+ u64 read_format, char __user *buf)
+{
+ struct perf_event *leader = event->group_leader, *child;
+ struct perf_event_context *ctx = leader->ctx;
+ int ret;
+ u64 *values;
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
- n = 0;
+ lockdep_assert_held(&ctx->mutex);
- values[n++] = perf_event_read_value(sub, &enabled, &running);
- if (read_format & PERF_FORMAT_ID)
- values[n++] = primary_event_id(sub);
+ values = kzalloc(event->read_size, GFP_KERNEL);
+ if (!values)
+ return -ENOMEM;
- size = n * sizeof(u64);
+ values[0] = 1 + leader->nr_siblings;
- if (copy_to_user(buf + ret, values, size)) {
- return -EFAULT;
- }
+ /*
+ * By locking the child_mutex of the leader we effectively
+ * lock the child list of all siblings.. XXX explain how.
+ */
+ mutex_lock(&leader->child_mutex);
- ret += size;
+ ret = __perf_read_group_add(leader, read_format, values);
+ if (ret)
+ goto unlock;
+
+ list_for_each_entry(child, &leader->child_list, child_list) {
+ ret = __perf_read_group_add(child, read_format, values);
+ if (ret)
+ goto unlock;
}
+ mutex_unlock(&leader->child_mutex);
+
+ ret = event->read_size;
+ if (copy_to_user(buf, values, event->read_size))
+ ret = -EFAULT;
+ goto out;
+
+unlock:
+ mutex_unlock(&leader->child_mutex);
+out:
+ kfree(values);
return ret;
}
-static int perf_event_read_one(struct perf_event *event,
+static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
@@ -3854,7 +4035,7 @@ static bool is_event_hup(struct perf_event *event)
* Read the performance event - simple non blocking version for now
*/
static ssize_t
-perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+__perf_read(struct perf_event *event, char __user *buf, size_t count)
{
u64 read_format = event->attr.read_format;
int ret;
@@ -3872,9 +4053,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
WARN_ON_ONCE(event->ctx->parent_ctx);
if (read_format & PERF_FORMAT_GROUP)
- ret = perf_event_read_group(event, read_format, buf);
+ ret = perf_read_group(event, read_format, buf);
else
- ret = perf_event_read_one(event, read_format, buf);
+ ret = perf_read_one(event, read_format, buf);
return ret;
}
@@ -3887,7 +4068,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
int ret;
ctx = perf_event_ctx_lock(event);
- ret = perf_read_hw(event, buf, count);
+ ret = __perf_read(event, buf, count);
perf_event_ctx_unlock(event, ctx);
return ret;
@@ -3918,7 +4099,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
static void _perf_event_reset(struct perf_event *event)
{
- (void)perf_event_read(event);
+ (void)perf_event_read(event, false);
local64_set(&event->count, 0);
perf_event_update_userpage(event);
}
@@ -3958,28 +4139,21 @@ static void perf_event_for_each(struct perf_event *event,
perf_event_for_each_child(sibling, func);
}
-static int perf_event_period(struct perf_event *event, u64 __user *arg)
-{
- struct perf_event_context *ctx = event->ctx;
- int ret = 0, active;
+struct period_event {
+ struct perf_event *event;
u64 value;
+};
- if (!is_sampling_event(event))
- return -EINVAL;
-
- if (copy_from_user(&value, arg, sizeof(value)))
- return -EFAULT;
-
- if (!value)
- return -EINVAL;
+static int __perf_event_period(void *info)
+{
+ struct period_event *pe = info;
+ struct perf_event *event = pe->event;
+ struct perf_event_context *ctx = event->ctx;
+ u64 value = pe->value;
+ bool active;
- raw_spin_lock_irq(&ctx->lock);
+ raw_spin_lock(&ctx->lock);
if (event->attr.freq) {
- if (value > sysctl_perf_event_sample_rate) {
- ret = -EINVAL;
- goto unlock;
- }
-
event->attr.sample_freq = value;
} else {
event->attr.sample_period = value;
@@ -3998,11 +4172,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu);
}
+ raw_spin_unlock(&ctx->lock);
-unlock:
+ return 0;
+}
+
+static int perf_event_period(struct perf_event *event, u64 __user *arg)
+{
+ struct period_event pe = { .event = event, };
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task;
+ u64 value;
+
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ if (copy_from_user(&value, arg, sizeof(value)))
+ return -EFAULT;
+
+ if (!value)
+ return -EINVAL;
+
+ if (event->attr.freq && value > sysctl_perf_event_sample_rate)
+ return -EINVAL;
+
+ task = ctx->task;
+ pe.value = value;
+
+ if (!task) {
+ cpu_function_call(event->cpu, __perf_event_period, &pe);
+ return 0;
+ }
+
+retry:
+ if (!task_function_call(task, __perf_event_period, &pe))
+ return 0;
+
+ raw_spin_lock_irq(&ctx->lock);
+ if (ctx->is_active) {
+ raw_spin_unlock_irq(&ctx->lock);
+ task = ctx->task;
+ goto retry;
+ }
+
+ __perf_event_period(&pe);
raw_spin_unlock_irq(&ctx->lock);
- return ret;
+ return 0;
}
static const struct file_operations perf_fops;
@@ -4740,12 +4956,20 @@ static const struct file_operations perf_fops = {
* to user-space before waking everybody up.
*/
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+ /* only the parent has fasync state */
+ if (event->parent)
+ event = event->parent;
+ return &event->fasync;
+}
+
void perf_event_wakeup(struct perf_event *event)
{
ring_buffer_wakeup(event);
if (event->pending_kill) {
- kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+ kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
event->pending_kill = 0;
}
}
@@ -5151,9 +5375,15 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_RAW) {
if (data->raw) {
- perf_output_put(handle, data->raw->size);
- __output_copy(handle, data->raw->data,
- data->raw->size);
+ u32 raw_size = data->raw->size;
+ u32 real_size = round_up(raw_size + sizeof(u32),
+ sizeof(u64)) - sizeof(u32);
+ u64 zero = 0;
+
+ perf_output_put(handle, real_size);
+ __output_copy(handle, data->raw->data, raw_size);
+ if (real_size - raw_size)
+ __output_copy(handle, &zero, real_size - raw_size);
} else {
struct {
u32 size;
@@ -5285,8 +5515,7 @@ void perf_prepare_sample(struct perf_event_header *header,
else
size += sizeof(u32);
- WARN_ON_ONCE(size & (sizeof(u64)-1));
- header->size += size;
+ header->size += round_up(size, sizeof(u64));
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -5982,6 +6211,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
}
/*
+ * context_switch tracking
+ */
+
+struct perf_switch_event {
+ struct task_struct *task;
+ struct task_struct *next_prev;
+
+ struct {
+ struct perf_event_header header;
+ u32 next_prev_pid;
+ u32 next_prev_tid;
+ } event_id;
+};
+
+static int perf_event_switch_match(struct perf_event *event)
+{
+ return event->attr.context_switch;
+}
+
+static void perf_event_switch_output(struct perf_event *event, void *data)
+{
+ struct perf_switch_event *se = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ int ret;
+
+ if (!perf_event_switch_match(event))
+ return;
+
+ /* Only CPU-wide events are allowed to see next/prev pid/tid */
+ if (event->ctx->task) {
+ se->event_id.header.type = PERF_RECORD_SWITCH;
+ se->event_id.header.size = sizeof(se->event_id.header);
+ } else {
+ se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
+ se->event_id.header.size = sizeof(se->event_id);
+ se->event_id.next_prev_pid =
+ perf_event_pid(event, se->next_prev);
+ se->event_id.next_prev_tid =
+ perf_event_tid(event, se->next_prev);
+ }
+
+ perf_event_header__init_id(&se->event_id.header, &sample, event);
+
+ ret = perf_output_begin(&handle, event, se->event_id.header.size);
+ if (ret)
+ return;
+
+ if (event->ctx->task)
+ perf_output_put(&handle, se->event_id.header);
+ else
+ perf_output_put(&handle, se->event_id);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+}
+
+static void perf_event_switch(struct task_struct *task,
+ struct task_struct *next_prev, bool sched_in)
+{
+ struct perf_switch_event switch_event;
+
+ /* N.B. caller checks nr_switch_events != 0 */
+
+ switch_event = (struct perf_switch_event){
+ .task = task,
+ .next_prev = next_prev,
+ .event_id = {
+ .header = {
+ /* .type */
+ .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
+ /* .size */
+ },
+ /* .next_prev_pid */
+ /* .next_prev_tid */
+ },
+ };
+
+ perf_event_aux(perf_event_switch_output,
+ &switch_event,
+ NULL);
+}
+
+/*
* IRQ throttle logging
*/
@@ -6040,8 +6354,6 @@ static void perf_log_itrace_start(struct perf_event *event)
event->hw.itrace_started)
return;
- event->hw.itrace_started = 1;
-
rec.header.type = PERF_RECORD_ITRACE_START;
rec.header.misc = 0;
rec.header.size = sizeof(rec);
@@ -6124,7 +6436,7 @@ static int __perf_event_overflow(struct perf_event *event,
else
perf_event_output(event, data, regs);
- if (event->fasync && event->pending_kill) {
+ if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
irq_work_queue(&event->pending);
}
@@ -6749,8 +7061,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
if (event->tp_event->prog)
return -EEXIST;
- if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
- /* bpf programs can only be attached to kprobes */
+ if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
+ /* bpf programs can only be attached to u/kprobes */
return -EINVAL;
prog = bpf_prog_get(prog_fd);
@@ -7074,24 +7386,49 @@ static void perf_pmu_nop_void(struct pmu *pmu)
{
}
+static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
+{
+}
+
static int perf_pmu_nop_int(struct pmu *pmu)
{
return 0;
}
-static void perf_pmu_start_txn(struct pmu *pmu)
+static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
+
+static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
+ __this_cpu_write(nop_txn_flags, flags);
+
+ if (flags & ~PERF_PMU_TXN_ADD)
+ return;
+
perf_pmu_disable(pmu);
}
static int perf_pmu_commit_txn(struct pmu *pmu)
{
+ unsigned int flags = __this_cpu_read(nop_txn_flags);
+
+ __this_cpu_write(nop_txn_flags, 0);
+
+ if (flags & ~PERF_PMU_TXN_ADD)
+ return 0;
+
perf_pmu_enable(pmu);
return 0;
}
static void perf_pmu_cancel_txn(struct pmu *pmu)
{
+ unsigned int flags = __this_cpu_read(nop_txn_flags);
+
+ __this_cpu_write(nop_txn_flags, 0);
+
+ if (flags & ~PERF_PMU_TXN_ADD)
+ return;
+
perf_pmu_enable(pmu);
}
@@ -7330,7 +7667,7 @@ got_cpu_context:
pmu->commit_txn = perf_pmu_commit_txn;
pmu->cancel_txn = perf_pmu_cancel_txn;
} else {
- pmu->start_txn = perf_pmu_nop_void;
+ pmu->start_txn = perf_pmu_nop_txn;
pmu->commit_txn = perf_pmu_nop_int;
pmu->cancel_txn = perf_pmu_nop_void;
}
@@ -7418,7 +7755,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
return ret;
}
-struct pmu *perf_init_event(struct perf_event *event)
+static struct pmu *perf_init_event(struct perf_event *event)
{
struct pmu *pmu = NULL;
int idx;
@@ -7479,6 +7816,10 @@ static void account_event(struct perf_event *event)
if (atomic_inc_return(&nr_freq_events) == 1)
tick_nohz_full_kick_all();
}
+ if (event->attr.context_switch) {
+ atomic_inc(&nr_switch_events);
+ static_key_slow_inc(&perf_sched_events.key);
+ }
if (has_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
if (is_cgroup_event(event))
@@ -8100,13 +8441,35 @@ SYSCALL_DEFINE5(perf_event_open,
if (move_group) {
gctx = group_leader->ctx;
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ } else {
+ mutex_lock(&ctx->mutex);
+ }
+
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_locked;
+ }
+
+ /*
+ * Must be under the same ctx::mutex as perf_install_in_context(),
+ * because we need to serialize with concurrent event creation.
+ */
+ if (!exclusive_event_installable(event, ctx)) {
+ /* exclusive and group stuff are assumed mutually exclusive */
+ WARN_ON_ONCE(move_group);
+
+ err = -EBUSY;
+ goto err_locked;
+ }
+ WARN_ON_ONCE(ctx->parent_ctx);
+
+ if (move_group) {
/*
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
perf_remove_from_context(group_leader, false);
list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -8114,13 +8477,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_remove_from_context(sibling, false);
put_ctx(gctx);
}
- } else {
- mutex_lock(&ctx->mutex);
- }
- WARN_ON_ONCE(ctx->parent_ctx);
-
- if (move_group) {
/*
* Wait for everybody to stop referencing the events through
* the old lists, before installing it on new lists.
@@ -8152,22 +8509,29 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
- }
- if (!exclusive_event_installable(event, ctx)) {
- err = -EBUSY;
- mutex_unlock(&ctx->mutex);
- fput(event_file);
- goto err_context;
+ /*
+ * Now that all events are installed in @ctx, nothing
+ * references @gctx anymore, so drop the last reference we have
+ * on it.
+ */
+ put_ctx(gctx);
}
+ /*
+ * Precalculate sample_data sizes; do while holding ctx::mutex such
+ * that we're serialized against further additions and before
+ * perf_install_in_context() which is the point the event is active and
+ * can use these values.
+ */
+ perf_event__header_size(event);
+ perf_event__id_header_size(event);
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group) {
+ if (move_group)
mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- }
mutex_unlock(&ctx->mutex);
put_online_cpus();
@@ -8179,12 +8543,6 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&current->perf_event_mutex);
/*
- * Precalculate sample_data sizes
- */
- perf_event__header_size(event);
- perf_event__id_header_size(event);
-
- /*
* Drop the reference on the group_event after placing the
* new event on the sibling_list. This ensures destruction
* of the group leader will find the pointer to itself in
@@ -8194,6 +8552,12 @@ SYSCALL_DEFINE5(perf_event_open,
fd_install(event_fd, event_file);
return event_fd;
+err_locked:
+ if (move_group)
+ mutex_unlock(&gctx->mutex);
+ mutex_unlock(&ctx->mutex);
+/* err_file: */
+ fput(event_file);
err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
@@ -8574,6 +8938,31 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
+struct perf_event *perf_event_get(unsigned int fd)
+{
+ int err;
+ struct fd f;
+ struct perf_event *event;
+
+ err = perf_fget_light(fd, &f);
+ if (err)
+ return ERR_PTR(err);
+
+ event = f.file->private_data;
+ atomic_long_inc(&event->refcount);
+ fdput(f);
+
+ return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+ if (!event)
+ return ERR_PTR(-EINVAL);
+
+ return &event->attr;
+}
+
/*
* inherit a event from parent task to child task:
*/
@@ -8872,7 +9261,7 @@ static void perf_event_init_cpu(int cpu)
mutex_unlock(&swhash->hlist_mutex);
}
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
struct remove_event re = { .detach_group = true };
@@ -9071,25 +9460,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
task_function_call(task, __perf_cgroup_move, task);
}
-static void perf_cgroup_exit(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task)
-{
- /*
- * cgroup_exit() is called in the copy_process() failure path.
- * Ignore this case since the task hasn't ran yet, this avoids
- * trying to poke a half freed task state from generic code.
- */
- if (!(task->flags & PF_EXITING))
- return;
-
- task_function_call(task, __perf_cgroup_move, task);
-}
-
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
- .exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index b2be01b1aa9d..b5d1ea79c595 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -141,7 +141,7 @@ int perf_output_begin(struct perf_output_handle *handle,
perf_output_get_handle(handle);
do {
- tail = READ_ONCE_CTRL(rb->user_page->data_tail);
+ tail = READ_ONCE(rb->user_page->data_tail);
offset = head = local_read(&rb->head);
if (!rb->overwrite &&
unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
@@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order)
if (page && order) {
/*
- * Communicate the allocation size to the driver
+ * Communicate the allocation size to the driver:
+ * if we managed to secure a high-order allocation,
+ * set its first page's private to this order;
+ * !PagePrivate(page) means it's just a normal page.
*/
split_page(page, order);
SetPagePrivate(page);
@@ -559,11 +562,13 @@ static void __rb_free_aux(struct ring_buffer *rb)
rb->aux_priv = NULL;
}
- for (pg = 0; pg < rb->aux_nr_pages; pg++)
- rb_free_aux_page(rb, pg);
+ if (rb->aux_nr_pages) {
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
- kfree(rb->aux_pages);
- rb->aux_nr_pages = 0;
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+ }
}
void rb_free_aux(struct ring_buffer *rb)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f26a22d..4e5e9798aa0c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -86,15 +86,6 @@ struct uprobe {
struct arch_uprobe arch;
};
-struct return_instance {
- struct uprobe *uprobe;
- unsigned long func;
- unsigned long orig_ret_vaddr; /* original return address */
- bool chained; /* true, if instance is nested */
-
- struct return_instance *next; /* keep as stack */
-};
-
/*
* Execute out of line area: anonymous executable mapping installed
* by the probed task to execute the copy of the original instruction
@@ -105,17 +96,18 @@ struct return_instance {
* allocated.
*/
struct xol_area {
- wait_queue_head_t wq; /* if all slots are busy */
- atomic_t slot_count; /* number of in-use slots */
- unsigned long *bitmap; /* 0 = free slot */
- struct page *page;
+ wait_queue_head_t wq; /* if all slots are busy */
+ atomic_t slot_count; /* number of in-use slots */
+ unsigned long *bitmap; /* 0 = free slot */
+ struct vm_special_mapping xol_mapping;
+ struct page *pages[2];
/*
* We keep the vma's vm_start rather than a pointer to the vma
* itself. The probed process or a naughty kernel module could make
* the vma go away, and we must handle that reasonably gracefully.
*/
- unsigned long vaddr; /* Page(s) of instruction slots */
+ unsigned long vaddr; /* Page(s) of instruction slots */
};
/*
@@ -366,6 +358,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
}
+static struct uprobe *get_uprobe(struct uprobe *uprobe)
+{
+ atomic_inc(&uprobe->ref);
+ return uprobe;
+}
+
+static void put_uprobe(struct uprobe *uprobe)
+{
+ if (atomic_dec_and_test(&uprobe->ref))
+ kfree(uprobe);
+}
+
static int match_uprobe(struct uprobe *l, struct uprobe *r)
{
if (l->inode < r->inode)
@@ -393,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
while (n) {
uprobe = rb_entry(n, struct uprobe, rb_node);
match = match_uprobe(&u, uprobe);
- if (!match) {
- atomic_inc(&uprobe->ref);
- return uprobe;
- }
+ if (!match)
+ return get_uprobe(uprobe);
if (match < 0)
n = n->rb_left;
@@ -432,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
parent = *p;
u = rb_entry(parent, struct uprobe, rb_node);
match = match_uprobe(uprobe, u);
- if (!match) {
- atomic_inc(&u->ref);
- return u;
- }
+ if (!match)
+ return get_uprobe(u);
if (match < 0)
p = &parent->rb_left;
@@ -472,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
return u;
}
-static void put_uprobe(struct uprobe *uprobe)
-{
- if (atomic_dec_and_test(&uprobe->ref))
- kfree(uprobe);
-}
-
static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
{
struct uprobe *uprobe, *cur_uprobe;
@@ -1039,14 +1033,14 @@ static void build_probe_list(struct inode *inode,
if (u->inode != inode || u->offset < min)
break;
list_add(&u->pending_list, head);
- atomic_inc(&u->ref);
+ get_uprobe(u);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
list_add(&u->pending_list, head);
- atomic_inc(&u->ref);
+ get_uprobe(u);
}
}
spin_unlock(&uprobes_treelock);
@@ -1132,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
/* Slot allocation for XOL */
static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{
- int ret = -EALREADY;
+ struct vm_area_struct *vma;
+ int ret;
down_write(&mm->mmap_sem);
- if (mm->uprobes_state.xol_area)
+ if (mm->uprobes_state.xol_area) {
+ ret = -EALREADY;
goto fail;
+ }
if (!area->vaddr) {
/* Try to map as high as possible, this is only a hint. */
@@ -1148,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
}
}
- ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
- VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page);
- if (ret)
+ vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
+ VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
+ &area->xol_mapping);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
goto fail;
+ }
+ ret = 0;
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
fail:
@@ -1175,21 +1176,24 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->page = alloc_page(GFP_HIGHUSER);
- if (!area->page)
+ area->xol_mapping.name = "[uprobes]";
+ area->xol_mapping.pages = area->pages;
+ area->pages[0] = alloc_page(GFP_HIGHUSER);
+ if (!area->pages[0])
goto free_bitmap;
+ area->pages[1] = NULL;
area->vaddr = vaddr;
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
atomic_set(&area->slot_count, 1);
- copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
if (!xol_add_vma(mm, area))
return area;
- __free_page(area->page);
+ __free_page(area->pages[0]);
free_bitmap:
kfree(area->bitmap);
free_area:
@@ -1227,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm)
if (!area)
return;
- put_page(area->page);
+ put_page(area->pages[0]);
kfree(area->bitmap);
kfree(area);
}
@@ -1296,7 +1300,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
if (unlikely(!xol_vaddr))
return 0;
- arch_uprobe_copy_ixol(area->page, xol_vaddr,
+ arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
&uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
return xol_vaddr;
@@ -1333,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
clear_bit(slot_nr, area->bitmap);
atomic_dec(&area->slot_count);
+ smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
if (waitqueue_active(&area->wq))
wake_up(&area->wq);
@@ -1376,6 +1381,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs);
}
+static struct return_instance *free_ret_instance(struct return_instance *ri)
+{
+ struct return_instance *next = ri->next;
+ put_uprobe(ri->uprobe);
+ kfree(ri);
+ return next;
+}
+
/*
* Called with no locks held.
* Called in context of a exiting or a exec-ing thread.
@@ -1383,7 +1396,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
- struct return_instance *ri, *tmp;
+ struct return_instance *ri;
if (!utask)
return;
@@ -1392,13 +1405,8 @@ void uprobe_free_utask(struct task_struct *t)
put_uprobe(utask->active_uprobe);
ri = utask->return_instances;
- while (ri) {
- tmp = ri;
- ri = ri->next;
-
- put_uprobe(tmp->uprobe);
- kfree(tmp);
- }
+ while (ri)
+ ri = free_ret_instance(ri);
xol_free_insn_slot(t);
kfree(utask);
@@ -1437,7 +1445,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
return -ENOMEM;
*n = *o;
- atomic_inc(&n->uprobe->ref);
+ get_uprobe(n->uprobe);
n->next = NULL;
*p = n;
@@ -1515,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void)
return trampoline_vaddr;
}
+static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
+ struct pt_regs *regs)
+{
+ struct return_instance *ri = utask->return_instances;
+ enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
+
+ while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
+ ri = free_ret_instance(ri);
+ utask->depth--;
+ }
+ utask->return_instances = ri;
+}
+
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
{
struct return_instance *ri;
struct uprobe_task *utask;
unsigned long orig_ret_vaddr, trampoline_vaddr;
- bool chained = false;
+ bool chained;
if (!get_xol_area())
return;
@@ -1536,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
return;
}
- ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
- goto fail;
+ return;
trampoline_vaddr = get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
if (orig_ret_vaddr == -1)
goto fail;
+ /* drop the entries invalidated by longjmp() */
+ chained = (orig_ret_vaddr == trampoline_vaddr);
+ cleanup_return_instances(utask, chained, regs);
+
/*
* We don't want to keep trampoline address in stack, rather keep the
* original return address of first caller thru all the consequent
* instances. This also makes breakpoint unwrapping easier.
*/
- if (orig_ret_vaddr == trampoline_vaddr) {
+ if (chained) {
if (!utask->return_instances) {
/*
* This situation is not possible. Likely we have an
* attack from user-space.
*/
- pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
- current->pid, current->tgid);
+ uprobe_warn(current, "handle tail call");
goto fail;
}
-
- chained = true;
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
- atomic_inc(&uprobe->ref);
- ri->uprobe = uprobe;
+ ri->uprobe = get_uprobe(uprobe);
ri->func = instruction_pointer(regs);
+ ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
ri->chained = chained;
utask->depth++;
-
- /* add instance to the stack */
ri->next = utask->return_instances;
utask->return_instances = ri;
return;
-
fail:
kfree(ri);
}
@@ -1766,46 +1785,58 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
up_read(&uprobe->register_rwsem);
}
-static bool handle_trampoline(struct pt_regs *regs)
+static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
- struct uprobe_task *utask;
- struct return_instance *ri, *tmp;
bool chained;
+ do {
+ chained = ri->chained;
+ ri = ri->next; /* can't be NULL if chained */
+ } while (chained);
+
+ return ri;
+}
+
+static void handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *next;
+ bool valid;
+
utask = current->utask;
if (!utask)
- return false;
+ goto sigill;
ri = utask->return_instances;
if (!ri)
- return false;
-
- /*
- * TODO: we should throw out return_instance's invalidated by
- * longjmp(), currently we assume that the probed function always
- * returns.
- */
- instruction_pointer_set(regs, ri->orig_ret_vaddr);
-
- for (;;) {
- handle_uretprobe_chain(ri, regs);
-
- chained = ri->chained;
- put_uprobe(ri->uprobe);
-
- tmp = ri;
- ri = ri->next;
- kfree(tmp);
- utask->depth--;
+ goto sigill;
- if (!chained)
- break;
- BUG_ON(!ri);
- }
+ do {
+ /*
+ * We should throw out the frames invalidated by longjmp().
+ * If this chain is valid, then the next one should be alive
+ * or NULL; the latter case means that nobody but ri->func
+ * could hit this trampoline on return. TODO: sigaltstack().
+ */
+ next = find_next_ret_chain(ri);
+ valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
+
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+ do {
+ if (valid)
+ handle_uretprobe_chain(ri, regs);
+ ri = free_ret_instance(ri);
+ utask->depth--;
+ } while (ri != next);
+ } while (!valid);
utask->return_instances = ri;
+ return;
+
+ sigill:
+ uprobe_warn(current, "handle uretprobe, sending SIGILL.");
+ force_sig_info(SIGILL, SEND_SIG_FORCED, current);
- return true;
}
bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
@@ -1813,6 +1844,12 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
return false;
}
+bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+ struct pt_regs *regs)
+{
+ return true;
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1824,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs)
int uninitialized_var(is_swbp);
bp_vaddr = uprobe_get_swbp_addr(regs);
- if (bp_vaddr == get_trampoline_vaddr()) {
- if (handle_trampoline(regs))
- return;
-
- pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
- current->pid, current->tgid);
- }
+ if (bp_vaddr == get_trampoline_vaddr())
+ return handle_trampoline(regs);
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
diff --git a/kernel/exit.c b/kernel/exit.c
index 031325e9acf9..07110c6020a0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -706,10 +706,12 @@ void do_exit(long code)
smp_mb();
raw_spin_unlock_wait(&tsk->pi_lock);
- if (unlikely(in_atomic()))
+ if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
@@ -761,7 +763,9 @@ void do_exit(long code)
*/
flush_ptrace_hw_breakpoint(tsk);
+ TASKS_RCU(preempt_disable());
TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
+ TASKS_RCU(preempt_enable());
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
#ifdef CONFIG_NUMA
@@ -1471,7 +1475,7 @@ static long do_wait(struct wait_opts *wo)
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
repeat:
/*
- * If there is nothing that can match our critiera just get out.
+ * If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
* might later match our criteria, even if we are not able to reap
* it yet.
diff --git a/kernel/extable.c b/kernel/extable.c
index c98f926277a8..e820ccee9846 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,7 +18,6 @@
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/module.h>
-#include <linux/ftrace.h>
#include <linux/mutex.h>
#include <linux/init.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index dbd9b8d7b7cc..f97f2c449f5c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ cgroup_free(tsk);
task_numa_free(tsk);
security_task_free(tsk);
exit_creds(tsk);
@@ -454,8 +455,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_flags &=
+ ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
tmp->vm_next = tmp->vm_prev = NULL;
+ tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
@@ -1072,6 +1075,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
rcu_assign_pointer(tsk->sighand, sig);
if (!sig)
return -ENOMEM;
+
atomic_set(&sig->count, 1);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
return 0;
@@ -1099,7 +1103,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
if (cpu_limit != RLIM_INFINITY) {
sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
- sig->cputimer.running = 1;
+ sig->cputimer.running = true;
}
/* The timer lists. */
@@ -1133,6 +1137,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
seqlock_init(&sig->stats_lock);
+ prev_cputime_init(&sig->prev_cputime);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
@@ -1244,6 +1249,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
+ void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1278,10 +1284,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/*
* If the new process will be in a different pid or user namespace
- * do not allow it to share a thread group or signal handlers or
- * parent with the forking task.
+ * do not allow it to share a thread group with the forking task.
*/
- if (clone_flags & CLONE_SIGHAND) {
+ if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
@@ -1340,9 +1345,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+ prev_cputime_init(&p->prev_cputime);
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_init(&p->vtime_seqlock);
p->vtime_snap = 0;
@@ -1518,6 +1522,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->task_works = NULL;
/*
+ * Ensure that the cgroup subsystem policies allow the new process to be
+ * forked. It should be noted that the new process's css_set can be changed
+ * between here and cgroup_post_fork() if an organisation operation is in
+ * progress.
+ */
+ retval = cgroup_can_fork(p, cgrp_ss_priv);
+ if (retval)
+ goto bad_fork_free_pid;
+
+ /*
* Make it visible to the rest of the system, but dont wake it up yet.
* Need tasklist lock for parent etc handling!
*/
@@ -1553,7 +1567,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
- goto bad_fork_free_pid;
+ goto bad_fork_cancel_cgroup;
}
if (likely(p->pid)) {
@@ -1595,7 +1609,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
- cgroup_post_fork(p);
+ cgroup_post_fork(p, cgrp_ss_priv);
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
perf_event_fork(p);
@@ -1605,6 +1619,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return p;
+bad_fork_cancel_cgroup:
+ cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -1871,13 +1887,21 @@ static int check_unshare_flags(unsigned long unshare_flags)
CLONE_NEWUSER|CLONE_NEWPID))
return -EINVAL;
/*
- * Not implemented, but pretend it works if there is nothing to
- * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
- * needs to unshare vm.
+ * Not implemented, but pretend it works if there is nothing
+ * to unshare. Note that unsharing the address space or the
+ * signal handlers also needs to unshare the signal queues (aka
+ * CLONE_THREAD).
*/
if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
- /* FIXME: get_task_mm() increments ->mm_users */
- if (atomic_read(&current->mm->mm_users) > 1)
+ if (!thread_group_empty(current))
+ return -EINVAL;
+ }
+ if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
+ if (atomic_read(&current->sighand->count) > 1)
+ return -EINVAL;
+ }
+ if (unshare_flags & CLONE_VM) {
+ if (!current_is_single_threaded())
return -EINVAL;
}
@@ -1941,21 +1965,22 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
int err;
/*
+ * If unsharing a user namespace, must also unshare the thread group
+ * If unsharing a user namespace must also unshare the thread group
+ * and unshare the filesystem root and working directories.
*/
if (unshare_flags & CLONE_NEWUSER)
unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
- * If unsharing a thread from a thread group, must also unshare vm.
- */
- if (unshare_flags & CLONE_THREAD)
- unshare_flags |= CLONE_VM;
- /*
* If unsharing vm, must also unshare signal handlers.
*/
if (unshare_flags & CLONE_VM)
unshare_flags |= CLONE_SIGHAND;
/*
+ * If unsharing signal handlers, must also unshare the signal queues.
+ */
+ if (unshare_flags & CLONE_SIGHAND)
+ unshare_flags |= CLONE_THREAD;
+ /*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
diff --git a/kernel/futex.c b/kernel/futex.c
index c4a182f5357e..684d7549825a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -64,6 +64,7 @@
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
+#include <linux/fault-inject.h>
#include <asm/futex.h>
@@ -254,9 +255,78 @@ struct futex_hash_bucket {
struct plist_head chain;
} ____cacheline_aligned_in_smp;
-static unsigned long __read_mostly futex_hashsize;
+/*
+ * The base of the bucket array and its size are always used together
+ * (after initialization only in hash_futex()), so ensure that they
+ * reside in the same cacheline.
+ */
+static struct {
+ struct futex_hash_bucket *queues;
+ unsigned long hashsize;
+} __futex_data __read_mostly __aligned(2*sizeof(long));
+#define futex_queues (__futex_data.queues)
+#define futex_hashsize (__futex_data.hashsize)
+
+
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+ struct fault_attr attr;
-static struct futex_hash_bucket *futex_queues;
+ bool ignore_private;
+} fail_futex = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_private = false,
+};
+
+static int __init setup_fail_futex(char *str)
+{
+ return setup_fault_attr(&fail_futex.attr, str);
+}
+__setup("fail_futex=", setup_fail_futex);
+
+static bool should_fail_futex(bool fshared)
+{
+ if (fail_futex.ignore_private && !fshared)
+ return false;
+
+ return should_fail(&fail_futex.attr, 1);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)
+{
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_futex", NULL,
+ &fail_futex.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ if (!debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private)) {
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else
+static inline bool should_fail_futex(bool fshared)
+{
+ return false;
+}
+#endif /* CONFIG_FAIL_FUTEX */
static inline void futex_get_mm(union futex_key *key)
{
@@ -413,6 +483,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
return -EFAULT;
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -428,6 +501,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
}
again:
+ /* Ignore any VERIFY_READ mapping (futex common case) */
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
err = get_user_pages_fast(address, 1, 1, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
@@ -516,7 +593,7 @@ again:
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
*/
- if (ro) {
+ if (unlikely(should_fail_futex(fshared)) || ro) {
err = -EFAULT;
goto out;
}
@@ -974,6 +1051,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
u32 uninitialized_var(curval);
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
@@ -1015,12 +1095,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
if (get_futex_value_locked(&uval, uaddr))
return -EFAULT;
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
/*
* Detect deadlocks.
*/
if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
+ if ((unlikely(should_fail_futex(true))))
+ return -EDEADLK;
+
/*
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
@@ -1155,6 +1241,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+ if (unlikely(should_fail_futex(true)))
+ ret = -EFAULT;
+
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
ret = -EFAULT;
else if (curval != uval)
@@ -1457,6 +1546,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
/*
* Find the top_waiter and determine if there are additional waiters.
* If the caller intends to requeue more than 1 waiter to pifutex,
@@ -2268,8 +2360,11 @@ static long futex_wait_restart(struct restart_block *restart)
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block, it does PI, etc. (Due to
- * races the kernel might see a 0 value of the futex too.)
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.)
+ *
+ * Also serves as the futex trylock_pi() path, with the corresponding semantics.
*/
static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
@@ -2300,6 +2395,10 @@ retry_private:
ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
if (unlikely(ret)) {
+ /*
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
+ */
switch (ret) {
case 1:
/* We got the lock. */
@@ -2530,7 +2629,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initially wait on (non-pi)
* @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- * the same type, no requeueing from private to shared, etc.
+ * the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
@@ -3005,6 +3104,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+ return -EFAULT;
if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
if (!timespec_valid(&ts))
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 9a76e3beda54..3b48dab80164 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -30,6 +30,10 @@ config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
config GENERIC_PENDING_IRQ
bool
+# Support for generic migration of irqs off a cpu before the cpu goes offline.
+config GENERIC_IRQ_MIGRATION
+ bool
+
# Alpha specific irq affinity mechanism
config AUTO_IRQ_AFFINITY
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index d12123526e2b..2fc9cbdf35b6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -5,5 +5,6 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
+obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 27f4332c7f84..15206453b12a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -21,6 +21,20 @@
#include "internals.h"
+static irqreturn_t bad_chained_irq(int irq, void *dev_id)
+{
+ WARN_ONCE(1, "Chained irq %d should not call an action\n", irq);
+ return IRQ_NONE;
+}
+
+/*
+ * Chained handlers should never call an action on their IRQ. This default
+ * action will emit a warning if such a thing happens.
+ */
+struct irqaction chained_action = {
+ .handler = bad_chained_irq,
+};
+
/**
* irq_set_chip - set the irq chip for an irq
* @irq: irq number
@@ -63,7 +77,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
return -EINVAL;
type &= IRQ_TYPE_SENSE_MASK;
- ret = __irq_set_trigger(desc, irq, type);
+ ret = __irq_set_trigger(desc, type);
irq_put_desc_busunlock(desc, flags);
return ret;
}
@@ -83,7 +97,7 @@ int irq_set_handler_data(unsigned int irq, void *data)
if (!desc)
return -EINVAL;
- desc->irq_data.handler_data = data;
+ desc->irq_common_data.handler_data = data;
irq_put_desc_unlock(desc, flags);
return 0;
}
@@ -105,7 +119,7 @@ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
if (!desc)
return -EINVAL;
- desc->irq_data.msi_desc = entry;
+ desc->irq_common_data.msi_desc = entry;
if (entry && !irq_offset)
entry->irq = irq_base;
irq_put_desc_unlock(desc, flags);
@@ -187,7 +201,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
irq_enable(desc);
}
if (resend)
- check_irq_resend(desc, desc->irq_data.irq);
+ check_irq_resend(desc);
return ret;
}
@@ -227,6 +241,13 @@ void irq_enable(struct irq_desc *desc)
* disabled. If an interrupt happens, then the interrupt flow
* handler masks the line at the hardware level and marks it
* pending.
+ *
+ * If the interrupt chip does not implement the irq_disable callback,
+ * a driver can disable the lazy approach for a particular irq line by
+ * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can
+ * be used for devices which cannot disable the interrupt at the
+ * device level under certain circumstances and have to use
+ * disable_irq[_nosync] instead.
*/
void irq_disable(struct irq_desc *desc)
{
@@ -234,6 +255,8 @@ void irq_disable(struct irq_desc *desc)
if (desc->irq_data.chip->irq_disable) {
desc->irq_data.chip->irq_disable(&desc->irq_data);
irq_state_set_masked(desc);
+ } else if (irq_settings_disable_unlazy(desc)) {
+ mask_irq(desc);
}
}
@@ -315,7 +338,7 @@ void handle_nested_irq(unsigned int irq)
raw_spin_lock_irq(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
action = desc->action;
if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
@@ -328,7 +351,7 @@ void handle_nested_irq(unsigned int irq)
action_ret = action->thread_fn(action->irq, action->dev_id);
if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ note_interrupt(desc, action_ret);
raw_spin_lock_irq(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -372,7 +395,6 @@ static bool irq_may_run(struct irq_desc *desc)
/**
* handle_simple_irq - Simple and software-decoded IRQs.
- * @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Simple interrupts are either sent from a demultiplexing interrupt
@@ -382,8 +404,7 @@ static bool irq_may_run(struct irq_desc *desc)
* Note: The caller is expected to handle the ack, clear, mask and
* unmask issues if necessary.
*/
-void
-handle_simple_irq(unsigned int irq, struct irq_desc *desc)
+void handle_simple_irq(struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
@@ -391,7 +412,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
desc->istate |= IRQS_PENDING;
@@ -425,7 +446,6 @@ static void cond_unmask_irq(struct irq_desc *desc)
/**
* handle_level_irq - Level type irq handler
- * @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Level type interrupts are active as long as the hardware line has
@@ -433,8 +453,7 @@ static void cond_unmask_irq(struct irq_desc *desc)
* it after the associated handler has acknowledged the device, so the
* interrupt line is back to inactive.
*/
-void
-handle_level_irq(unsigned int irq, struct irq_desc *desc)
+void handle_level_irq(struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);
@@ -443,7 +462,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/*
* If its disabled or no action available
@@ -496,7 +515,6 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
/**
* handle_fasteoi_irq - irq handler for transparent controllers
- * @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Only a single callback will be issued to the chip: an ->eoi()
@@ -504,8 +522,7 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
* for modern forms of interrupt handlers, which handle the flow
* details in hardware, transparently.
*/
-void
-handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
+void handle_fasteoi_irq(struct irq_desc *desc)
{
struct irq_chip *chip = desc->irq_data.chip;
@@ -515,7 +532,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/*
* If its disabled or no action available
@@ -546,7 +563,6 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
/**
* handle_edge_irq - edge type IRQ handler
- * @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Interrupt occures on the falling and/or rising edge of a hardware
@@ -560,8 +576,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
* the handler was running. If all pending interrupts are handled, the
* loop is left.
*/
-void
-handle_edge_irq(unsigned int irq, struct irq_desc *desc)
+void handle_edge_irq(struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
@@ -583,7 +598,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
goto out_unlock;
}
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
@@ -618,13 +633,12 @@ EXPORT_SYMBOL(handle_edge_irq);
#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
/**
* handle_edge_eoi_irq - edge eoi type IRQ handler
- * @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Similar as the above handle_edge_irq, but using eoi and w/o the
* mask/unmask logic.
*/
-void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
+void handle_edge_eoi_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
@@ -646,7 +660,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
goto out_eoi;
}
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
do {
if (unlikely(!desc->action))
@@ -665,22 +679,20 @@ out_eoi:
/**
* handle_percpu_irq - Per CPU local irq handler
- * @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Per CPU interrupts on SMP machines without locking requirements
*/
-void
-handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
+void handle_percpu_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
- handle_irq_event_percpu(desc, desc->action);
+ handle_irq_event_percpu(desc);
if (chip->irq_eoi)
chip->irq_eoi(&desc->irq_data);
@@ -688,7 +700,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
/**
* handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
- * @irq: the interrupt number
* @desc: the interrupt description structure for this irq
*
* Per CPU interrupts on SMP machines without locking requirements. Same as
@@ -698,14 +709,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
* contain the real device id for the cpu on which this handler is
* called
*/
-void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
+void handle_percpu_devid_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
struct irqaction *action = desc->action;
void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
+ unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -757,6 +769,8 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
if (desc->irq_data.chip != &no_irq_chip)
mask_ack_irq(desc);
irq_state_set_disabled(desc);
+ if (is_chained)
+ desc->action = NULL;
desc->depth = 1;
}
desc->handle_irq = handle;
@@ -766,6 +780,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
irq_settings_set_noprobe(desc);
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
+ desc->action = &chained_action;
irq_startup(desc, true);
}
}
@@ -796,7 +811,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
return;
__irq_do_set_handler(desc, handle, 1, NULL);
- desc->irq_data.handler_data = data;
+ desc->irq_common_data.handler_data = data;
irq_put_desc_busunlock(desc, flags);
}
@@ -985,6 +1000,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
}
/**
+ * irq_chip_set_type_parent - Set IRQ type on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
+ *
+ * Conditional, as the underlying parent chip might not implement it.
+ */
+int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
+{
+ data = data->parent_data;
+
+ if (data->chip->irq_set_type)
+ return data->chip->irq_set_type(data, type);
+
+ return -ENOSYS;
+}
+
+/**
* irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
* @data: Pointer to interrupt specific data
*
@@ -997,13 +1029,13 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
if (data->chip && data->chip->irq_retrigger)
return data->chip->irq_retrigger(data);
- return -ENOSYS;
+ return 0;
}
/**
* irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
* @data: Pointer to interrupt specific data
- * @dest: The vcpu affinity information
+ * @vcpu_info: The vcpu affinity information
*/
int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
{
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
new file mode 100644
index 000000000000..011f8c4c63da
--- /dev/null
+++ b/kernel/irq/cpuhotplug.c
@@ -0,0 +1,82 @@
+/*
+ * Generic cpu hotunplug interrupt migration code copied from the
+ * arch/arm implementation
+ *
+ * Copyright (C) Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/interrupt.h>
+#include <linux/ratelimit.h>
+#include <linux/irq.h>
+
+#include "internals.h"
+
+static bool migrate_one_irq(struct irq_desc *desc)
+{
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ const struct cpumask *affinity = d->common->affinity;
+ struct irq_chip *c;
+ bool ret = false;
+
+ /*
+ * If this is a per-CPU interrupt, or the affinity does not
+ * include this CPU, then we have nothing to do.
+ */
+ if (irqd_is_per_cpu(d) ||
+ !cpumask_test_cpu(smp_processor_id(), affinity))
+ return false;
+
+ if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+ affinity = cpu_online_mask;
+ ret = true;
+ }
+
+ c = irq_data_get_irq_chip(d);
+ if (!c->irq_set_affinity) {
+ pr_debug("IRQ%u: unable to set affinity\n", d->irq);
+ } else {
+ int r = irq_do_set_affinity(d, affinity, false);
+ if (r)
+ pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
+ d->irq, r);
+ }
+
+ return ret;
+}
+
+/**
+ * irq_migrate_all_off_this_cpu - Migrate irqs away from offline cpu
+ *
+ * The current CPU has been marked offline. Migrate IRQs off this CPU.
+ * If the affinity settings do not allow other CPUs, force them onto any
+ * available CPU.
+ *
+ * Note: we must iterate over all IRQs, whether they have an attached
+ * action structure or not, as we need to get chained interrupts too.
+ */
+void irq_migrate_all_off_this_cpu(void)
+{
+ unsigned int irq;
+ struct irq_desc *desc;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ for_each_active_irq(irq) {
+ bool affinity_broken;
+
+ desc = irq_to_desc(irq);
+ raw_spin_lock(&desc->lock);
+ affinity_broken = migrate_one_irq(desc);
+ raw_spin_unlock(&desc->lock);
+
+ if (affinity_broken)
+ pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n",
+ irq, smp_processor_id());
+ }
+
+ local_irq_restore(flags);
+}
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 15b370daf234..abd286afbd27 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -553,6 +553,9 @@ static int irq_gc_suspend(void)
if (data)
ct->chip.irq_suspend(data);
}
+
+ if (gc->suspend)
+ gc->suspend(gc);
}
return 0;
}
@@ -564,6 +567,9 @@ static void irq_gc_resume(void)
list_for_each_entry(gc, &gc_list, list) {
struct irq_chip_type *ct = gc->chip_types;
+ if (gc->resume)
+ gc->resume(gc);
+
if (ct->chip.irq_resume) {
struct irq_data *data = irq_gc_get_irq_data(gc);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 635480270858..a302cf9a2126 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,17 +22,19 @@
/**
* handle_bad_irq - handle spurious and unhandled irqs
- * @irq: the interrupt number
* @desc: description of the interrupt
*
* Handles spurious and unhandled IRQ's. It also prints a debugmessage.
*/
-void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
+void handle_bad_irq(struct irq_desc *desc)
{
+ unsigned int irq = irq_desc_get_irq(desc);
+
print_irq_desc(irq, desc);
- kstat_incr_irqs_this_cpu(irq, desc);
+ kstat_incr_irqs_this_cpu(desc);
ack_bad_irq(irq);
}
+EXPORT_SYMBOL_GPL(handle_bad_irq);
/*
* Special, empty irq handler:
@@ -130,11 +132,11 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
-irqreturn_t
-handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
unsigned int flags = 0, irq = desc->irq_data.irq;
+ struct irqaction *action = desc->action;
do {
irqreturn_t res;
@@ -176,20 +178,19 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
add_interrupt_randomness(irq, flags);
if (!noirqdebug)
- note_interrupt(irq, desc, retval);
+ note_interrupt(desc, retval);
return retval;
}
irqreturn_t handle_irq_event(struct irq_desc *desc)
{
- struct irqaction *action = desc->action;
irqreturn_t ret;
desc->istate &= ~IRQS_PENDING;
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
- ret = handle_irq_event_percpu(desc, action);
+ ret = handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 61008b8433ab..05c2188271b8 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,6 +18,8 @@
extern bool noirqdebug;
+extern struct irqaction chained_action;
+
/*
* Bits used by threaded handlers:
* IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
@@ -59,10 +61,9 @@ enum {
#include "debug.h"
#include "settings.h"
-extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
- unsigned long flags);
-extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
-extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
+extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags);
+extern void __disable_irq(struct irq_desc *desc);
+extern void __enable_irq(struct irq_desc *desc);
extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
@@ -82,11 +83,11 @@ extern void irq_mark_irq(unsigned int irq);
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
/* Resending of interrupts :*/
-void check_irq_resend(struct irq_desc *desc, unsigned int irq);
+void check_irq_resend(struct irq_desc *desc);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
@@ -187,7 +188,7 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
return __irqd_to_state(d) & mask;
}
-static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc)
+static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
@@ -195,7 +196,7 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
static inline int irq_desc_get_node(struct irq_desc *desc)
{
- return irq_data_get_node(&desc->irq_data);
+ return irq_common_data_get_node(&desc->irq_common_data);
}
#ifdef CONFIG_PM_SLEEP
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4afc457613dd..239e2ae2c947 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -38,12 +38,13 @@ static void __init init_irq_default_affinity(void)
#ifdef CONFIG_SMP
static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
{
- if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
+ if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity,
+ gfp, node))
return -ENOMEM;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
- free_cpumask_var(desc->irq_data.affinity);
+ free_cpumask_var(desc->irq_common_data.affinity);
return -ENOMEM;
}
#endif
@@ -52,11 +53,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
static void desc_smp_init(struct irq_desc *desc, int node)
{
- desc->irq_data.node = node;
- cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
+ cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity);
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_clear(desc->pending_mask);
#endif
+#ifdef CONFIG_NUMA
+ desc->irq_common_data.node = node;
+#endif
}
#else
@@ -70,12 +73,13 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
{
int cpu;
+ desc->irq_common_data.handler_data = NULL;
+ desc->irq_common_data.msi_desc = NULL;
+
desc->irq_data.common = &desc->irq_common_data;
desc->irq_data.irq = irq;
desc->irq_data.chip = &no_irq_chip;
desc->irq_data.chip_data = NULL;
- desc->irq_data.handler_data = NULL;
- desc->irq_data.msi_desc = NULL;
irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
desc->handle_irq = handle_bad_irq;
@@ -121,7 +125,7 @@ static void free_masks(struct irq_desc *desc)
#ifdef CONFIG_GENERIC_PENDING_IRQ
free_cpumask_var(desc->pending_mask);
#endif
- free_cpumask_var(desc->irq_data.affinity);
+ free_cpumask_var(desc->irq_common_data.affinity);
}
#else
static inline void free_masks(struct irq_desc *desc) { }
@@ -343,7 +347,7 @@ int generic_handle_irq(unsigned int irq)
if (!desc)
return -EINVAL;
- generic_handle_irq_desc(irq, desc);
+ generic_handle_irq_desc(desc);
return 0;
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
@@ -582,7 +586,7 @@ int irq_set_percpu_devid(unsigned int irq)
void kstat_incr_irq_this_cpu(unsigned int irq)
{
- kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
+ kstat_incr_irqs_this_cpu(irq_to_desc(irq));
}
/**
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c3577fef78c..22aa9612ef7c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -27,6 +27,57 @@ static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
irq_hw_number_t hwirq, int node);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
+struct irqchip_fwid {
+ struct fwnode_handle fwnode;
+ char *name;
+ void *data;
+};
+
+/**
+ * irq_domain_alloc_fwnode - Allocate a fwnode_handle suitable for
+ * identifying an irq domain
+ * @data: optional user-provided data
+ *
+ * Allocate a struct irqchip_fwid, and return a pointer to the embedded
+ * fwnode_handle (or NULL on failure).
+ */
+struct fwnode_handle *irq_domain_alloc_fwnode(void *data)
+{
+ struct irqchip_fwid *fwid;
+ char *name;
+
+ fwid = kzalloc(sizeof(*fwid), GFP_KERNEL);
+ name = kasprintf(GFP_KERNEL, "irqchip@%p", data);
+
+ if (!fwid || !name) {
+ kfree(fwid);
+ kfree(name);
+ return NULL;
+ }
+
+ fwid->name = name;
+ fwid->data = data;
+ fwid->fwnode.type = FWNODE_IRQCHIP;
+ return &fwid->fwnode;
+}
+
+/**
+ * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle
+ *
+ * Free a fwnode_handle allocated with irq_domain_alloc_fwnode.
+ */
+void irq_domain_free_fwnode(struct fwnode_handle *fwnode)
+{
+ struct irqchip_fwid *fwid;
+
+ if (WARN_ON(fwnode->type != FWNODE_IRQCHIP))
+ return;
+
+ fwid = container_of(fwnode, struct irqchip_fwid, fwnode);
+ kfree(fwid->name);
+ kfree(fwid);
+}
+
/**
* __irq_domain_add() - Allocate a new irq_domain data structure
* @of_node: optional device-tree node of the interrupt controller
@@ -40,23 +91,28 @@ static void irq_domain_check_hierarchy(struct irq_domain *domain);
* Allocates and initialize and irq_domain structure.
* Returns pointer to IRQ domain, or NULL on failure.
*/
-struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
irq_hw_number_t hwirq_max, int direct_max,
const struct irq_domain_ops *ops,
void *host_data)
{
struct irq_domain *domain;
+ struct device_node *of_node;
+
+ of_node = to_of_node(fwnode);
domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
GFP_KERNEL, of_node_to_nid(of_node));
if (WARN_ON(!domain))
return NULL;
+ of_node_get(of_node);
+
/* Fill structure */
INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
domain->ops = ops;
domain->host_data = host_data;
- domain->of_node = of_node_get(of_node);
+ domain->fwnode = fwnode;
domain->hwirq_max = hwirq_max;
domain->revmap_size = size;
domain->revmap_direct_max_irq = direct_max;
@@ -102,7 +158,7 @@ void irq_domain_remove(struct irq_domain *domain)
pr_debug("Removed domain %s\n", domain->name);
- of_node_put(domain->of_node);
+ of_node_put(irq_domain_get_of_node(domain));
kfree(domain);
}
EXPORT_SYMBOL_GPL(irq_domain_remove);
@@ -133,7 +189,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
{
struct irq_domain *domain;
- domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
+ domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data);
if (!domain)
return NULL;
@@ -177,7 +233,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
{
struct irq_domain *domain;
- domain = __irq_domain_add(of_node, first_hwirq + size,
+ domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size,
first_hwirq + size, 0, ops, host_data);
if (domain)
irq_domain_associate_many(domain, first_irq, first_hwirq, size);
@@ -187,10 +243,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
/**
- * irq_find_host() - Locates a domain for a given device node
- * @node: device-tree node of the interrupt controller
+ * irq_find_matching_fwnode() - Locates a domain for a given fwnode
+ * @fwnode: FW descriptor of the interrupt controller
+ * @bus_token: domain-specific data
*/
-struct irq_domain *irq_find_host(struct device_node *node)
+struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
+ enum irq_domain_bus_token bus_token)
{
struct irq_domain *h, *found = NULL;
int rc;
@@ -199,13 +257,19 @@ struct irq_domain *irq_find_host(struct device_node *node)
* it might potentially be set to match all interrupts in
* the absence of a device node. This isn't a problem so far
* yet though...
+ *
+ * bus_token == DOMAIN_BUS_ANY matches any domain, any other
+ * values must generate an exact match for the domain to be
+ * selected.
*/
mutex_lock(&irq_domain_mutex);
list_for_each_entry(h, &irq_domain_list, link) {
if (h->ops->match)
- rc = h->ops->match(h, node);
+ rc = h->ops->match(h, to_of_node(fwnode), bus_token);
else
- rc = (h->of_node != NULL) && (h->of_node == node);
+ rc = ((fwnode != NULL) && (h->fwnode == fwnode) &&
+ ((bus_token == DOMAIN_BUS_ANY) ||
+ (h->bus_token == bus_token)));
if (rc) {
found = h;
@@ -215,7 +279,7 @@ struct irq_domain *irq_find_host(struct device_node *node)
mutex_unlock(&irq_domain_mutex);
return found;
}
-EXPORT_SYMBOL_GPL(irq_find_host);
+EXPORT_SYMBOL_GPL(irq_find_matching_fwnode);
/**
* irq_set_default_host() - Set a "default" irq domain
@@ -328,10 +392,12 @@ EXPORT_SYMBOL_GPL(irq_domain_associate);
void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
irq_hw_number_t hwirq_base, int count)
{
+ struct device_node *of_node;
int i;
+ of_node = irq_domain_get_of_node(domain);
pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
- of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
+ of_node_full_name(of_node), irq_base, (int)hwirq_base, count);
for (i = 0; i < count; i++) {
irq_domain_associate(domain, irq_base + i, hwirq_base + i);
@@ -351,12 +417,14 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);
*/
unsigned int irq_create_direct_mapping(struct irq_domain *domain)
{
+ struct device_node *of_node;
unsigned int virq;
if (domain == NULL)
domain = irq_default_domain;
- virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
+ of_node = irq_domain_get_of_node(domain);
+ virq = irq_alloc_desc_from(1, of_node_to_nid(of_node));
if (!virq) {
pr_debug("create_direct virq allocation failed\n");
return 0;
@@ -391,6 +459,7 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
unsigned int irq_create_mapping(struct irq_domain *domain,
irq_hw_number_t hwirq)
{
+ struct device_node *of_node;
int virq;
pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
@@ -404,6 +473,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
pr_debug("-> using domain @%p\n", domain);
+ of_node = irq_domain_get_of_node(domain);
+
/* Check if mapping already exists */
virq = irq_find_mapping(domain, hwirq);
if (virq) {
@@ -412,8 +483,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
/* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq,
- of_node_to_nid(domain->of_node));
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node));
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
@@ -425,7 +495,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
- hwirq, of_node_full_name(domain->of_node), virq);
+ hwirq, of_node_full_name(of_node), virq);
return virq;
}
@@ -452,10 +522,12 @@ EXPORT_SYMBOL_GPL(irq_create_mapping);
int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
irq_hw_number_t hwirq_base, int count)
{
+ struct device_node *of_node;
int ret;
+ of_node = irq_domain_get_of_node(domain);
ret = irq_alloc_descs(irq_base, irq_base, count,
- of_node_to_nid(domain->of_node));
+ of_node_to_nid(of_node));
if (unlikely(ret < 0))
return ret;
@@ -464,28 +536,56 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
}
EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
-unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
+static int irq_domain_translate(struct irq_domain *d,
+ struct irq_fwspec *fwspec,
+ irq_hw_number_t *hwirq, unsigned int *type)
+{
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ if (d->ops->translate)
+ return d->ops->translate(d, fwspec, hwirq, type);
+#endif
+ if (d->ops->xlate)
+ return d->ops->xlate(d, to_of_node(fwspec->fwnode),
+ fwspec->param, fwspec->param_count,
+ hwirq, type);
+
+ /* If domain has no translation, then we assume interrupt line */
+ *hwirq = fwspec->param[0];
+ return 0;
+}
+
+static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data,
+ struct irq_fwspec *fwspec)
+{
+ int i;
+
+ fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL;
+ fwspec->param_count = irq_data->args_count;
+
+ for (i = 0; i < irq_data->args_count; i++)
+ fwspec->param[i] = irq_data->args[i];
+}
+
+unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
struct irq_domain *domain;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
int virq;
- domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain;
+ if (fwspec->fwnode)
+ domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY);
+ else
+ domain = irq_default_domain;
+
if (!domain) {
pr_warn("no irq domain found for %s !\n",
- of_node_full_name(irq_data->np));
+ of_node_full_name(to_of_node(fwspec->fwnode)));
return 0;
}
- /* If domain has no translation, then we assume interrupt line */
- if (domain->ops->xlate == NULL)
- hwirq = irq_data->args[0];
- else {
- if (domain->ops->xlate(domain, irq_data->np, irq_data->args,
- irq_data->args_count, &hwirq, &type))
- return 0;
- }
+ if (irq_domain_translate(domain, fwspec, &hwirq, &type))
+ return 0;
if (irq_domain_is_hierarchy(domain)) {
/*
@@ -496,7 +596,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
if (virq)
return virq;
- virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data);
+ virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
if (virq <= 0)
return 0;
} else {
@@ -512,6 +612,15 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
irq_set_irq_type(virq, type);
return virq;
}
+EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);
+
+unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data)
+{
+ struct irq_fwspec fwspec;
+
+ of_phandle_args_to_fwspec(irq_data, &fwspec);
+ return irq_create_fwspec_mapping(&fwspec);
+}
EXPORT_SYMBOL_GPL(irq_create_of_mapping);
/**
@@ -582,14 +691,16 @@ static int virq_debug_show(struct seq_file *m, void *private)
"name", "mapped", "linear-max", "direct-max", "devtree-node");
mutex_lock(&irq_domain_mutex);
list_for_each_entry(domain, &irq_domain_list, link) {
+ struct device_node *of_node;
int count = 0;
+ of_node = irq_domain_get_of_node(domain);
radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
count++;
seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
domain == irq_default_domain ? '*' : ' ', domain->name,
domain->revmap_size + count, domain->revmap_size,
domain->revmap_direct_max_irq,
- domain->of_node ? of_node_full_name(domain->of_node) : "");
+ of_node ? of_node_full_name(of_node) : "");
}
mutex_unlock(&irq_domain_mutex);
@@ -743,11 +854,11 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt,
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
/**
- * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy
+ * irq_domain_create_hierarchy - Add a irqdomain into the hierarchy
* @parent: Parent irq domain to associate with the new domain
* @flags: Irq domain flags associated to the domain
* @size: Size of the domain. See below
- * @node: Optional device-tree node of the interrupt controller
+ * @fwnode: Optional fwnode of the interrupt controller
* @ops: Pointer to the interrupt domain callbacks
* @host_data: Controller private data pointer
*
@@ -757,19 +868,19 @@ static int irq_domain_alloc_descs(int virq, unsigned int cnt,
* domain flags are set.
* Returns pointer to IRQ domain, or NULL on failure.
*/
-struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent,
+struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
unsigned int flags,
unsigned int size,
- struct device_node *node,
+ struct fwnode_handle *fwnode,
const struct irq_domain_ops *ops,
void *host_data)
{
struct irq_domain *domain;
if (size)
- domain = irq_domain_add_linear(node, size, ops, host_data);
+ domain = irq_domain_create_linear(fwnode, size, ops, host_data);
else
- domain = irq_domain_add_tree(node, ops, host_data);
+ domain = irq_domain_create_tree(fwnode, ops, host_data);
if (domain) {
domain->parent = parent;
domain->flags |= flags;
@@ -836,7 +947,6 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
child->parent_data = irq_data;
irq_data->irq = child->irq;
irq_data->common = child->common;
- irq_data->node = child->node;
irq_data->domain = domain;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f9744853b656..0eebaeef317b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,6 +115,14 @@ EXPORT_SYMBOL(synchronize_irq);
#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;
+static int __irq_can_set_affinity(struct irq_desc *desc)
+{
+ if (!desc || !irqd_can_balance(&desc->irq_data) ||
+ !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
+ return 0;
+ return 1;
+}
+
/**
* irq_can_set_affinity - Check if the affinity of a given irq can be set
* @irq: Interrupt to check
@@ -122,13 +130,7 @@ cpumask_var_t irq_default_affinity;
*/
int irq_can_set_affinity(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (!desc || !irqd_can_balance(&desc->irq_data) ||
- !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
- return 0;
-
- return 1;
+ return __irq_can_set_affinity(irq_to_desc(irq));
}
/**
@@ -190,7 +192,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
switch (ret) {
case IRQ_SET_MASK_OK:
case IRQ_SET_MASK_OK_DONE:
- cpumask_copy(data->affinity, mask);
+ cpumask_copy(desc->irq_common_data.affinity, mask);
case IRQ_SET_MASK_OK_NOCOPY:
irq_set_thread_affinity(desc);
ret = 0;
@@ -256,37 +258,6 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
-/**
- * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
- * @irq: interrupt number to set affinity
- * @vcpu_info: vCPU specific data
- *
- * This function uses the vCPU specific data to set the vCPU
- * affinity for an irq. The vCPU specific data is passed from
- * outside, such as KVM. One example code path is as below:
- * KVM -> IOMMU -> irq_set_vcpu_affinity().
- */
-int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
-{
- unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
- struct irq_data *data;
- struct irq_chip *chip;
- int ret = -ENOSYS;
-
- if (!desc)
- return -EINVAL;
-
- data = irq_desc_get_irq_data(desc);
- chip = irq_data_get_irq_chip(data);
- if (chip && chip->irq_set_vcpu_affinity)
- ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
- irq_put_desc_unlock(desc, flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
-
static void irq_affinity_notify(struct work_struct *work)
{
struct irq_affinity_notify *notify =
@@ -302,7 +273,7 @@ static void irq_affinity_notify(struct work_struct *work)
if (irq_move_pending(&desc->irq_data))
irq_get_pending(cpumask, desc);
else
- cpumask_copy(cpumask, desc->irq_data.affinity);
+ cpumask_copy(cpumask, desc->irq_common_data.affinity);
raw_spin_unlock_irqrestore(&desc->lock, flags);
notify->notify(notify, cpumask);
@@ -359,14 +330,13 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
/*
* Generic version of the affinity autoselector.
*/
-static int
-setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
+static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
{
struct cpumask *set = irq_default_affinity;
int node = irq_desc_get_node(desc);
/* Excludes PER_CPU and NO_BALANCE interrupts */
- if (!irq_can_set_affinity(irq))
+ if (!__irq_can_set_affinity(desc))
return 0;
/*
@@ -374,9 +344,9 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
* one of the targets is online.
*/
if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
- if (cpumask_intersects(desc->irq_data.affinity,
+ if (cpumask_intersects(desc->irq_common_data.affinity,
cpu_online_mask))
- set = desc->irq_data.affinity;
+ set = desc->irq_common_data.affinity;
else
irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
}
@@ -393,10 +363,10 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
return 0;
}
#else
-static inline int
-setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
+/* Wrapper for ALPHA specific affinity selector magic */
+static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask)
{
- return irq_select_affinity(irq);
+ return irq_select_affinity(irq_desc_get_irq(d));
}
#endif
@@ -410,20 +380,51 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
int ret;
raw_spin_lock_irqsave(&desc->lock, flags);
- ret = setup_affinity(irq, desc, mask);
+ ret = setup_affinity(desc, mask);
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
#else
static inline int
-setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
+setup_affinity(struct irq_desc *desc, struct cpumask *mask)
{
return 0;
}
#endif
-void __disable_irq(struct irq_desc *desc, unsigned int irq)
+/**
+ * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
+ * @irq: interrupt number to set affinity
+ * @vcpu_info: vCPU specific data
+ *
+ * This function uses the vCPU specific data to set the vCPU
+ * affinity for an irq. The vCPU specific data is passed from
+ * outside, such as KVM. One example code path is as below:
+ * KVM -> IOMMU -> irq_set_vcpu_affinity().
+ */
+int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
+ struct irq_data *data;
+ struct irq_chip *chip;
+ int ret = -ENOSYS;
+
+ if (!desc)
+ return -EINVAL;
+
+ data = irq_desc_get_irq_data(desc);
+ chip = irq_data_get_irq_chip(data);
+ if (chip && chip->irq_set_vcpu_affinity)
+ ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
+ irq_put_desc_unlock(desc, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
+
+void __disable_irq(struct irq_desc *desc)
{
if (!desc->depth++)
irq_disable(desc);
@@ -436,7 +437,7 @@ static int __disable_irq_nosync(unsigned int irq)
if (!desc)
return -EINVAL;
- __disable_irq(desc, irq);
+ __disable_irq(desc);
irq_put_desc_busunlock(desc, flags);
return 0;
}
@@ -503,12 +504,13 @@ bool disable_hardirq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(disable_hardirq);
-void __enable_irq(struct irq_desc *desc, unsigned int irq)
+void __enable_irq(struct irq_desc *desc)
{
switch (desc->depth) {
case 0:
err_out:
- WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
+ WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n",
+ irq_desc_get_irq(desc));
break;
case 1: {
if (desc->istate & IRQS_SUSPENDED)
@@ -516,7 +518,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq)
/* Prevent probing on this irq: */
irq_settings_set_noprobe(desc);
irq_enable(desc);
- check_irq_resend(desc, irq);
+ check_irq_resend(desc);
/* fall-through */
}
default:
@@ -546,7 +548,7 @@ void enable_irq(unsigned int irq)
KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
goto out;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
out:
irq_put_desc_busunlock(desc, flags);
}
@@ -637,8 +639,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
return canrequest;
}
-int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
- unsigned long flags)
+int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
{
struct irq_chip *chip = desc->irq_data.chip;
int ret, unmask = 0;
@@ -648,7 +649,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
* IRQF_TRIGGER_* but the PIC does not support multiple
* flow-types?
*/
- pr_debug("No set_type function for IRQ %d (%s)\n", irq,
+ pr_debug("No set_type function for IRQ %d (%s)\n",
+ irq_desc_get_irq(desc),
chip ? (chip->name ? : "unknown") : "unknown");
return 0;
}
@@ -685,7 +687,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
break;
default:
pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
- flags, irq, chip->irq_set_type);
+ flags, irq_desc_get_irq(desc), chip->irq_set_type);
}
if (unmask)
unmask_irq(desc);
@@ -728,6 +730,12 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
return IRQ_NONE;
}
+static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id)
+{
+ WARN(1, "Secondary action handler called for irq %d\n", irq);
+ return IRQ_NONE;
+}
+
static int irq_wait_for_interrupt(struct irqaction *action)
{
set_current_state(TASK_INTERRUPTIBLE);
@@ -754,7 +762,8 @@ static int irq_wait_for_interrupt(struct irqaction *action)
static void irq_finalize_oneshot(struct irq_desc *desc,
struct irqaction *action)
{
- if (!(desc->istate & IRQS_ONESHOT))
+ if (!(desc->istate & IRQS_ONESHOT) ||
+ action->handler == irq_forced_secondary_handler)
return;
again:
chip_bus_lock(desc);
@@ -827,8 +836,8 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
* This code is triggered unconditionally. Check the affinity
* mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
*/
- if (desc->irq_data.affinity)
- cpumask_copy(mask, desc->irq_data.affinity);
+ if (desc->irq_common_data.affinity)
+ cpumask_copy(mask, desc->irq_common_data.affinity);
else
valid = false;
raw_spin_unlock_irq(&desc->lock);
@@ -908,6 +917,18 @@ static void irq_thread_dtor(struct callback_head *unused)
irq_finalize_oneshot(desc, action);
}
+static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
+{
+ struct irqaction *secondary = action->secondary;
+
+ if (WARN_ON_ONCE(!secondary))
+ return;
+
+ raw_spin_lock_irq(&desc->lock);
+ __irq_wake_thread(desc, secondary);
+ raw_spin_unlock_irq(&desc->lock);
+}
+
/*
* Interrupt handler thread
*/
@@ -938,6 +959,8 @@ static int irq_thread(void *data)
action_ret = handler_fn(desc, action);
if (action_ret == IRQ_HANDLED)
atomic_inc(&desc->threads_handled);
+ if (action_ret == IRQ_WAKE_THREAD)
+ irq_wake_secondary(desc, action);
wake_threads_waitq(desc);
}
@@ -982,20 +1005,36 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
}
EXPORT_SYMBOL_GPL(irq_wake_thread);
-static void irq_setup_forced_threading(struct irqaction *new)
+static int irq_setup_forced_threading(struct irqaction *new)
{
if (!force_irqthreads)
- return;
+ return 0;
if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
- return;
+ return 0;
new->flags |= IRQF_ONESHOT;
- if (!new->thread_fn) {
- set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
- new->thread_fn = new->handler;
- new->handler = irq_default_primary_handler;
+ /*
+ * Handle the case where we have a real primary handler and a
+ * thread handler. We force thread them as well by creating a
+ * secondary action.
+ */
+ if (new->handler != irq_default_primary_handler && new->thread_fn) {
+ /* Allocate the secondary action */
+ new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!new->secondary)
+ return -ENOMEM;
+ new->secondary->handler = irq_forced_secondary_handler;
+ new->secondary->thread_fn = new->thread_fn;
+ new->secondary->dev_id = new->dev_id;
+ new->secondary->irq = new->irq;
+ new->secondary->name = new->name;
}
+ /* Deal with the primary handler */
+ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
+ new->thread_fn = new->handler;
+ new->handler = irq_default_primary_handler;
+ return 0;
}
static int irq_request_resources(struct irq_desc *desc)
@@ -1015,6 +1054,48 @@ static void irq_release_resources(struct irq_desc *desc)
c->irq_release_resources(d);
}
+static int
+setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
+{
+ struct task_struct *t;
+ struct sched_param param = {
+ .sched_priority = MAX_USER_RT_PRIO/2,
+ };
+
+ if (!secondary) {
+ t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
+ new->name);
+ } else {
+ t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
+ new->name);
+ param.sched_priority -= 1;
+ }
+
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
+
+ /*
+ * We keep the reference to the task struct even if
+ * the thread dies to avoid that the interrupt code
+ * references an already freed task_struct.
+ */
+ get_task_struct(t);
+ new->thread = t;
+ /*
+ * Tell the thread to set its affinity. This is
+ * important for shared interrupt handlers as we do
+ * not invoke setup_affinity() for the secondary
+ * handlers as everything is already set up. Even for
+ * interrupts marked with IRQF_NO_BALANCE this is
+ * correct as we want the thread to move to the cpu(s)
+ * on which the requesting code placed the interrupt.
+ */
+ set_bit(IRQTF_AFFINITY, &new->thread_flags);
+ return 0;
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
@@ -1035,6 +1116,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (!try_module_get(desc->owner))
return -ENODEV;
+ new->irq = irq;
+
/*
* Check whether the interrupt nests into another interrupt
* thread.
@@ -1052,8 +1135,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
new->handler = irq_nested_primary_handler;
} else {
- if (irq_settings_can_thread(desc))
- irq_setup_forced_threading(new);
+ if (irq_settings_can_thread(desc)) {
+ ret = irq_setup_forced_threading(new);
+ if (ret)
+ goto out_mput;
+ }
}
/*
@@ -1062,37 +1148,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* thread.
*/
if (new->thread_fn && !nested) {
- struct task_struct *t;
- static const struct sched_param param = {
- .sched_priority = MAX_USER_RT_PRIO/2,
- };
-
- t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
- new->name);
- if (IS_ERR(t)) {
- ret = PTR_ERR(t);
+ ret = setup_irq_thread(new, irq, false);
+ if (ret)
goto out_mput;
+ if (new->secondary) {
+ ret = setup_irq_thread(new->secondary, irq, true);
+ if (ret)
+ goto out_thread;
}
-
- sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
-
- /*
- * We keep the reference to the task struct even if
- * the thread dies to avoid that the interrupt code
- * references an already freed task_struct.
- */
- get_task_struct(t);
- new->thread = t;
- /*
- * Tell the thread to set its affinity. This is
- * important for shared interrupt handlers as we do
- * not invoke setup_affinity() for the secondary
- * handlers as everything is already set up. Even for
- * interrupts marked with IRQF_NO_BALANCE this is
- * correct as we want the thread to move to the cpu(s)
- * on which the requesting code placed the interrupt.
- */
- set_bit(IRQTF_AFFINITY, &new->thread_flags);
}
if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1221,8 +1284,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
- ret = __irq_set_trigger(desc, irq,
- new->flags & IRQF_TRIGGER_MASK);
+ ret = __irq_set_trigger(desc,
+ new->flags & IRQF_TRIGGER_MASK);
if (ret)
goto out_mask;
@@ -1253,7 +1316,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
/* Set default affinity mask once everything is setup */
- setup_affinity(irq, desc, mask);
+ setup_affinity(desc, mask);
} else if (new->flags & IRQF_TRIGGER_MASK) {
unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
@@ -1265,7 +1328,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irq, nmsk, omsk);
}
- new->irq = irq;
*old_ptr = new;
irq_pm_install_action(desc, new);
@@ -1280,7 +1342,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1291,6 +1353,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (new->thread)
wake_up_process(new->thread);
+ if (new->secondary)
+ wake_up_process(new->secondary->thread);
register_irq_proc(irq, desc);
new->dir = NULL;
@@ -1321,6 +1385,13 @@ out_thread:
kthread_stop(t);
put_task_struct(t);
}
+ if (new->secondary && new->secondary->thread) {
+ struct task_struct *t = new->secondary->thread;
+
+ new->secondary->thread = NULL;
+ kthread_stop(t);
+ put_task_struct(t);
+ }
out_mput:
module_put(desc->owner);
return ret;
@@ -1392,6 +1463,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
+ irq_settings_clr_disable_unlazy(desc);
irq_shutdown(desc);
irq_release_resources(desc);
}
@@ -1428,9 +1500,14 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
if (action->thread) {
kthread_stop(action->thread);
put_task_struct(action->thread);
+ if (action->secondary && action->secondary->thread) {
+ kthread_stop(action->secondary->thread);
+ put_task_struct(action->secondary->thread);
+ }
}
module_put(desc->owner);
+ kfree(action->secondary);
return action;
}
@@ -1574,8 +1651,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
retval = __setup_irq(irq, desc, action);
chip_bus_sync_unlock(desc);
- if (retval)
+ if (retval) {
+ kfree(action->secondary);
kfree(action);
+ }
#ifdef CONFIG_DEBUG_SHIRQ_FIXME
if (!retval && (irqflags & IRQF_SHARED)) {
@@ -1650,7 +1729,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
if (type != IRQ_TYPE_NONE) {
int ret;
- ret = __irq_set_trigger(desc, irq, type);
+ ret = __irq_set_trigger(desc, type);
if (ret) {
WARN(1, "failed to set type for IRQ%d\n", irq);
@@ -1759,6 +1838,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
kfree(__free_percpu_irq(irq, dev_id));
chip_bus_sync_unlock(desc);
}
+EXPORT_SYMBOL_GPL(free_percpu_irq);
/**
* setup_percpu_irq - setup a per-cpu interrupt
@@ -1788,9 +1868,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
* @devname: An ascii name for the claiming device
* @dev_id: A percpu cookie passed back to the handler function
*
- * This call allocates interrupt resources, but doesn't
- * automatically enable the interrupt. It has to be done on each
- * CPU using enable_percpu_irq().
+ * This call allocates interrupt resources and enables the
+ * interrupt on the local CPU. If the interrupt is supposed to be
+ * enabled on other CPUs, it has to be done on each CPU using
+ * enable_percpu_irq().
*
* Dev_id must be globally unique. It is a per-cpu variable, and
* the handler gets called with the interrupted CPU's instance of
@@ -1829,6 +1910,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
+EXPORT_SYMBOL_GPL(request_percpu_irq);
/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
@@ -1875,6 +1957,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
irq_put_desc_busunlock(desc, flags);
return err;
}
+EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
/**
* irq_set_irqchip_state - set the state of a forwarded interrupt.
@@ -1920,3 +2003,4 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
irq_put_desc_busunlock(desc, flags);
return err;
}
+EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7bf1f1bbb7fa..6b0c0b74a2a1 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,6 +18,23 @@
/* Temparory solution for building, will be removed later */
#include <linux/pci.h>
+struct msi_desc *alloc_msi_entry(struct device *dev)
+{
+ struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+ if (!desc)
+ return NULL;
+
+ INIT_LIST_HEAD(&desc->list);
+ desc->dev = dev;
+
+ return desc;
+}
+
+void free_msi_entry(struct msi_desc *entry)
+{
+ kfree(entry);
+}
+
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
*msg = entry->msg;
@@ -211,22 +228,18 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
{
struct irq_chip *chip = info->chip;
- BUG_ON(!chip);
- if (!chip->irq_mask)
- chip->irq_mask = pci_msi_mask_irq;
- if (!chip->irq_unmask)
- chip->irq_unmask = pci_msi_unmask_irq;
+ BUG_ON(!chip || !chip->irq_mask || !chip->irq_unmask);
if (!chip->irq_set_affinity)
chip->irq_set_affinity = msi_domain_set_affinity;
}
/**
* msi_create_irq_domain - Create a MSI interrupt domain
- * @of_node: Optional device-tree node of the interrupt controller
+ * @fwnode: Optional fwnode of the interrupt controller
* @info: MSI domain info
* @parent: Parent irq domain
*/
-struct irq_domain *msi_create_irq_domain(struct device_node *node,
+struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
@@ -235,8 +248,8 @@ struct irq_domain *msi_create_irq_domain(struct device_node *node,
if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
msi_domain_update_chip_ops(info);
- return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops,
- info);
+ return irq_domain_create_hierarchy(parent, 0, 0, fwnode,
+ &msi_domain_ops, info);
}
/**
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index d22786a6dbde..e80c4400118a 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -21,7 +21,7 @@ bool irq_pm_check_wakeup(struct irq_desc *desc)
desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
desc->depth++;
irq_disable(desc);
- pm_system_wakeup();
+ pm_system_irq_wakeup(irq_desc_get_irq(desc));
return true;
}
return false;
@@ -68,7 +68,7 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
desc->cond_suspend_depth--;
}
-static bool suspend_device_irq(struct irq_desc *desc, int irq)
+static bool suspend_device_irq(struct irq_desc *desc)
{
if (!desc->action || desc->no_suspend_depth)
return false;
@@ -85,7 +85,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq)
}
desc->istate |= IRQS_SUSPENDED;
- __disable_irq(desc, irq);
+ __disable_irq(desc);
/*
* Hardware which has no wakeup source configuration facility
@@ -126,7 +126,7 @@ void suspend_device_irqs(void)
if (irq_settings_is_nested_thread(desc))
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- sync = suspend_device_irq(desc, irq);
+ sync = suspend_device_irq(desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
if (sync)
@@ -135,7 +135,7 @@ void suspend_device_irqs(void)
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
-static void resume_irq(struct irq_desc *desc, int irq)
+static void resume_irq(struct irq_desc *desc)
{
irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
@@ -150,7 +150,7 @@ static void resume_irq(struct irq_desc *desc, int irq)
desc->depth++;
resume:
desc->istate &= ~IRQS_SUSPENDED;
- __enable_irq(desc, irq);
+ __enable_irq(desc);
}
static void resume_irqs(bool want_early)
@@ -169,7 +169,7 @@ static void resume_irqs(bool want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- resume_irq(desc, irq);
+ resume_irq(desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0e97c142ce40..a916cf144b65 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/mutex.h>
#include "internals.h"
@@ -39,7 +40,7 @@ static struct proc_dir_entry *root_irq_dir;
static int show_irq_affinity(int type, struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
- const struct cpumask *mask = desc->irq_data.affinity;
+ const struct cpumask *mask = desc->irq_common_data.affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (irqd_is_setaffinity_pending(&desc->irq_data))
@@ -323,18 +324,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
void register_irq_proc(unsigned int irq, struct irq_desc *desc)
{
+ static DEFINE_MUTEX(register_lock);
char name [MAX_NAMELEN];
- if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
+ if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip))
return;
+ /*
+ * irq directories are registered only when a handler is
+ * added, not when the descriptor is created, so multiple
+ * tasks might try to register at the same time.
+ */
+ mutex_lock(&register_lock);
+
+ if (desc->dir)
+ goto out_unlock;
+
memset(name, 0, MAX_NAMELEN);
sprintf(name, "%d", irq);
/* create /proc/irq/1234 */
desc->dir = proc_mkdir(name, root_irq_dir);
if (!desc->dir)
- return;
+ goto out_unlock;
#ifdef CONFIG_SMP
/* create /proc/irq/<irq>/smp_affinity */
@@ -355,6 +367,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
proc_create_data("spurious", 0444, desc->dir,
&irq_spurious_proc_fops, (void *)(long)irq);
+
+out_unlock:
+ mutex_unlock(&register_lock);
}
void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
@@ -460,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
any_count |= kstat_irqs_cpu(i, j);
action = desc->action;
- if (!action && !any_count)
+ if ((!action || action == &chained_action) && !any_count)
goto out;
seq_printf(p, "%*d: ", prec, i);
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 7a5237a1bce5..b86886beee4f 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg)
clear_bit(irq, irqs_resend);
desc = irq_to_desc(irq);
local_irq_disable();
- desc->handle_irq(irq, desc);
+ desc->handle_irq(desc);
local_irq_enable();
}
}
@@ -53,7 +53,7 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
*
* Is called with interrupts disabled and desc->lock held.
*/
-void check_irq_resend(struct irq_desc *desc, unsigned int irq)
+void check_irq_resend(struct irq_desc *desc)
{
/*
* We do not resend level type interrupts. Level type
@@ -74,6 +74,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
if (!desc->irq_data.chip->irq_retrigger ||
!desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
#ifdef CONFIG_HARDIRQS_SW_RESEND
+ unsigned int irq = irq_desc_get_irq(desc);
+
/*
* If the interrupt is running in the thread
* context of the parent irq we need to be
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 3320b84cc60f..320579d89091 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -15,6 +15,7 @@ enum {
_IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
_IRQ_IS_POLLED = IRQ_IS_POLLED,
+ _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -28,6 +29,7 @@ enum {
#define IRQ_NESTED_THREAD GOT_YOU_MORON
#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
#define IRQ_IS_POLLED GOT_YOU_MORON
+#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -154,3 +156,13 @@ static inline bool irq_settings_is_polled(struct irq_desc *desc)
{
return desc->status_use_accessors & _IRQ_IS_POLLED;
}
+
+static inline bool irq_settings_disable_unlazy(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_DISABLE_UNLAZY;
+}
+
+static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index e2514b0e439e..32144175458d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -60,7 +60,7 @@ bool irq_wait_for_poll(struct irq_desc *desc)
/*
* Recovery handler for misrouted interrupts.
*/
-static int try_one_irq(int irq, struct irq_desc *desc, bool force)
+static int try_one_irq(struct irq_desc *desc, bool force)
{
irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
@@ -133,7 +133,7 @@ static int misrouted_irq(int irq)
if (i == irq) /* Already tried */
continue;
- if (try_one_irq(i, desc, false))
+ if (try_one_irq(desc, false))
ok = 1;
}
out:
@@ -164,7 +164,7 @@ static void poll_spurious_irqs(unsigned long dummy)
continue;
local_irq_disable();
- try_one_irq(i, desc, true);
+ try_one_irq(desc, true);
local_irq_enable();
}
out:
@@ -188,10 +188,9 @@ static inline int bad_action_ret(irqreturn_t action_ret)
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
*/
-static void
-__report_bad_irq(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
+ unsigned int irq = irq_desc_get_irq(desc);
struct irqaction *action;
unsigned long flags;
@@ -224,14 +223,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
-static void
-report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
+static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
{
static int count = 100;
if (count > 0) {
count--;
- __report_bad_irq(irq, desc, action_ret);
+ __report_bad_irq(desc, action_ret);
}
}
@@ -272,15 +270,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
#define SPURIOUS_DEFERRED 0x80000000
-void note_interrupt(unsigned int irq, struct irq_desc *desc,
- irqreturn_t action_ret)
+void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
{
+ unsigned int irq;
+
if (desc->istate & IRQS_POLL_INPROGRESS ||
irq_settings_is_polled(desc))
return;
if (bad_action_ret(action_ret)) {
- report_bad_irq(irq, desc, action_ret);
+ report_bad_irq(desc, action_ret);
return;
}
@@ -398,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
desc->last_unhandled = jiffies;
}
+ irq = irq_desc_get_irq(desc);
if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
@@ -413,7 +413,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
/*
* The interrupt is stuck
*/
- __report_bad_irq(irq, desc, action_ret);
+ __report_bad_irq(desc, action_ret);
/*
* Now kill the IRQ
*/
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 52ebaca1b9fc..f7dd15d537f9 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
}
-static void jump_label_update(struct static_key *key, int enable);
+static void jump_label_update(struct static_key *key);
void static_key_slow_inc(struct static_key *key)
{
@@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key)
return;
jump_label_lock();
- if (atomic_read(&key->enabled) == 0) {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_ENABLE);
- else
- jump_label_update(key, JUMP_LABEL_DISABLE);
- }
- atomic_inc(&key->enabled);
+ if (atomic_inc_return(&key->enabled) == 1)
+ jump_label_update(key);
jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key,
atomic_inc(&key->enabled);
schedule_delayed_work(work, rate_limit);
} else {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_DISABLE);
- else
- jump_label_update(key, JUMP_LABEL_ENABLE);
+ jump_label_update(key);
}
jump_label_unlock();
}
@@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
return 0;
}
-/*
+/*
* Update code which is definitely not currently executing.
* Architectures which need heavyweight synchronization to modify
* running code can override this to make the non-live update case
@@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
- arch_jump_label_transform(entry, type);
+ arch_jump_label_transform(entry, type);
+}
+
+static inline struct jump_entry *static_key_entries(struct static_key *key)
+{
+ return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+}
+
+static inline bool static_key_type(struct static_key *key)
+{
+ return (unsigned long)key->entries & JUMP_TYPE_MASK;
+}
+
+static inline struct static_key *jump_entry_key(struct jump_entry *entry)
+{
+ return (struct static_key *)((unsigned long)entry->key & ~1UL);
+}
+
+static bool jump_entry_branch(struct jump_entry *entry)
+{
+ return (unsigned long)entry->key & 1UL;
+}
+
+static enum jump_label_type jump_label_type(struct jump_entry *entry)
+{
+ struct static_key *key = jump_entry_key(entry);
+ bool enabled = static_key_enabled(key);
+ bool branch = jump_entry_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return enabled ^ branch;
}
static void __jump_label_update(struct static_key *key,
struct jump_entry *entry,
- struct jump_entry *stop, int enable)
+ struct jump_entry *stop)
{
- for (; (entry < stop) &&
- (entry->key == (jump_label_t)(unsigned long)key);
- entry++) {
+ for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
/*
* entry->code set to 0 invalidates module init text sections
* kernel_text_address() verifies we are not in core kernel
* init code, see jump_label_invalidate_module_init().
*/
if (entry->code && kernel_text_address(entry->code))
- arch_jump_label_transform(entry, enable);
+ arch_jump_label_transform(entry, jump_label_type(entry));
}
}
-static enum jump_label_type jump_label_type(struct static_key *key)
-{
- bool true_branch = jump_label_get_branch_default(key);
- bool state = static_key_enabled(key);
-
- if ((!true_branch && state) || (true_branch && !state))
- return JUMP_LABEL_ENABLE;
-
- return JUMP_LABEL_DISABLE;
-}
-
void __init jump_label_init(void)
{
struct jump_entry *iter_start = __start___jump_table;
@@ -202,8 +211,11 @@ void __init jump_label_init(void)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
- iterk = (struct static_key *)(unsigned long)iter->key;
- arch_jump_label_transform_static(iter, jump_label_type(iterk));
+ /* rewrite NOPs */
+ if (jump_label_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
+
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
@@ -222,6 +234,16 @@ void __init jump_label_init(void)
#ifdef CONFIG_MODULES
+static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+{
+ struct static_key *key = jump_entry_key(entry);
+ bool type = static_key_type(key);
+ bool branch = jump_entry_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return type ^ branch;
+}
+
struct static_key_mod {
struct static_key_mod *next;
struct jump_entry *entries;
@@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
start, end);
}
-static void __jump_label_mod_update(struct static_key *key, int enable)
+static void __jump_label_mod_update(struct static_key *key)
{
- struct static_key_mod *mod = key->next;
+ struct static_key_mod *mod;
- while (mod) {
+ for (mod = key->next; mod; mod = mod->next) {
struct module *m = mod->mod;
__jump_label_update(key, mod->entries,
- m->jump_entries + m->num_jump_entries,
- enable);
- mod = mod->next;
+ m->jump_entries + m->num_jump_entries);
}
}
@@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod)
return;
for (iter = iter_start; iter < iter_stop; iter++) {
- arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+ /* Only write NOPs for arch_branch_static(). */
+ if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
}
}
@@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
- iterk = (struct static_key *)(unsigned long)iter->key;
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
@@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod)
jlm->next = key->next;
key->next = jlm;
- if (jump_label_type(key) == JUMP_LABEL_ENABLE)
- __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
+ /* Only update if we've changed from our initial state */
+ if (jump_label_type(iter) != jump_label_init_type(iter))
+ __jump_label_update(key, iter, iter_stop);
}
return 0;
@@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod)
struct static_key_mod *jlm, **prev;
for (iter = iter_start; iter < iter_stop; iter++) {
- if (iter->key == (jump_label_t)(unsigned long)key)
+ if (jump_entry_key(iter) == key)
continue;
- key = (struct static_key *)(unsigned long)iter->key;
+ key = jump_entry_key(iter);
if (within_module(iter->key, mod))
continue;
@@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end)
return ret;
}
-static void jump_label_update(struct static_key *key, int enable)
+static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
- struct jump_entry *entry = jump_label_get_entries(key);
+ struct jump_entry *entry = static_key_entries(key);
#ifdef CONFIG_MODULES
struct module *mod;
- __jump_label_mod_update(key, enable);
+ __jump_label_mod_update(key);
preempt_disable();
mod = __module_address((unsigned long)key);
@@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable)
#endif
/* if there are no users, entry can be NULL */
if (entry)
- __jump_label_update(key, entry, stop, enable);
+ __jump_label_update(key, entry, stop);
}
-#endif
+#ifdef CONFIG_STATIC_KEYS_SELFTEST
+static DEFINE_STATIC_KEY_TRUE(sk_true);
+static DEFINE_STATIC_KEY_FALSE(sk_false);
+
+static __init int jump_label_test(void)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ WARN_ON(static_key_enabled(&sk_true.key) != true);
+ WARN_ON(static_key_enabled(&sk_false.key) != false);
+
+ WARN_ON(!static_branch_likely(&sk_true));
+ WARN_ON(!static_branch_unlikely(&sk_true));
+ WARN_ON(static_branch_likely(&sk_false));
+ WARN_ON(static_branch_unlikely(&sk_false));
+
+ static_branch_disable(&sk_true);
+ static_branch_enable(&sk_false);
+
+ WARN_ON(static_key_enabled(&sk_true.key) == true);
+ WARN_ON(static_key_enabled(&sk_false.key) == false);
+
+ WARN_ON(static_branch_likely(&sk_true));
+ WARN_ON(static_branch_unlikely(&sk_true));
+ WARN_ON(!static_branch_likely(&sk_false));
+ WARN_ON(!static_branch_unlikely(&sk_false));
+
+ static_branch_enable(&sk_true);
+ static_branch_disable(&sk_false);
+ }
+
+ return 0;
+}
+late_initcall(jump_label_test);
+#endif /* STATIC_KEYS_SELFTEST */
+
+#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a785c1015e25..d873b64fbddc 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,156 +1,24 @@
/*
- * kexec.c - kexec system call
+ * kexec.c - kexec_load system call
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
*/
-#define pr_fmt(fmt) "kexec: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
-#include <linux/highmem.h>
#include <linux/syscalls.h>
-#include <linux/reboot.h>
-#include <linux/ioport.h>
-#include <linux/hardirq.h>
-#include <linux/elf.h>
-#include <linux/elfcore.h>
-#include <linux/utsname.h>
-#include <linux/numa.h>
-#include <linux/suspend.h>
-#include <linux/device.h>
-#include <linux/freezer.h>
-#include <linux/pm.h>
-#include <linux/cpu.h>
-#include <linux/console.h>
#include <linux/vmalloc.h>
-#include <linux/swap.h>
-#include <linux/syscore_ops.h>
-#include <linux/compiler.h>
-#include <linux/hugetlb.h>
-
-#include <asm/page.h>
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/sections.h>
-
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-
-/* Per cpu memory for storing cpu states in case of system crash. */
-note_buf_t __percpu *crash_notes;
-
-/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-
-/* Flag to indicate we are going to kexec a new kernel */
-bool kexec_in_progress = false;
-
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
-#ifdef CONFIG_KEXEC_FILE
-static int kexec_calculate_store_digests(struct kimage *image);
-#endif
-
-/* Location of the reserved area for the crash kernel */
-struct resource crashk_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-struct resource crashk_low_res = {
- .name = "Crash kernel",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
-int kexec_should_crash(struct task_struct *p)
-{
- /*
- * If crash_kexec_post_notifiers is enabled, don't run
- * crash_kexec() here yet, which must be run after panic
- * notifiers in panic().
- */
- if (crash_kexec_post_notifiers)
- return 0;
- /*
- * There are 4 panic() calls in do_exit() path, each of which
- * corresponds to each of these 4 conditions.
- */
- if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
- return 1;
- return 0;
-}
-
-/*
- * When kexec transitions to the new kernel there is a one-to-one
- * mapping between physical and virtual addresses. On processors
- * where you can disable the MMU this is trivial, and easy. For
- * others it is still a simple predictable page table to setup.
- *
- * In that environment kexec copies the new kernel to its final
- * resting place. This means I can only support memory whose
- * physical address can fit in an unsigned long. In particular
- * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
- * If the assembly stub has more restrictive requirements
- * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
- * defined more restrictively in <asm/kexec.h>.
- *
- * The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
- * page of memory is necessary, but some architectures require more.
- * Because this memory must be identity mapped in the transition from
- * virtual to physical addresses it must live in the range
- * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
- * modifiable.
- *
- * The assembly stub in the control code buffer is passed a linked list
- * of descriptor pages detailing the source pages of the new kernel,
- * and the destination addresses of those source pages. As this data
- * structure is not used in the context of the current OS, it must
- * be self-contained.
- *
- * The code has been made to work with highmem pages and will use a
- * destination page in its final resting place (if it happens
- * to allocate it). The end product of this is that most of the
- * physical address space, and most of RAM can be used.
- *
- * Future directions include:
- * - allocating a page table with the control code buffer identity
- * mapped, to simplify machine_kexec and make kexec_on_panic more
- * reliable.
- */
-
-/*
- * KIMAGE_NO_DEST is an impossible destination address..., for
- * allocating pages whose destination address we do not care about.
- */
-#define KIMAGE_NO_DEST (-1UL)
+#include <linux/slab.h>
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long dest);
+#include "kexec_internal.h"
static int copy_user_segment_list(struct kimage *image,
unsigned long nr_segments,
@@ -169,125 +37,6 @@ static int copy_user_segment_list(struct kimage *image,
return ret;
}
-static int sanity_check_segment_list(struct kimage *image)
-{
- int result, i;
- unsigned long nr_segments = image->nr_segments;
-
- /*
- * Verify we have good destination addresses. The caller is
- * responsible for making certain we don't attempt to load
- * the new image into invalid or reserved areas of RAM. This
- * just verifies it is an address we can use.
- *
- * Since the kernel does everything in page size chunks ensure
- * the destination addresses are page aligned. Too many
- * special cases crop of when we don't do this. The most
- * insidious is getting overlapping destination addresses
- * simply because addresses are changed to page size
- * granularity.
- */
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- return result;
- if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- return result;
- }
-
- /* Verify our destination addresses do not overlap.
- * If we alloed overlapping destination addresses
- * through very weird things can happen with no
- * easy explanation as one segment stops on another.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
- unsigned long j;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- for (j = 0; j < i; j++) {
- unsigned long pstart, pend;
- pstart = image->segment[j].mem;
- pend = pstart + image->segment[j].memsz;
- /* Do the segments overlap ? */
- if ((mend > pstart) && (mstart < pend))
- return result;
- }
- }
-
- /* Ensure our buffer sizes are strictly less than
- * our memory sizes. This should always be the case,
- * and it is easier to check up front than to be surprised
- * later on.
- */
- result = -EINVAL;
- for (i = 0; i < nr_segments; i++) {
- if (image->segment[i].bufsz > image->segment[i].memsz)
- return result;
- }
-
- /*
- * Verify we have good destination addresses. Normally
- * the caller is responsible for making certain we don't
- * attempt to load the new image into invalid or reserved
- * areas of RAM. But crash kernels are preloaded into a
- * reserved area of ram. We must ensure the addresses
- * are in the reserved area otherwise preloading the
- * kernel could corrupt things.
- */
-
- if (image->type == KEXEC_TYPE_CRASH) {
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- /* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) ||
- (mend > crashk_res.end))
- return result;
- }
- }
-
- return 0;
-}
-
-static struct kimage *do_kimage_alloc_init(void)
-{
- struct kimage *image;
-
- /* Allocate a controlling structure */
- image = kzalloc(sizeof(*image), GFP_KERNEL);
- if (!image)
- return NULL;
-
- image->head = 0;
- image->entry = &image->head;
- image->last_entry = &image->head;
- image->control_page = ~0; /* By default this does not apply */
- image->type = KEXEC_TYPE_DEFAULT;
-
- /* Initialize the list of control pages */
- INIT_LIST_HEAD(&image->control_pages);
-
- /* Initialize the list of destination pages */
- INIT_LIST_HEAD(&image->dest_pages);
-
- /* Initialize the list of unusable pages */
- INIT_LIST_HEAD(&image->unusable_pages);
-
- return image;
-}
-
-static void kimage_free_page_list(struct list_head *list);
-
static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
unsigned long nr_segments,
struct kexec_segment __user *segments,
@@ -354,873 +103,6 @@ out_free_image:
return ret;
}
-#ifdef CONFIG_KEXEC_FILE
-static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
-{
- struct fd f = fdget(fd);
- int ret;
- struct kstat stat;
- loff_t pos;
- ssize_t bytes = 0;
-
- if (!f.file)
- return -EBADF;
-
- ret = vfs_getattr(&f.file->f_path, &stat);
- if (ret)
- goto out;
-
- if (stat.size > INT_MAX) {
- ret = -EFBIG;
- goto out;
- }
-
- /* Don't hand 0 to vmalloc, it whines. */
- if (stat.size == 0) {
- ret = -EINVAL;
- goto out;
- }
-
- *buf = vmalloc(stat.size);
- if (!*buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- pos = 0;
- while (pos < stat.size) {
- bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
- stat.size - pos);
- if (bytes < 0) {
- vfree(*buf);
- ret = bytes;
- goto out;
- }
-
- if (bytes == 0)
- break;
- pos += bytes;
- }
-
- if (pos != stat.size) {
- ret = -EBADF;
- vfree(*buf);
- goto out;
- }
-
- *buf_len = pos;
-out:
- fdput(f);
- return ret;
-}
-
-/* Architectures can provide this probe function */
-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -ENOEXEC;
-}
-
-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
-{
- return ERR_PTR(-ENOEXEC);
-}
-
-void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-}
-
-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
- unsigned long buf_len)
-{
- return -EKEYREJECTED;
-}
-
-/* Apply relocations of type RELA */
-int __weak
-arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("RELA relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/* Apply relocations of type REL */
-int __weak
-arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
- unsigned int relsec)
-{
- pr_err("REL relocation unsupported.\n");
- return -ENOEXEC;
-}
-
-/*
- * Free up memory used by kernel, initrd, and command line. This is temporary
- * memory allocation which is not needed any more after these buffers have
- * been loaded into separate segments and have been copied elsewhere.
- */
-static void kimage_file_post_load_cleanup(struct kimage *image)
-{
- struct purgatory_info *pi = &image->purgatory_info;
-
- vfree(image->kernel_buf);
- image->kernel_buf = NULL;
-
- vfree(image->initrd_buf);
- image->initrd_buf = NULL;
-
- kfree(image->cmdline_buf);
- image->cmdline_buf = NULL;
-
- vfree(pi->purgatory_buf);
- pi->purgatory_buf = NULL;
-
- vfree(pi->sechdrs);
- pi->sechdrs = NULL;
-
- /* See if architecture has anything to cleanup post load */
- arch_kimage_file_post_load_cleanup(image);
-
- /*
- * Above call should have called into bootloader to free up
- * any data stored in kimage->image_loader_data. It should
- * be ok now to free it up.
- */
- kfree(image->image_loader_data);
- image->image_loader_data = NULL;
-}
-
-/*
- * In file mode list of segments is prepared by kernel. Copy relevant
- * data from user space, do error checking, prepare segment list
- */
-static int
-kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
- const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned flags)
-{
- int ret = 0;
- void *ldata;
-
- ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
- &image->kernel_buf_len);
- if (ret)
- return ret;
-
- /* Call arch image probe handlers */
- ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
- image->kernel_buf_len);
-
- if (ret)
- goto out;
-
-#ifdef CONFIG_KEXEC_VERIFY_SIG
- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
- image->kernel_buf_len);
- if (ret) {
- pr_debug("kernel signature verification failed.\n");
- goto out;
- }
- pr_debug("kernel signature verification successful.\n");
-#endif
- /* It is possible that there no initramfs is being loaded */
- if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
- ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
- &image->initrd_buf_len);
- if (ret)
- goto out;
- }
-
- if (cmdline_len) {
- image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
- if (!image->cmdline_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
- cmdline_len);
- if (ret) {
- ret = -EFAULT;
- goto out;
- }
-
- image->cmdline_buf_len = cmdline_len;
-
- /* command line should be a string with last byte null */
- if (image->cmdline_buf[cmdline_len - 1] != '\0') {
- ret = -EINVAL;
- goto out;
- }
- }
-
- /* Call arch image load handlers */
- ldata = arch_kexec_kernel_image_load(image);
-
- if (IS_ERR(ldata)) {
- ret = PTR_ERR(ldata);
- goto out;
- }
-
- image->image_loader_data = ldata;
-out:
- /* In case of error, free up all allocated memory in this function */
- if (ret)
- kimage_file_post_load_cleanup(image);
- return ret;
-}
-
-static int
-kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
- int initrd_fd, const char __user *cmdline_ptr,
- unsigned long cmdline_len, unsigned long flags)
-{
- int ret;
- struct kimage *image;
- bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
-
- image = do_kimage_alloc_init();
- if (!image)
- return -ENOMEM;
-
- image->file_mode = 1;
-
- if (kexec_on_panic) {
- /* Enable special crash kernel control page alloc policy. */
- image->control_page = crashk_res.start;
- image->type = KEXEC_TYPE_CRASH;
- }
-
- ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
- cmdline_ptr, cmdline_len, flags);
- if (ret)
- goto out_free_image;
-
- ret = sanity_check_segment_list(image);
- if (ret)
- goto out_free_post_load_bufs;
-
- ret = -ENOMEM;
- image->control_code_page = kimage_alloc_control_pages(image,
- get_order(KEXEC_CONTROL_PAGE_SIZE));
- if (!image->control_code_page) {
- pr_err("Could not allocate control_code_buffer\n");
- goto out_free_post_load_bufs;
- }
-
- if (!kexec_on_panic) {
- image->swap_page = kimage_alloc_control_pages(image, 0);
- if (!image->swap_page) {
- pr_err("Could not allocate swap buffer\n");
- goto out_free_control_pages;
- }
- }
-
- *rimage = image;
- return 0;
-out_free_control_pages:
- kimage_free_page_list(&image->control_pages);
-out_free_post_load_bufs:
- kimage_file_post_load_cleanup(image);
-out_free_image:
- kfree(image);
- return ret;
-}
-#else /* CONFIG_KEXEC_FILE */
-static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
-#endif /* CONFIG_KEXEC_FILE */
-
-static int kimage_is_destination_range(struct kimage *image,
- unsigned long start,
- unsigned long end)
-{
- unsigned long i;
-
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend))
- return 1;
- }
-
- return 0;
-}
-
-static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
-{
- struct page *pages;
-
- pages = alloc_pages(gfp_mask, order);
- if (pages) {
- unsigned int count, i;
- pages->mapping = NULL;
- set_page_private(pages, order);
- count = 1 << order;
- for (i = 0; i < count; i++)
- SetPageReserved(pages + i);
- }
-
- return pages;
-}
-
-static void kimage_free_pages(struct page *page)
-{
- unsigned int order, count, i;
-
- order = page_private(page);
- count = 1 << order;
- for (i = 0; i < count; i++)
- ClearPageReserved(page + i);
- __free_pages(page, order);
-}
-
-static void kimage_free_page_list(struct list_head *list)
-{
- struct list_head *pos, *next;
-
- list_for_each_safe(pos, next, list) {
- struct page *page;
-
- page = list_entry(pos, struct page, lru);
- list_del(&page->lru);
- kimage_free_pages(page);
- }
-}
-
-static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * The only case where we really need more than one of
- * these are for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * At worst this runs in O(N) of the image size.
- */
- struct list_head extra_pages;
- struct page *pages;
- unsigned int count;
-
- count = 1 << order;
- INIT_LIST_HEAD(&extra_pages);
-
- /* Loop while I can allocate a page and the page allocated
- * is a destination page.
- */
- do {
- unsigned long pfn, epfn, addr, eaddr;
-
- pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
- if (!pages)
- break;
- pfn = page_to_pfn(pages);
- epfn = pfn + count;
- addr = pfn << PAGE_SHIFT;
- eaddr = epfn << PAGE_SHIFT;
- if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
- kimage_is_destination_range(image, addr, eaddr)) {
- list_add(&pages->lru, &extra_pages);
- pages = NULL;
- }
- } while (!pages);
-
- if (pages) {
- /* Remember the allocated page... */
- list_add(&pages->lru, &image->control_pages);
-
- /* Because the page is already in it's destination
- * location we will never allocate another page at
- * that address. Therefore kimage_alloc_pages
- * will not return it (again) and we don't need
- * to give it an entry in image->segment[].
- */
- }
- /* Deal with the destination pages I have inadvertently allocated.
- *
- * Ideally I would convert multi-page allocations into single
- * page allocations, and add everything to image->dest_pages.
- *
- * For now it is simpler to just free the pages.
- */
- kimage_free_page_list(&extra_pages);
-
- return pages;
-}
-
-static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
- unsigned int order)
-{
- /* Control pages are special, they are the intermediaries
- * that are needed while we copy the rest of the pages
- * to their final resting place. As such they must
- * not conflict with either the destination addresses
- * or memory the kernel is already using.
- *
- * Control pages are also the only pags we must allocate
- * when loading a crash kernel. All of the other pages
- * are specified by the segments and we just memcpy
- * into them directly.
- *
- * The only case where we really need more than one of
- * these are for architectures where we cannot disable
- * the MMU and must instead generate an identity mapped
- * page table for all of the memory.
- *
- * Given the low demand this implements a very simple
- * allocator that finds the first hole of the appropriate
- * size in the reserved memory region, and allocates all
- * of the memory up to and including the hole.
- */
- unsigned long hole_start, hole_end, size;
- struct page *pages;
-
- pages = NULL;
- size = (1 << order) << PAGE_SHIFT;
- hole_start = (image->control_page + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- while (hole_end <= crashk_res.end) {
- unsigned long i;
-
- if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
- break;
- /* See if I overlap any of the segments */
- for (i = 0; i < image->nr_segments; i++) {
- unsigned long mstart, mend;
-
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- if ((hole_end >= mstart) && (hole_start <= mend)) {
- /* Advance the hole to the end of the segment */
- hole_start = (mend + (size - 1)) & ~(size - 1);
- hole_end = hole_start + size - 1;
- break;
- }
- }
- /* If I don't overlap any segments I have found my hole! */
- if (i == image->nr_segments) {
- pages = pfn_to_page(hole_start >> PAGE_SHIFT);
- break;
- }
- }
- if (pages)
- image->control_page = hole_end;
-
- return pages;
-}
-
-
-struct page *kimage_alloc_control_pages(struct kimage *image,
- unsigned int order)
-{
- struct page *pages = NULL;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- pages = kimage_alloc_normal_control_pages(image, order);
- break;
- case KEXEC_TYPE_CRASH:
- pages = kimage_alloc_crash_control_pages(image, order);
- break;
- }
-
- return pages;
-}
-
-static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
-{
- if (*image->entry != 0)
- image->entry++;
-
- if (image->entry == image->last_entry) {
- kimage_entry_t *ind_page;
- struct page *page;
-
- page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
- if (!page)
- return -ENOMEM;
-
- ind_page = page_address(page);
- *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
- image->entry = ind_page;
- image->last_entry = ind_page +
- ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
- }
- *image->entry = entry;
- image->entry++;
- *image->entry = 0;
-
- return 0;
-}
-
-static int kimage_set_destination(struct kimage *image,
- unsigned long destination)
-{
- int result;
-
- destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
-
- return result;
-}
-
-
-static int kimage_add_page(struct kimage *image, unsigned long page)
-{
- int result;
-
- page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
-
- return result;
-}
-
-
-static void kimage_free_extra_pages(struct kimage *image)
-{
- /* Walk through and free any extra destination pages I may have */
- kimage_free_page_list(&image->dest_pages);
-
- /* Walk through and free any unusable pages I have cached */
- kimage_free_page_list(&image->unusable_pages);
-
-}
-static void kimage_terminate(struct kimage *image)
-{
- if (*image->entry != 0)
- image->entry++;
-
- *image->entry = IND_DONE;
-}
-
-#define for_each_kimage_entry(image, ptr, entry) \
- for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
- ptr = (entry & IND_INDIRECTION) ? \
- phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
-
-static void kimage_free_entry(kimage_entry_t entry)
-{
- struct page *page;
-
- page = pfn_to_page(entry >> PAGE_SHIFT);
- kimage_free_pages(page);
-}
-
-static void kimage_free(struct kimage *image)
-{
- kimage_entry_t *ptr, entry;
- kimage_entry_t ind = 0;
-
- if (!image)
- return;
-
- kimage_free_extra_pages(image);
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_INDIRECTION) {
- /* Free the previous indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
- /* Save this indirection page until we are
- * done with it.
- */
- ind = entry;
- } else if (entry & IND_SOURCE)
- kimage_free_entry(entry);
- }
- /* Free the final indirection page */
- if (ind & IND_INDIRECTION)
- kimage_free_entry(ind);
-
- /* Handle any machine specific cleanup */
- machine_kexec_cleanup(image);
-
- /* Free the kexec control pages... */
- kimage_free_page_list(&image->control_pages);
-
- /*
- * Free up any temporary buffers allocated. This might hit if
- * error occurred much later after buffer allocation.
- */
- if (image->file_mode)
- kimage_file_post_load_cleanup(image);
-
- kfree(image);
-}
-
-static kimage_entry_t *kimage_dst_used(struct kimage *image,
- unsigned long page)
-{
- kimage_entry_t *ptr, entry;
- unsigned long destination = 0;
-
- for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_DESTINATION)
- destination = entry & PAGE_MASK;
- else if (entry & IND_SOURCE) {
- if (page == destination)
- return ptr;
- destination += PAGE_SIZE;
- }
- }
-
- return NULL;
-}
-
-static struct page *kimage_alloc_page(struct kimage *image,
- gfp_t gfp_mask,
- unsigned long destination)
-{
- /*
- * Here we implement safeguards to ensure that a source page
- * is not copied to its destination page before the data on
- * the destination page is no longer useful.
- *
- * To do this we maintain the invariant that a source page is
- * either its own destination page, or it is not a
- * destination page at all.
- *
- * That is slightly stronger than required, but the proof
- * that no problems will not occur is trivial, and the
- * implementation is simply to verify.
- *
- * When allocating all pages normally this algorithm will run
- * in O(N) time, but in the worst case it will run in O(N^2)
- * time. If the runtime is a problem the data structures can
- * be fixed.
- */
- struct page *page;
- unsigned long addr;
-
- /*
- * Walk through the list of destination pages, and see if I
- * have a match.
- */
- list_for_each_entry(page, &image->dest_pages, lru) {
- addr = page_to_pfn(page) << PAGE_SHIFT;
- if (addr == destination) {
- list_del(&page->lru);
- return page;
- }
- }
- page = NULL;
- while (1) {
- kimage_entry_t *old;
-
- /* Allocate a page, if we run out of memory give up */
- page = kimage_alloc_pages(gfp_mask, 0);
- if (!page)
- return NULL;
- /* If the page cannot be used file it away */
- if (page_to_pfn(page) >
- (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
- list_add(&page->lru, &image->unusable_pages);
- continue;
- }
- addr = page_to_pfn(page) << PAGE_SHIFT;
-
- /* If it is the destination page we want use it */
- if (addr == destination)
- break;
-
- /* If the page is not a destination page use it */
- if (!kimage_is_destination_range(image, addr,
- addr + PAGE_SIZE))
- break;
-
- /*
- * I know that the page is someones destination page.
- * See if there is already a source page for this
- * destination page. And if so swap the source pages.
- */
- old = kimage_dst_used(image, addr);
- if (old) {
- /* If so move it */
- unsigned long old_addr;
- struct page *old_page;
-
- old_addr = *old & PAGE_MASK;
- old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
- copy_highpage(page, old_page);
- *old = addr | (*old & ~PAGE_MASK);
-
- /* The old page I have found cannot be a
- * destination page, so return it if it's
- * gfp_flags honor the ones passed in.
- */
- if (!(gfp_mask & __GFP_HIGHMEM) &&
- PageHighMem(old_page)) {
- kimage_free_pages(old_page);
- continue;
- }
- addr = old_addr;
- page = old_page;
- break;
- } else {
- /* Place the page on the destination list I
- * will use it later.
- */
- list_add(&page->lru, &image->dest_pages);
- }
- }
-
- return page;
-}
-
-static int kimage_load_normal_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
-
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
-
- result = kimage_set_destination(image, maddr);
- if (result < 0)
- goto out;
-
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
-
- page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- result = kimage_add_page(image, page_to_pfn(page)
- << PAGE_SHIFT);
- if (result < 0)
- goto out;
-
- ptr = kmap(page);
- /* Start with a clear page */
- clear_page(ptr);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
-
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kunmap(page);
- if (result) {
- result = -EFAULT;
- goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
- }
-out:
- return result;
-}
-
-static int kimage_load_crash_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- /* For crash dumps kernels we simply copy the data from
- * user space to it's destination.
- * We do things a page at a time for the sake of kmap.
- */
- unsigned long maddr;
- size_t ubytes, mbytes;
- int result;
- unsigned char __user *buf = NULL;
- unsigned char *kbuf = NULL;
-
- result = 0;
- if (image->file_mode)
- kbuf = segment->kbuf;
- else
- buf = segment->buf;
- ubytes = segment->bufsz;
- mbytes = segment->memsz;
- maddr = segment->mem;
- while (mbytes) {
- struct page *page;
- char *ptr;
- size_t uchunk, mchunk;
-
- page = pfn_to_page(maddr >> PAGE_SHIFT);
- if (!page) {
- result = -ENOMEM;
- goto out;
- }
- ptr = kmap(page);
- ptr += maddr & ~PAGE_MASK;
- mchunk = min_t(size_t, mbytes,
- PAGE_SIZE - (maddr & ~PAGE_MASK));
- uchunk = min(ubytes, mchunk);
- if (mchunk > uchunk) {
- /* Zero the trailing part of the page */
- memset(ptr + uchunk, 0, mchunk - uchunk);
- }
-
- /* For file based kexec, source pages are in kernel memory */
- if (image->file_mode)
- memcpy(ptr, kbuf, uchunk);
- else
- result = copy_from_user(ptr, buf, uchunk);
- kexec_flush_icache_page(page);
- kunmap(page);
- if (result) {
- result = -EFAULT;
- goto out;
- }
- ubytes -= uchunk;
- maddr += mchunk;
- if (image->file_mode)
- kbuf += mchunk;
- else
- buf += mchunk;
- mbytes -= mchunk;
- }
-out:
- return result;
-}
-
-static int kimage_load_segment(struct kimage *image,
- struct kexec_segment *segment)
-{
- int result = -ENOMEM;
-
- switch (image->type) {
- case KEXEC_TYPE_DEFAULT:
- result = kimage_load_normal_segment(image, segment);
- break;
- case KEXEC_TYPE_CRASH:
- result = kimage_load_crash_segment(image, segment);
- break;
- }
-
- return result;
-}
-
/*
* Exec Kernel system call: for obvious reasons only root may call it.
*
@@ -1241,11 +123,6 @@ static int kimage_load_segment(struct kimage *image,
* kexec does not sync, or unmount filesystems so if you need
* that to happen you need to do that yourself.
*/
-struct kimage *kexec_image;
-struct kimage *kexec_crash_image;
-int kexec_load_disabled;
-
-static DEFINE_MUTEX(kexec_mutex);
SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
struct kexec_segment __user *, segments, unsigned long, flags)
@@ -1340,18 +217,6 @@ out:
return result;
}
-/*
- * Add and remove page tables for crashkernel memory
- *
- * Provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
compat_ulong_t, nr_segments,
@@ -1390,1391 +255,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
-
-#ifdef CONFIG_KEXEC_FILE
-SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
- unsigned long, cmdline_len, const char __user *, cmdline_ptr,
- unsigned long, flags)
-{
- int ret = 0, i;
- struct kimage **dest_image, *image;
-
- /* We only trust the superuser with rebooting the system. */
- if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
- return -EPERM;
-
- /* Make sure we have a legal set of flags */
- if (flags != (flags & KEXEC_FILE_FLAGS))
- return -EINVAL;
-
- image = NULL;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
-
- dest_image = &kexec_image;
- if (flags & KEXEC_FILE_ON_CRASH)
- dest_image = &kexec_crash_image;
-
- if (flags & KEXEC_FILE_UNLOAD)
- goto exchange;
-
- /*
- * In case of crash, new kernel gets loaded in reserved region. It is
- * same memory where old crash kernel might be loaded. Free any
- * current crash dump kernel before we corrupt it.
- */
- if (flags & KEXEC_FILE_ON_CRASH)
- kimage_free(xchg(&kexec_crash_image, NULL));
-
- ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
- cmdline_len, flags);
- if (ret)
- goto out;
-
- ret = machine_kexec_prepare(image);
- if (ret)
- goto out;
-
- ret = kexec_calculate_store_digests(image);
- if (ret)
- goto out;
-
- for (i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
- i, ksegment->buf, ksegment->bufsz, ksegment->mem,
- ksegment->memsz);
-
- ret = kimage_load_segment(image, &image->segment[i]);
- if (ret)
- goto out;
- }
-
- kimage_terminate(image);
-
- /*
- * Free up any temporary buffers allocated which are not needed
- * after image has been loaded
- */
- kimage_file_post_load_cleanup(image);
-exchange:
- image = xchg(dest_image, image);
-out:
- mutex_unlock(&kexec_mutex);
- kimage_free(image);
- return ret;
-}
-
-#endif /* CONFIG_KEXEC_FILE */
-
-void crash_kexec(struct pt_regs *regs)
-{
- /* Take the kexec_mutex here to prevent sys_kexec_load
- * running on one cpu from replacing the crash kernel
- * we are using after a panic on a different cpu.
- *
- * If the crash kernel was not located in a fixed area
- * of memory the xchg(&kexec_crash_image) would be
- * sufficient. But since I reuse the memory...
- */
- if (mutex_trylock(&kexec_mutex)) {
- if (kexec_crash_image) {
- struct pt_regs fixed_regs;
-
- crash_setup_regs(&fixed_regs, regs);
- crash_save_vmcoreinfo();
- machine_crash_shutdown(&fixed_regs);
- machine_kexec(kexec_crash_image);
- }
- mutex_unlock(&kexec_mutex);
- }
-}
-
-size_t crash_get_memory_size(void)
-{
- size_t size = 0;
- mutex_lock(&kexec_mutex);
- if (crashk_res.end != crashk_res.start)
- size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex);
- return size;
-}
-
-void __weak crash_free_reserved_phys_range(unsigned long begin,
- unsigned long end)
-{
- unsigned long addr;
-
- for (addr = begin; addr < end; addr += PAGE_SIZE)
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
-}
-
-int crash_shrink_memory(unsigned long new_size)
-{
- int ret = 0;
- unsigned long start, end;
- unsigned long old_size;
- struct resource *ram_res;
-
- mutex_lock(&kexec_mutex);
-
- if (kexec_crash_image) {
- ret = -ENOENT;
- goto unlock;
- }
- start = crashk_res.start;
- end = crashk_res.end;
- old_size = (end == 0) ? 0 : end - start + 1;
- if (new_size >= old_size) {
- ret = (new_size == old_size) ? 0 : -EINVAL;
- goto unlock;
- }
-
- ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
- if (!ram_res) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
- end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
-
- crash_map_reserved_pages();
- crash_free_reserved_phys_range(end, crashk_res.end);
-
- if ((start == end) && (crashk_res.parent != NULL))
- release_resource(&crashk_res);
-
- ram_res->start = end;
- ram_res->end = crashk_res.end;
- ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
- ram_res->name = "System RAM";
-
- crashk_res.end = end - 1;
-
- insert_resource(&iomem_resource, ram_res);
- crash_unmap_reserved_pages();
-
-unlock:
- mutex_unlock(&kexec_mutex);
- return ret;
-}
-
-static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
- size_t data_len)
-{
- struct elf_note note;
-
- note.n_namesz = strlen(name) + 1;
- note.n_descsz = data_len;
- note.n_type = type;
- memcpy(buf, &note, sizeof(note));
- buf += (sizeof(note) + 3)/4;
- memcpy(buf, name, note.n_namesz);
- buf += (note.n_namesz + 3)/4;
- memcpy(buf, data, note.n_descsz);
- buf += (note.n_descsz + 3)/4;
-
- return buf;
-}
-
-static void final_note(u32 *buf)
-{
- struct elf_note note;
-
- note.n_namesz = 0;
- note.n_descsz = 0;
- note.n_type = 0;
- memcpy(buf, &note, sizeof(note));
-}
-
-void crash_save_cpu(struct pt_regs *regs, int cpu)
-{
- struct elf_prstatus prstatus;
- u32 *buf;
-
- if ((cpu < 0) || (cpu >= nr_cpu_ids))
- return;
-
- /* Using ELF notes here is opportunistic.
- * I need a well defined structure format
- * for the data I pass, and I need tags
- * on the data to indicate what information I have
- * squirrelled away. ELF notes happen to provide
- * all of that, so there is no need to invent something new.
- */
- buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
- if (!buf)
- return;
- memset(&prstatus, 0, sizeof(prstatus));
- prstatus.pr_pid = current->pid;
- elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
- &prstatus, sizeof(prstatus));
- final_note(buf);
-}
-
-static int __init crash_notes_memory_init(void)
-{
- /* Allocate memory for saving cpu registers. */
- crash_notes = alloc_percpu(note_buf_t);
- if (!crash_notes) {
- pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
- return -ENOMEM;
- }
- return 0;
-}
-subsys_initcall(crash_notes_memory_init);
-
-
-/*
- * parsing the "crashkernel" commandline
- *
- * this code is intended to be called from architecture specific code
- */
-
-
-/*
- * This function parses command lines in the format
- *
- * crashkernel=ramsize-range:size[,...][@offset]
- *
- * The function returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_mem(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline, *tmp;
-
- /* for each entry of the comma-separated list */
- do {
- unsigned long long start, end = ULLONG_MAX, size;
-
- /* get the start of the range */
- start = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (*cur != '-') {
- pr_warn("crashkernel: '-' expected\n");
- return -EINVAL;
- }
- cur++;
-
- /* if no ':' is here, than we read the end */
- if (*cur != ':') {
- end = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("crashkernel: Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (end <= start) {
- pr_warn("crashkernel: end <= start\n");
- return -EINVAL;
- }
- }
-
- if (*cur != ':') {
- pr_warn("crashkernel: ':' expected\n");
- return -EINVAL;
- }
- cur++;
-
- size = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected\n");
- return -EINVAL;
- }
- cur = tmp;
- if (size >= system_ram) {
- pr_warn("crashkernel: invalid size\n");
- return -EINVAL;
- }
-
- /* match ? */
- if (system_ram >= start && system_ram < end) {
- *crash_size = size;
- break;
- }
- } while (*cur++ == ',');
-
- if (*crash_size > 0) {
- while (*cur && *cur != ' ' && *cur != '@')
- cur++;
- if (*cur == '@') {
- cur++;
- *crash_base = memparse(cur, &tmp);
- if (cur == tmp) {
- pr_warn("Memory value expected after '@'\n");
- return -EINVAL;
- }
- }
- }
-
- return 0;
-}
-
-/*
- * That function parses "simple" (old) crashkernel command lines like
- *
- * crashkernel=size[@offset]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_simple(char *cmdline,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- if (*cur == '@')
- *crash_base = memparse(cur+1, &cur);
- else if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-#define SUFFIX_HIGH 0
-#define SUFFIX_LOW 1
-#define SUFFIX_NULL 2
-static __initdata char *suffix_tbl[] = {
- [SUFFIX_HIGH] = ",high",
- [SUFFIX_LOW] = ",low",
- [SUFFIX_NULL] = NULL,
-};
-
-/*
- * That function parses "suffix" crashkernel command lines like
- *
- * crashkernel=size,[high|low]
- *
- * It returns 0 on success and -EINVAL on failure.
- */
-static int __init parse_crashkernel_suffix(char *cmdline,
- unsigned long long *crash_size,
- const char *suffix)
-{
- char *cur = cmdline;
-
- *crash_size = memparse(cmdline, &cur);
- if (cmdline == cur) {
- pr_warn("crashkernel: memory value expected\n");
- return -EINVAL;
- }
-
- /* check with suffix */
- if (strncmp(cur, suffix, strlen(suffix))) {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
- cur += strlen(suffix);
- if (*cur != ' ' && *cur != '\0') {
- pr_warn("crashkernel: unrecognized char\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-static __init char *get_last_crashkernel(char *cmdline,
- const char *name,
- const char *suffix)
-{
- char *p = cmdline, *ck_cmdline = NULL;
-
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- char *end_p = strchr(p, ' ');
- char *q;
-
- if (!end_p)
- end_p = p + strlen(p);
-
- if (!suffix) {
- int i;
-
- /* skip the one with any known suffix */
- for (i = 0; suffix_tbl[i]; i++) {
- q = end_p - strlen(suffix_tbl[i]);
- if (!strncmp(q, suffix_tbl[i],
- strlen(suffix_tbl[i])))
- goto next;
- }
- ck_cmdline = p;
- } else {
- q = end_p - strlen(suffix);
- if (!strncmp(q, suffix, strlen(suffix)))
- ck_cmdline = p;
- }
-next:
- p = strstr(p+1, name);
- }
-
- if (!ck_cmdline)
- return NULL;
-
- return ck_cmdline;
-}
-
-static int __init __parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base,
- const char *name,
- const char *suffix)
-{
- char *first_colon, *first_space;
- char *ck_cmdline;
-
- BUG_ON(!crash_size || !crash_base);
- *crash_size = 0;
- *crash_base = 0;
-
- ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
-
- if (!ck_cmdline)
- return -EINVAL;
-
- ck_cmdline += strlen(name);
-
- if (suffix)
- return parse_crashkernel_suffix(ck_cmdline, crash_size,
- suffix);
- /*
- * if the commandline contains a ':', then that's the extended
- * syntax -- if not, it must be the classic syntax
- */
- first_colon = strchr(ck_cmdline, ':');
- first_space = strchr(ck_cmdline, ' ');
- if (first_colon && (!first_space || first_colon < first_space))
- return parse_crashkernel_mem(ck_cmdline, system_ram,
- crash_size, crash_base);
-
- return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
-}
-
-/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
- */
-int __init parse_crashkernel(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", NULL);
-}
-
-int __init parse_crashkernel_high(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
-}
-
-int __init parse_crashkernel_low(char *cmdline,
- unsigned long long system_ram,
- unsigned long long *crash_size,
- unsigned long long *crash_base)
-{
- return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=", suffix_tbl[SUFFIX_LOW]);
-}
-
-static void update_vmcoreinfo_note(void)
-{
- u32 *buf = vmcoreinfo_note;
-
- if (!vmcoreinfo_size)
- return;
- buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
- vmcoreinfo_size);
- final_note(buf);
-}
-
-void crash_save_vmcoreinfo(void)
-{
- vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
- update_vmcoreinfo_note();
-}
-
-void vmcoreinfo_append_str(const char *fmt, ...)
-{
- va_list args;
- char buf[0x50];
- size_t r;
-
- va_start(args, fmt);
- r = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
-
- r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
-
- memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
-
- vmcoreinfo_size += r;
-}
-
-/*
- * provide an empty default implementation here -- architecture
- * code may override this
- */
-void __weak arch_crash_save_vmcoreinfo(void)
-{}
-
-unsigned long __weak paddr_vmcoreinfo_note(void)
-{
- return __pa((unsigned long)(char *)&vmcoreinfo_note);
-}
-
-static int __init crash_save_vmcoreinfo_init(void)
-{
- VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
- VMCOREINFO_PAGESIZE(PAGE_SIZE);
-
- VMCOREINFO_SYMBOL(init_uts_ns);
- VMCOREINFO_SYMBOL(node_online_map);
-#ifdef CONFIG_MMU
- VMCOREINFO_SYMBOL(swapper_pg_dir);
-#endif
- VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmap_area_list);
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- VMCOREINFO_SYMBOL(mem_map);
- VMCOREINFO_SYMBOL(contig_page_data);
-#endif
-#ifdef CONFIG_SPARSEMEM
- VMCOREINFO_SYMBOL(mem_section);
- VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
- VMCOREINFO_STRUCT_SIZE(mem_section);
- VMCOREINFO_OFFSET(mem_section, section_mem_map);
-#endif
- VMCOREINFO_STRUCT_SIZE(page);
- VMCOREINFO_STRUCT_SIZE(pglist_data);
- VMCOREINFO_STRUCT_SIZE(zone);
- VMCOREINFO_STRUCT_SIZE(free_area);
- VMCOREINFO_STRUCT_SIZE(list_head);
- VMCOREINFO_SIZE(nodemask_t);
- VMCOREINFO_OFFSET(page, flags);
- VMCOREINFO_OFFSET(page, _count);
- VMCOREINFO_OFFSET(page, mapping);
- VMCOREINFO_OFFSET(page, lru);
- VMCOREINFO_OFFSET(page, _mapcount);
- VMCOREINFO_OFFSET(page, private);
- VMCOREINFO_OFFSET(pglist_data, node_zones);
- VMCOREINFO_OFFSET(pglist_data, nr_zones);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- VMCOREINFO_OFFSET(pglist_data, node_mem_map);
-#endif
- VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
- VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
- VMCOREINFO_OFFSET(pglist_data, node_id);
- VMCOREINFO_OFFSET(zone, free_area);
- VMCOREINFO_OFFSET(zone, vm_stat);
- VMCOREINFO_OFFSET(zone, spanned_pages);
- VMCOREINFO_OFFSET(free_area, free_list);
- VMCOREINFO_OFFSET(list_head, next);
- VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vmap_area, va_start);
- VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
- log_buf_kexec_setup();
- VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
- VMCOREINFO_NUMBER(NR_FREE_PAGES);
- VMCOREINFO_NUMBER(PG_lru);
- VMCOREINFO_NUMBER(PG_private);
- VMCOREINFO_NUMBER(PG_swapcache);
- VMCOREINFO_NUMBER(PG_slab);
-#ifdef CONFIG_MEMORY_FAILURE
- VMCOREINFO_NUMBER(PG_hwpoison);
-#endif
- VMCOREINFO_NUMBER(PG_head_mask);
- VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
-#ifdef CONFIG_HUGETLBFS
- VMCOREINFO_SYMBOL(free_huge_page);
-#endif
-
- arch_crash_save_vmcoreinfo();
- update_vmcoreinfo_note();
-
- return 0;
-}
-
-subsys_initcall(crash_save_vmcoreinfo_init);
-
-#ifdef CONFIG_KEXEC_FILE
-static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_end = min(end, kbuf->buf_max);
- temp_start = temp_end - kbuf->memsz;
-
- do {
- /* align down start */
- temp_start = temp_start & (~(kbuf->buf_align - 1));
-
- if (temp_start < start || temp_start < kbuf->buf_min)
- return 0;
-
- temp_end = temp_start + kbuf->memsz - 1;
-
- /*
- * Make sure this does not conflict with any of existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start - PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
-
-static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
- struct kexec_buf *kbuf)
-{
- struct kimage *image = kbuf->image;
- unsigned long temp_start, temp_end;
-
- temp_start = max(start, kbuf->buf_min);
-
- do {
- temp_start = ALIGN(temp_start, kbuf->buf_align);
- temp_end = temp_start + kbuf->memsz - 1;
-
- if (temp_end > end || temp_end > kbuf->buf_max)
- return 0;
- /*
- * Make sure this does not conflict with any of existing
- * segments
- */
- if (kimage_is_destination_range(image, temp_start, temp_end)) {
- temp_start = temp_start + PAGE_SIZE;
- continue;
- }
-
- /* We found a suitable memory range */
- break;
- } while (1);
-
- /* If we are here, we found a suitable memory range */
- kbuf->mem = temp_start;
-
- /* Success, stop navigating through remaining System RAM ranges */
- return 1;
-}
-
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
-{
- struct kexec_buf *kbuf = (struct kexec_buf *)arg;
- unsigned long sz = end - start + 1;
-
- /* Returning 0 will take to next memory range */
- if (sz < kbuf->memsz)
- return 0;
-
- if (end < kbuf->buf_min || start > kbuf->buf_max)
- return 0;
-
- /*
- * Allocate memory top down with-in ram range. Otherwise bottom up
- * allocation.
- */
- if (kbuf->top_down)
- return locate_mem_hole_top_down(start, end, kbuf);
- return locate_mem_hole_bottom_up(start, end, kbuf);
-}
-
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
- */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
- unsigned long memsz, unsigned long buf_align,
- unsigned long buf_min, unsigned long buf_max,
- bool top_down, unsigned long *load_addr)
-{
-
- struct kexec_segment *ksegment;
- struct kexec_buf buf, *kbuf;
- int ret;
-
- /* Currently adding segment this way is allowed only in file mode */
- if (!image->file_mode)
- return -EINVAL;
-
- if (image->nr_segments >= KEXEC_SEGMENT_MAX)
- return -EINVAL;
-
- /*
- * Make sure we are not trying to add buffer after allocating
- * control pages. All segments need to be placed first before
- * any control pages are allocated. As control page allocation
- * logic goes through list of segments to make sure there are
- * no destination overlaps.
- */
- if (!list_empty(&image->control_pages)) {
- WARN_ON(1);
- return -EINVAL;
- }
-
- memset(&buf, 0, sizeof(struct kexec_buf));
- kbuf = &buf;
- kbuf->image = image;
- kbuf->buffer = buffer;
- kbuf->bufsz = bufsz;
-
- kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
- kbuf->buf_align = max(buf_align, PAGE_SIZE);
- kbuf->buf_min = buf_min;
- kbuf->buf_max = buf_max;
- kbuf->top_down = top_down;
-
- /* Walk the RAM ranges and allocate a suitable range for the buffer */
- if (image->type == KEXEC_TYPE_CRASH)
- ret = walk_iomem_res("Crash kernel",
- IORESOURCE_MEM | IORESOURCE_BUSY,
- crashk_res.start, crashk_res.end, kbuf,
- locate_mem_hole_callback);
- else
- ret = walk_system_ram_res(0, -1, kbuf,
- locate_mem_hole_callback);
- if (ret != 1) {
- /* A suitable memory range could not be found for buffer */
- return -EADDRNOTAVAIL;
- }
-
- /* Found a suitable memory range */
- ksegment = &image->segment[image->nr_segments];
- ksegment->kbuf = kbuf->buffer;
- ksegment->bufsz = kbuf->bufsz;
- ksegment->mem = kbuf->mem;
- ksegment->memsz = kbuf->memsz;
- image->nr_segments++;
- *load_addr = ksegment->mem;
- return 0;
-}
-
-/* Calculate and store the digest of segments */
-static int kexec_calculate_store_digests(struct kimage *image)
-{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- int ret = 0, i, j, zero_buf_sz, sha_region_sz;
- size_t desc_size, nullsz;
- char *digest;
- void *zero_buf;
- struct kexec_sha_region *sha_regions;
- struct purgatory_info *pi = &image->purgatory_info;
-
- zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
- zero_buf_sz = PAGE_SIZE;
-
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto out;
- }
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- desc = kzalloc(desc_size, GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto out_free_tfm;
- }
-
- sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
- sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions)
- goto out_free_desc;
-
- desc->tfm = tfm;
- desc->flags = 0;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto out_free_sha_regions;
-
- digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
- if (!digest) {
- ret = -ENOMEM;
- goto out_free_sha_regions;
- }
-
- for (j = i = 0; i < image->nr_segments; i++) {
- struct kexec_segment *ksegment;
-
- ksegment = &image->segment[i];
- /*
- * Skip purgatory as it will be modified once we put digest
- * info in purgatory.
- */
- if (ksegment->kbuf == pi->purgatory_buf)
- continue;
-
- ret = crypto_shash_update(desc, ksegment->kbuf,
- ksegment->bufsz);
- if (ret)
- break;
-
- /*
- * Assume rest of the buffer is filled with zero and
- * update digest accordingly.
- */
- nullsz = ksegment->memsz - ksegment->bufsz;
- while (nullsz) {
- unsigned long bytes = nullsz;
-
- if (bytes > zero_buf_sz)
- bytes = zero_buf_sz;
- ret = crypto_shash_update(desc, zero_buf, bytes);
- if (ret)
- break;
- nullsz -= bytes;
- }
-
- if (ret)
- break;
-
- sha_regions[j].start = ksegment->mem;
- sha_regions[j].len = ksegment->memsz;
- j++;
- }
-
- if (!ret) {
- ret = crypto_shash_final(desc, digest);
- if (ret)
- goto out_free_digest;
- ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
- sha_regions, sha_region_sz, 0);
- if (ret)
- goto out_free_digest;
-
- ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
- digest, SHA256_DIGEST_SIZE, 0);
- if (ret)
- goto out_free_digest;
- }
-
-out_free_digest:
- kfree(digest);
-out_free_sha_regions:
- vfree(sha_regions);
-out_free_desc:
- kfree(desc);
-out_free_tfm:
- kfree(tfm);
-out:
- return ret;
-}
-
-/* Actually load purgatory. Lot of code taken from kexec-tools */
-static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
- unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
- unsigned char *buf_addr, *src;
- int i, ret = 0, entry_sidx = -1;
- const Elf_Shdr *sechdrs_c;
- Elf_Shdr *sechdrs = NULL;
- void *purgatory_buf = NULL;
-
- /*
- * sechdrs_c points to section headers in purgatory and are read
- * only. No modifications allowed.
- */
- sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
-
- /*
- * We can not modify sechdrs_c[] and its fields. It is read only.
- * Copy it over to a local copy where one can store some temporary
- * data and free it at the end. We need to modify ->sh_addr and
- * ->sh_offset fields to keep track of permanent and temporary
- * locations of sections.
- */
- sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
- if (!sechdrs)
- return -ENOMEM;
-
- memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
-
- /*
- * We seem to have multiple copies of sections. First copy is which
- * is embedded in kernel in read only section. Some of these sections
- * will be copied to a temporary buffer and relocated. And these
- * sections will finally be copied to their final destination at
- * segment load time.
- *
- * Use ->sh_offset to reflect section address in memory. It will
- * point to original read only copy if section is not allocatable.
- * Otherwise it will point to temporary copy which will be relocated.
- *
- * Use ->sh_addr to contain final address of the section where it
- * will go during execution time.
- */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type == SHT_NOBITS)
- continue;
-
- sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
- sechdrs[i].sh_offset;
- }
-
- /*
- * Identify entry point section and make entry relative to section
- * start.
- */
- entry = pi->ehdr->e_entry;
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- /* Make entry section relative */
- if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
- ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
- pi->ehdr->e_entry)) {
- entry_sidx = i;
- entry -= sechdrs[i].sh_addr;
- break;
- }
- }
-
- /* Determine how much memory is needed to load relocatable object. */
- buf_align = 1;
- bss_align = 1;
- buf_sz = 0;
- bss_sz = 0;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- if (buf_align < align)
- buf_align = align;
- buf_sz = ALIGN(buf_sz, align);
- buf_sz += sechdrs[i].sh_size;
- } else {
- /* bss section */
- if (bss_align < align)
- bss_align = align;
- bss_sz = ALIGN(bss_sz, align);
- bss_sz += sechdrs[i].sh_size;
- }
- }
-
- /* Determine the bss padding required to align bss properly */
- bss_pad = 0;
- if (buf_sz & (bss_align - 1))
- bss_pad = bss_align - (buf_sz & (bss_align - 1));
-
- memsz = buf_sz + bss_pad + bss_sz;
-
- /* Allocate buffer for purgatory */
- purgatory_buf = vzalloc(buf_sz);
- if (!purgatory_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (buf_align < bss_align)
- buf_align = bss_align;
-
- /* Add buffer to segment list */
- ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
- buf_align, min, max, top_down,
- &pi->purgatory_load_addr);
- if (ret)
- goto out;
-
- /* Load SHF_ALLOC sections */
- buf_addr = purgatory_buf;
- load_addr = curr_load_addr = pi->purgatory_load_addr;
- bss_addr = load_addr + buf_sz + bss_pad;
-
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- if (!(sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
-
- align = sechdrs[i].sh_addralign;
- if (sechdrs[i].sh_type != SHT_NOBITS) {
- curr_load_addr = ALIGN(curr_load_addr, align);
- offset = curr_load_addr - load_addr;
- /* We already modifed ->sh_offset to keep src addr */
- src = (char *) sechdrs[i].sh_offset;
- memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
-
- /* Store load address and source address of section */
- sechdrs[i].sh_addr = curr_load_addr;
-
- /*
- * This section got copied to temporary buffer. Update
- * ->sh_offset accordingly.
- */
- sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
-
- /* Advance to the next address */
- curr_load_addr += sechdrs[i].sh_size;
- } else {
- bss_addr = ALIGN(bss_addr, align);
- sechdrs[i].sh_addr = bss_addr;
- bss_addr += sechdrs[i].sh_size;
- }
- }
-
- /* Update entry point based on load address of text section */
- if (entry_sidx >= 0)
- entry += sechdrs[entry_sidx].sh_addr;
-
- /* Make kernel jump to purgatory after shutdown */
- image->start = entry;
-
- /* Used later to get/set symbol values */
- pi->sechdrs = sechdrs;
-
- /*
- * Used later to identify which section is purgatory and skip it
- * from checksumming.
- */
- pi->purgatory_buf = purgatory_buf;
- return ret;
-out:
- vfree(sechdrs);
- vfree(purgatory_buf);
- return ret;
-}
-
-static int kexec_apply_relocations(struct kimage *image)
-{
- int i, ret;
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Shdr *sechdrs = pi->sechdrs;
-
- /* Apply relocations */
- for (i = 0; i < pi->ehdr->e_shnum; i++) {
- Elf_Shdr *section, *symtab;
-
- if (sechdrs[i].sh_type != SHT_RELA &&
- sechdrs[i].sh_type != SHT_REL)
- continue;
-
- /*
- * For section of type SHT_RELA/SHT_REL,
- * ->sh_link contains section header index of associated
- * symbol table. And ->sh_info contains section header
- * index of section to which relocations apply.
- */
- if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
- sechdrs[i].sh_link >= pi->ehdr->e_shnum)
- return -ENOEXEC;
-
- section = &sechdrs[sechdrs[i].sh_info];
- symtab = &sechdrs[sechdrs[i].sh_link];
-
- if (!(section->sh_flags & SHF_ALLOC))
- continue;
-
- /*
- * symtab->sh_link contain section header index of associated
- * string table.
- */
- if (symtab->sh_link >= pi->ehdr->e_shnum)
- /* Invalid section number? */
- continue;
-
- /*
- * Respective architecture needs to provide support for applying
- * relocations of type SHT_RELA/SHT_REL.
- */
- if (sechdrs[i].sh_type == SHT_RELA)
- ret = arch_kexec_apply_relocations_add(pi->ehdr,
- sechdrs, i);
- else if (sechdrs[i].sh_type == SHT_REL)
- ret = arch_kexec_apply_relocations(pi->ehdr,
- sechdrs, i);
- if (ret)
- return ret;
- }
-
- return 0;
-}
-
-/* Load relocatable purgatory object and relocate it appropriately */
-int kexec_load_purgatory(struct kimage *image, unsigned long min,
- unsigned long max, int top_down,
- unsigned long *load_addr)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- int ret;
-
- if (kexec_purgatory_size <= 0)
- return -EINVAL;
-
- if (kexec_purgatory_size < sizeof(Elf_Ehdr))
- return -ENOEXEC;
-
- pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
-
- if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
- || pi->ehdr->e_type != ET_REL
- || !elf_check_arch(pi->ehdr)
- || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
-
- if (pi->ehdr->e_shoff >= kexec_purgatory_size
- || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
- kexec_purgatory_size - pi->ehdr->e_shoff))
- return -ENOEXEC;
-
- ret = __kexec_load_purgatory(image, min, max, top_down);
- if (ret)
- return ret;
-
- ret = kexec_apply_relocations(image);
- if (ret)
- goto out;
-
- *load_addr = pi->purgatory_load_addr;
- return 0;
-out:
- vfree(pi->sechdrs);
- vfree(pi->purgatory_buf);
- return ret;
-}
-
-static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
- const char *name)
-{
- Elf_Sym *syms;
- Elf_Shdr *sechdrs;
- Elf_Ehdr *ehdr;
- int i, k;
- const char *strtab;
-
- if (!pi->sechdrs || !pi->ehdr)
- return NULL;
-
- sechdrs = pi->sechdrs;
- ehdr = pi->ehdr;
-
- for (i = 0; i < ehdr->e_shnum; i++) {
- if (sechdrs[i].sh_type != SHT_SYMTAB)
- continue;
-
- if (sechdrs[i].sh_link >= ehdr->e_shnum)
- /* Invalid strtab section number */
- continue;
- strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
- syms = (Elf_Sym *)sechdrs[i].sh_offset;
-
- /* Go through symbols for a match */
- for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
- if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
- continue;
-
- if (strcmp(strtab + syms[k].st_name, name) != 0)
- continue;
-
- if (syms[k].st_shndx == SHN_UNDEF ||
- syms[k].st_shndx >= ehdr->e_shnum) {
- pr_debug("Symbol: %s has bad section index %d.\n",
- name, syms[k].st_shndx);
- return NULL;
- }
-
- /* Found the symbol we are looking for */
- return &syms[k];
- }
- }
-
- return NULL;
-}
-
-void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
-{
- struct purgatory_info *pi = &image->purgatory_info;
- Elf_Sym *sym;
- Elf_Shdr *sechdr;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return ERR_PTR(-EINVAL);
-
- sechdr = &pi->sechdrs[sym->st_shndx];
-
- /*
- * Returns the address where symbol will finally be loaded after
- * kexec_load_segment()
- */
- return (void *)(sechdr->sh_addr + sym->st_value);
-}
-
-/*
- * Get or set value of a symbol. If "get_value" is true, symbol value is
- * returned in buf otherwise symbol value is set based on value in buf.
- */
-int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
- void *buf, unsigned int size, bool get_value)
-{
- Elf_Sym *sym;
- Elf_Shdr *sechdrs;
- struct purgatory_info *pi = &image->purgatory_info;
- char *sym_buf;
-
- sym = kexec_purgatory_find_symbol(pi, name);
- if (!sym)
- return -EINVAL;
-
- if (sym->st_size != size) {
- pr_err("symbol %s size mismatch: expected %lu actual %u\n",
- name, (unsigned long)sym->st_size, size);
- return -EINVAL;
- }
-
- sechdrs = pi->sechdrs;
-
- if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
- pr_err("symbol %s is in a bss section. Cannot %s\n", name,
- get_value ? "get" : "set");
- return -EINVAL;
- }
-
- sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
- sym->st_value;
-
- if (get_value)
- memcpy((void *)buf, sym_buf, size);
- else
- memcpy((void *)sym_buf, buf, size);
-
- return 0;
-}
-#endif /* CONFIG_KEXEC_FILE */
-
-/*
- * Move into place and start executing a preloaded standalone
- * executable. If nothing was preloaded return an error.
- */
-int kernel_kexec(void)
-{
- int error = 0;
-
- if (!mutex_trylock(&kexec_mutex))
- return -EBUSY;
- if (!kexec_image) {
- error = -EINVAL;
- goto Unlock;
- }
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- lock_system_sleep();
- pm_prepare_console();
- error = freeze_processes();
- if (error) {
- error = -EBUSY;
- goto Restore_console;
- }
- suspend_console();
- error = dpm_suspend_start(PMSG_FREEZE);
- if (error)
- goto Resume_console;
- /* At this point, dpm_suspend_start() has been called,
- * but *not* dpm_suspend_end(). We *must* call
- * dpm_suspend_end() now. Otherwise, drivers for
- * some devices (e.g. interrupt controllers) become
- * desynchronized with the actual state of the
- * hardware at resume time, and evil weirdness ensues.
- */
- error = dpm_suspend_end(PMSG_FREEZE);
- if (error)
- goto Resume_devices;
- error = disable_nonboot_cpus();
- if (error)
- goto Enable_cpus;
- local_irq_disable();
- error = syscore_suspend();
- if (error)
- goto Enable_irqs;
- } else
-#endif
- {
- kexec_in_progress = true;
- kernel_restart_prepare(NULL);
- migrate_to_reboot_cpu();
-
- /*
- * migrate_to_reboot_cpu() disables CPU hotplug assuming that
- * no further code needs to use CPU hotplug (which is true in
- * the reboot case). However, the kexec path depends on using
- * CPU hotplug again; so re-enable it here.
- */
- cpu_hotplug_enable();
- pr_emerg("Starting new kernel\n");
- machine_shutdown();
- }
-
- machine_kexec(kexec_image);
-
-#ifdef CONFIG_KEXEC_JUMP
- if (kexec_image->preserve_context) {
- syscore_resume();
- Enable_irqs:
- local_irq_enable();
- Enable_cpus:
- enable_nonboot_cpus();
- dpm_resume_start(PMSG_RESTORE);
- Resume_devices:
- dpm_resume_end(PMSG_RESTORE);
- Resume_console:
- resume_console();
- thaw_processes();
- Restore_console:
- pm_restore_console();
- unlock_system_sleep();
- }
-#endif
-
- Unlock:
- mutex_unlock(&kexec_mutex);
- return error;
-}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..11b64a63c0f8
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1534 @@
+/*
+ * kexec_core.c - kexec system call core code.
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/syscore_ops.h>
+#include <linux/compiler.h>
+#include <linux/hugetlb.h>
+
+#include <asm/page.h>
+#include <asm/sections.h>
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include "kexec_internal.h"
+
+DEFINE_MUTEX(kexec_mutex);
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t __percpu *crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+struct resource crashk_low_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+ /*
+ * If crash_kexec_post_notifiers is enabled, don't run
+ * crash_kexec() here yet, which must be run after panic
+ * notifiers in panic().
+ */
+ if (crash_kexec_post_notifiers)
+ return 0;
+ /*
+ * There are 4 panic() calls in do_exit() path, each of which
+ * corresponds to each of these 4 conditions.
+ */
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
+ return 1;
+ return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses. On processors
+ * where you can disable the MMU this is trivial, and easy. For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place. This means I can only support memory whose
+ * physical address can fit in an unsigned long. In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages. As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it). The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ * - allocating a page table with the control code buffer identity
+ * mapped, to simplify machine_kexec and make kexec_on_panic more
+ * reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long dest);
+
+int sanity_check_segment_list(struct kimage *image)
+{
+ int result, i;
+ unsigned long nr_segments = image->nr_segments;
+
+ /*
+ * Verify we have good destination addresses. The caller is
+ * responsible for making certain we don't attempt to load
+ * the new image into invalid or reserved areas of RAM. This
+ * just verifies it is an address we can use.
+ *
+ * Since the kernel does everything in page size chunks ensure
+ * the destination addresses are page aligned. Too many
+ * special cases crop up when we don't do this. The most
+ * insidious is getting overlapping destination addresses
+ * simply because addresses are changed to page size
+ * granularity.
+ */
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+ return result;
+ if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+ return result;
+ }
+
+ /* Verify our destination addresses do not overlap.
+ * If we allowed overlapping destination addresses
+ * through, very weird things can happen with no
+ * easy explanation as one segment stomps on another.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+ unsigned long j;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ for (j = 0; j < i; j++) {
+ unsigned long pstart, pend;
+
+ pstart = image->segment[j].mem;
+ pend = pstart + image->segment[j].memsz;
+ /* Do the segments overlap ? */
+ if ((mend > pstart) && (mstart < pend))
+ return result;
+ }
+ }
+
+ /* Ensure our buffer sizes are no larger than
+ * our memory sizes. This should always be the case,
+ * and it is easier to check up front than to be surprised
+ * later on.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ if (image->segment[i].bufsz > image->segment[i].memsz)
+ return result;
+ }
+
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of ram. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) ||
+ (mend > crashk_res.end))
+ return result;
+ }
+ }
+
+ return 0;
+}
+
+struct kimage *do_kimage_alloc_init(void)
+{
+ struct kimage *image;
+
+ /* Allocate a controlling structure */
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ return NULL;
+
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unusable pages */
+ INIT_LIST_HEAD(&image->unusable_pages);
+
+ return image;
+}
+
+int kimage_is_destination_range(struct kimage *image,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((end > mstart) && (start < mend))
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
+{
+ struct page *pages;
+
+ pages = alloc_pages(gfp_mask, order);
+ if (pages) {
+ unsigned int count, i;
+
+ pages->mapping = NULL;
+ set_page_private(pages, order);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ SetPageReserved(pages + i);
+ }
+
+ return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+ unsigned int order, count, i;
+
+ order = page_private(page);
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ ClearPageReserved(page + i);
+ __free_pages(page, order);
+}
+
+void kimage_free_page_list(struct list_head *list)
+{
+ struct list_head *pos, *next;
+
+ list_for_each_safe(pos, next, list) {
+ struct page *page;
+
+ page = list_entry(pos, struct page, lru);
+ list_del(&page->lru);
+ kimage_free_pages(page);
+ }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * At worst this runs in O(N) of the image size.
+ */
+ struct list_head extra_pages;
+ struct page *pages;
+ unsigned int count;
+
+ count = 1 << order;
+ INIT_LIST_HEAD(&extra_pages);
+
+ /* Loop while I can allocate a page and the page allocated
+ * is a destination page.
+ */
+ do {
+ unsigned long pfn, epfn, addr, eaddr;
+
+ pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
+ if (!pages)
+ break;
+ pfn = page_to_pfn(pages);
+ epfn = pfn + count;
+ addr = pfn << PAGE_SHIFT;
+ eaddr = epfn << PAGE_SHIFT;
+ if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+ kimage_is_destination_range(image, addr, eaddr)) {
+ list_add(&pages->lru, &extra_pages);
+ pages = NULL;
+ }
+ } while (!pages);
+
+ if (pages) {
+ /* Remember the allocated page... */
+ list_add(&pages->lru, &image->control_pages);
+
+ /* Because the page is already in its destination
+ * location we will never allocate another page at
+ * that address. Therefore kimage_alloc_pages
+ * will not return it (again) and we don't need
+ * to give it an entry in image->segment[].
+ */
+ }
+ /* Deal with the destination pages I have inadvertently allocated.
+ *
+ * Ideally I would convert multi-page allocations into single
+ * page allocations, and add everything to image->dest_pages.
+ *
+ * For now it is simpler to just free the pages.
+ */
+ kimage_free_page_list(&extra_pages);
+
+ return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * Control pages are also the only pages we must allocate
+ * when loading a crash kernel. All of the other pages
+ * are specified by the segments and we just memcpy
+ * into them directly.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * Given the low demand this implements a very simple
+ * allocator that finds the first hole of the appropriate
+ * size in the reserved memory region, and allocates all
+ * of the memory up to and including the hole.
+ */
+ unsigned long hole_start, hole_end, size;
+ struct page *pages;
+
+ pages = NULL;
+ size = (1 << order) << PAGE_SHIFT;
+ hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ while (hole_end <= crashk_res.end) {
+ unsigned long i;
+
+ if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
+ break;
+ /* See if I overlap any of the segments */
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((hole_end >= mstart) && (hole_start <= mend)) {
+ /* Advance the hole to the end of the segment */
+ hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ break;
+ }
+ }
+ /* If I don't overlap any segments I have found my hole! */
+ if (i == image->nr_segments) {
+ pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+ image->control_page = hole_end;
+ break;
+ }
+ }
+
+ return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ struct page *pages = NULL;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ pages = kimage_alloc_normal_control_pages(image, order);
+ break;
+ case KEXEC_TYPE_CRASH:
+ pages = kimage_alloc_crash_control_pages(image, order);
+ break;
+ }
+
+ return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ if (image->entry == image->last_entry) {
+ kimage_entry_t *ind_page;
+ struct page *page;
+
+ page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+ if (!page)
+ return -ENOMEM;
+
+ ind_page = page_address(page);
+ *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ image->entry = ind_page;
+ image->last_entry = ind_page +
+ ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ }
+ *image->entry = entry;
+ image->entry++;
+ *image->entry = 0;
+
+ return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+ unsigned long destination)
+{
+ int result;
+
+ destination &= PAGE_MASK;
+ result = kimage_add_entry(image, destination | IND_DESTINATION);
+
+ return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+ int result;
+
+ page &= PAGE_MASK;
+ result = kimage_add_entry(image, page | IND_SOURCE);
+
+ return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+ /* Walk through and free any extra destination pages I may have */
+ kimage_free_page_list(&image->dest_pages);
+
+ /* Walk through and free any unusable pages I have cached */
+ kimage_free_page_list(&image->unusable_pages);
+
+}
+void kimage_terminate(struct kimage *image)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ *image->entry = IND_DONE;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ ptr = (entry & IND_INDIRECTION) ? \
+ phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+ struct page *page;
+
+ page = pfn_to_page(entry >> PAGE_SHIFT);
+ kimage_free_pages(page);
+}
+
+void kimage_free(struct kimage *image)
+{
+ kimage_entry_t *ptr, entry;
+ kimage_entry_t ind = 0;
+
+ if (!image)
+ return;
+
+ kimage_free_extra_pages(image);
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_INDIRECTION) {
+ /* Free the previous indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+ /* Save this indirection page until we are
+ * done with it.
+ */
+ ind = entry;
+ } else if (entry & IND_SOURCE)
+ kimage_free_entry(entry);
+ }
+ /* Free the final indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+
+ /* Handle any machine specific cleanup */
+ machine_kexec_cleanup(image);
+
+ /* Free the kexec control pages... */
+ kimage_free_page_list(&image->control_pages);
+
+ /*
+ * Free up any temporary buffers allocated. This might hit if
+ * error occurred much later after buffer allocation.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
+ kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+ unsigned long page)
+{
+ kimage_entry_t *ptr, entry;
+ unsigned long destination = 0;
+
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION)
+ destination = entry & PAGE_MASK;
+ else if (entry & IND_SOURCE) {
+ if (page == destination)
+ return ptr;
+ destination += PAGE_SIZE;
+ }
+ }
+
+ return NULL;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long destination)
+{
+ /*
+ * Here we implement safeguards to ensure that a source page
+ * is not copied to its destination page before the data on
+ * the destination page is no longer useful.
+ *
+ * To do this we maintain the invariant that a source page is
+ * either its own destination page, or it is not a
+ * destination page at all.
+ *
+ * That is slightly stronger than required, but the proof
+ * that no problems will occur is trivial, and the
+ * implementation is simple to verify.
+ *
+ * When allocating all pages normally this algorithm will run
+ * in O(N) time, but in the worst case it will run in O(N^2)
+ * time. If the runtime is a problem the data structures can
+ * be fixed.
+ */
+ struct page *page;
+ unsigned long addr;
+
+ /*
+ * Walk through the list of destination pages, and see if I
+ * have a match.
+ */
+ list_for_each_entry(page, &image->dest_pages, lru) {
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+ if (addr == destination) {
+ list_del(&page->lru);
+ return page;
+ }
+ }
+ page = NULL;
+ while (1) {
+ kimage_entry_t *old;
+
+ /* Allocate a page, if we run out of memory give up */
+ page = kimage_alloc_pages(gfp_mask, 0);
+ if (!page)
+ return NULL;
+ /* If the page cannot be used file it away */
+ if (page_to_pfn(page) >
+ (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ list_add(&page->lru, &image->unusable_pages);
+ continue;
+ }
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+
+ /* If it is the destination page we want use it */
+ if (addr == destination)
+ break;
+
+ /* If the page is not a destination page use it */
+ if (!kimage_is_destination_range(image, addr,
+ addr + PAGE_SIZE))
+ break;
+
+ /*
+ * I know that the page is someone's destination page.
+ * See if there is already a source page for this
+ * destination page. And if so swap the source pages.
+ */
+ old = kimage_dst_used(image, addr);
+ if (old) {
+ /* If so move it */
+ unsigned long old_addr;
+ struct page *old_page;
+
+ old_addr = *old & PAGE_MASK;
+ old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ copy_highpage(page, old_page);
+ *old = addr | (*old & ~PAGE_MASK);
+
+ /* The old page I have found cannot be a
+ * destination page, so return it if it's
+ * gfp_flags honor the ones passed in.
+ */
+ if (!(gfp_mask & __GFP_HIGHMEM) &&
+ PageHighMem(old_page)) {
+ kimage_free_pages(old_page);
+ continue;
+ }
+ addr = old_addr;
+ page = old_page;
+ break;
+ }
+ /* Place the page on the destination list, to be used later */
+ list_add(&page->lru, &image->dest_pages);
+ }
+
+ return page;
+}
+
+/*
+ * Load one segment of a KEXEC_TYPE_DEFAULT image.
+ *
+ * The segment is copied one page at a time into pages obtained from
+ * kimage_alloc_page(); every page is recorded with kimage_add_page().
+ * The data comes from segment->kbuf when image->file_mode is set and from
+ * the user-space buffer segment->buf otherwise.
+ *
+ * Returns 0 on success, -ENOMEM if a page cannot be allocated, -EFAULT if
+ * the copy from user space fails, or the negative value returned by
+ * kimage_set_destination() / kimage_add_page().
+ */
+static int kimage_load_normal_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ /* ubytes: source bytes left to copy; mbytes: destination bytes left */
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ result = kimage_set_destination(image, maddr);
+ if (result < 0)
+ goto out;
+
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ result = kimage_add_page(image, page_to_pfn(page)
+ << PAGE_SHIFT);
+ if (result < 0)
+ goto out;
+
+ ptr = kmap(page);
+ /* Start with a clear page */
+ clear_page(ptr);
+ ptr += maddr & ~PAGE_MASK;
+ /* mchunk: bytes of this page covered by the segment */
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ /* uchunk: how many of those bytes still have source data */
+ uchunk = min(ubytes, mchunk);
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ /*
+ * Once ubytes reaches 0 every later uchunk is 0 as well, so the
+ * remaining pages are left as cleared above.
+ */
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+/*
+ * Load one segment of a KEXEC_TYPE_CRASH image.
+ *
+ * The data is written straight to the pages at segment->mem; no source
+ * pages are allocated. Returns 0 on success, -ENOMEM if pfn_to_page()
+ * yields NULL, or -EFAULT if the copy from user space fails.
+ */
+static int kimage_load_crash_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ /* For crash dump kernels we simply copy the data from
+ * user space to its destination.
+ * We do things a page at a time for the sake of kmap.
+ */
+ unsigned long maddr;
+ size_t ubytes, mbytes;
+ int result;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
+
+ result = 0;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
+ /* ubytes: source bytes left to copy; mbytes: destination bytes left */
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = pfn_to_page(maddr >> PAGE_SHIFT);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ ptr = kmap(page);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+ if (mchunk > uchunk) {
+ /* Zero the trailing part of the page */
+ memset(ptr + uchunk, 0, mchunk - uchunk);
+ }
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
+ kexec_flush_icache_page(page);
+ kunmap(page);
+ if (result) {
+ result = -EFAULT;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+/*
+ * Load @segment into @image using the loader that matches image->type.
+ *
+ * Returns whatever the selected loader returns, or -ENOMEM when the image
+ * type is neither KEXEC_TYPE_DEFAULT nor KEXEC_TYPE_CRASH.
+ */
+int kimage_load_segment(struct kimage *image,
+			struct kexec_segment *segment)
+{
+	if (image->type == KEXEC_TYPE_DEFAULT)
+		return kimage_load_normal_segment(image, segment);
+
+	if (image->type == KEXEC_TYPE_CRASH)
+		return kimage_load_crash_segment(image, segment);
+
+	/* Unknown image type: no loader was run. */
+	return -ENOMEM;
+}
+
+/* Image started by kernel_kexec(); NULL when nothing has been loaded. */
+struct kimage *kexec_image;
+/* Image started by crash_kexec(); NULL when no crash kernel is loaded. */
+struct kimage *kexec_crash_image;
+/* When non-zero the kexec_file_load syscall is refused with -EPERM. */
+int kexec_load_disabled;
+
+/*
+ * Boot the loaded crash kernel, if there is one.
+ *
+ * Does nothing when kexec_mutex is already held or when no crash image is
+ * loaded.
+ */
+void crash_kexec(struct pt_regs *regs)
+{
+	/*
+	 * kexec_mutex keeps sys_kexec_load on another cpu from replacing
+	 * the crash kernel while we are about to jump into it after a
+	 * panic.
+	 *
+	 * Were the crash kernel not located in a fixed, reused area of
+	 * memory, an xchg(&kexec_crash_image) would have been enough.
+	 */
+	if (!mutex_trylock(&kexec_mutex))
+		return;
+
+	if (kexec_crash_image) {
+		struct pt_regs fixed_regs;
+
+		crash_setup_regs(&fixed_regs, regs);
+		crash_save_vmcoreinfo();
+		machine_crash_shutdown(&fixed_regs);
+		machine_kexec(kexec_crash_image);
+	}
+
+	mutex_unlock(&kexec_mutex);
+}
+
+/*
+ * Return the size of the crashk_res region in bytes, or 0 when its start
+ * and end are equal. The value is sampled under kexec_mutex.
+ */
+size_t crash_get_memory_size(void)
+{
+	size_t bytes;
+
+	mutex_lock(&kexec_mutex);
+	bytes = (crashk_res.end == crashk_res.start) ?
+			0 : resource_size(&crashk_res);
+	mutex_unlock(&kexec_mutex);
+
+	return bytes;
+}
+
+/*
+ * Default (weak) implementation: hand every page in the physical range
+ * [@begin, @end) to free_reserved_page(), stepping PAGE_SIZE at a time.
+ */
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+					   unsigned long end)
+{
+	unsigned long paddr = begin;
+
+	while (paddr < end) {
+		free_reserved_page(pfn_to_page(paddr >> PAGE_SHIFT));
+		paddr += PAGE_SIZE;
+	}
+}
+
+/*
+ * Shrink the crashk_res region to @new_size bytes and release the tail.
+ *
+ * Returns 0 on success or when @new_size equals the current size,
+ * -ENOENT while a crash image is loaded, -EINVAL when @new_size is larger
+ * than the current size, and -ENOMEM if the resource describing the
+ * released range cannot be allocated.
+ */
+int crash_shrink_memory(unsigned long new_size)
+{
+ int ret = 0;
+ unsigned long start, end;
+ unsigned long old_size;
+ struct resource *ram_res;
+
+ mutex_lock(&kexec_mutex);
+
+ /* Refuse to touch the region while a crash image is loaded */
+ if (kexec_crash_image) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+ start = crashk_res.start;
+ end = crashk_res.end;
+ old_size = (end == 0) ? 0 : end - start + 1;
+ /* Only shrinking is supported; same size is a successful no-op */
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
+ }
+
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ /* From here on 'end' is the first address past the region that is kept */
+ start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
+ end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
+
+ crash_map_reserved_pages();
+ crash_free_reserved_phys_range(end, crashk_res.end);
+
+ /* Nothing is kept: drop crashk_res from the resource tree */
+ if ((start == end) && (crashk_res.parent != NULL))
+ release_resource(&crashk_res);
+
+ /* Describe the released tail as "System RAM" */
+ ram_res->start = end;
+ ram_res->end = crashk_res.end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+ ram_res->name = "System RAM";
+
+ crashk_res.end = end - 1;
+
+ /* NOTE(review): the return value of insert_resource() is not checked */
+ insert_resource(&iomem_resource, ram_res);
+ crash_unmap_reserved_pages();
+
+unlock:
+ mutex_unlock(&kexec_mutex);
+ return ret;
+}
+
+/*
+ * Write one ELF note (header, name, payload) at @buf.
+ *
+ * Each of the three parts is rounded up to a whole number of 32-bit
+ * words. Returns the position right after the note, where the next note
+ * can be written.
+ */
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note hdr = {
+		.n_namesz = strlen(name) + 1,	/* includes the trailing NUL */
+		.n_descsz = data_len,
+		.n_type   = type,
+	};
+	u32 *pos = buf;
+
+	memcpy(pos, &hdr, sizeof(hdr));
+	pos += (sizeof(hdr) + 3) / 4;
+
+	memcpy(pos, name, hdr.n_namesz);
+	pos += (hdr.n_namesz + 3) / 4;
+
+	memcpy(pos, data, hdr.n_descsz);
+	pos += (hdr.n_descsz + 3) / 4;
+
+	return pos;
+}
+
+/*
+ * Terminate a sequence of notes by writing a note header whose name size,
+ * descriptor size and type are all zero at @buf.
+ */
+static void final_note(u32 *buf)
+{
+	struct elf_note terminator = {
+		.n_namesz = 0,
+		.n_descsz = 0,
+		.n_type   = 0,
+	};
+
+	memcpy(buf, &terminator, sizeof(terminator));
+}
+
+/*
+ * Record the register state @regs of @cpu as an NT_PRSTATUS ELF note in
+ * that cpu's crash_notes buffer.
+ *
+ * Silently returns when @cpu is out of range or the per-cpu buffer pointer
+ * is NULL.
+ */
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+	struct elf_prstatus prstatus;
+	u32 *note_buf;
+
+	if (cpu < 0 || cpu >= nr_cpu_ids)
+		return;
+
+	/*
+	 * ELF notes are used opportunistically: they give a well defined
+	 * layout plus a tag saying what was stashed, so there is no need
+	 * to invent a new format.
+	 */
+	note_buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
+	if (!note_buf)
+		return;
+
+	memset(&prstatus, 0, sizeof(prstatus));
+	prstatus.pr_pid = current->pid;
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+
+	note_buf = append_elf_note(note_buf, KEXEC_CORE_NOTE_NAME,
+				   NT_PRSTATUS, &prstatus,
+				   sizeof(prstatus));
+	final_note(note_buf);
+}
+
+/*
+ * Allocate the per-cpu crash_notes buffers used for saving cpu registers.
+ *
+ * Returns 0 on success and -ENOMEM if the per-cpu allocation fails.
+ */
+static int __init crash_notes_memory_init(void)
+{
+	const size_t size = sizeof(note_buf_t);
+	size_t align;
+
+	/*
+	 * A note buffer larger than a page would certainly span two pages,
+	 * so refuse to build in that case.
+	 */
+	BUILD_BUG_ON(sizeof(note_buf_t) > PAGE_SIZE);
+
+	/*
+	 * With a vmalloc based percpu area a buffer may straddle two
+	 * vmalloc pages, and those need not be physically contiguous.
+	 * Only the start address and size of crash_notes are exported
+	 * through sysfs, so the part in the second page could be lost.
+	 * Aligning the allocation to the size rounded up to a power of
+	 * two keeps each buffer inside a single physical page.
+	 */
+	align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
+
+	crash_notes = __alloc_percpu(size, align);
+	if (crash_notes)
+		return 0;
+
+	pr_warn("Memory allocation for saving cpu register states failed\n");
+	return -ENOMEM;
+}
+subsys_initcall(crash_notes_memory_init);
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * Each list entry has the form start-[end]:size. The size of the first
+ * entry with start <= system_ram < end is stored in *crash_size; a missing
+ * end means "no upper bound". If a size was selected and an '@offset'
+ * follows, the offset is stored in *crash_base.
+ *
+ * The function returns 0 on success and -EINVAL on failure. Note that 0 is
+ * also returned when no entry matches; *crash_size is then left untouched.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+
+ /* for each entry of the comma-separated list */
+ do {
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warn("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, then we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warn("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warn("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ /* every entry is validated, even the ones that do not match */
+ if (size >= system_ram) {
+ pr_warn("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (system_ram >= start && system_ram < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ if (*crash_size > 0) {
+ /* skip the remaining entries up to the optional '@offset' */
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warn("Memory value expected after '@'\n");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Parse a "simple" (old style) crashkernel command line of the form
+ *
+ * crashkernel=size[@offset]
+ *
+ * The size is stored in *crash_size and, when present, the offset in
+ * *crash_base. Returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+					    unsigned long long *crash_size,
+					    unsigned long long *crash_base)
+{
+	char *rest = cmdline;
+
+	*crash_size = memparse(cmdline, &rest);
+	if (rest == cmdline) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* what follows the size decides how to go on */
+	switch (*rest) {
+	case '@':
+		*crash_base = memparse(rest + 1, &rest);
+		break;
+	case ' ':
+	case '\0':
+		/* end of the parameter, no offset given */
+		break;
+	default:
+		pr_warn("crashkernel: unrecognized char: %c\n", *rest);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Known "crashkernel=size,<suffix>" suffixes, indexed by SUFFIX_*.
+ * The SUFFIX_NULL entry is NULL and terminates the table for code that
+ * iterates over it until it hits a NULL entry.
+ */
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
+/*
+ * Parse a "suffix" crashkernel command line of the form
+ *
+ * crashkernel=size,[high|low]
+ *
+ * @suffix is the exact suffix (including the comma) that must follow the
+ * size. The size is stored in *crash_size. Returns 0 on success and
+ * -EINVAL on failure.
+ */
+static int __init parse_crashkernel_suffix(char *cmdline,
+					    unsigned long long *crash_size,
+					    const char *suffix)
+{
+	const size_t suffix_len = strlen(suffix);
+	char *rest = cmdline;
+
+	*crash_size = memparse(cmdline, &rest);
+	if (rest == cmdline) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* the size has to be followed directly by the expected suffix ... */
+	if (strncmp(rest, suffix, suffix_len) != 0) {
+		pr_warn("crashkernel: unrecognized char: %c\n", *rest);
+		return -EINVAL;
+	}
+
+	/* ... and the suffix has to end the parameter */
+	rest += suffix_len;
+	if (*rest != ' ' && *rest != '\0') {
+		pr_warn("crashkernel: unrecognized char: %c\n", *rest);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Find the last occurrence of @name in @cmdline that has the wanted form.
+ *
+ * With a non-NULL @suffix only occurrences whose parameter text ends in
+ * @suffix qualify. With a NULL @suffix only occurrences that end in none
+ * of the suffix_tbl entries qualify.
+ *
+ * Returns a pointer to the start of the selected occurrence, or NULL when
+ * there is none.
+ */
+static __init char *get_last_crashkernel(char *cmdline,
+					 const char *name,
+					 const char *suffix)
+{
+	char *last = NULL;
+	char *p;
+
+	for (p = strstr(cmdline, name); p; p = strstr(p + 1, name)) {
+		/* the parameter ends at the next blank or at end of string */
+		char *end_p = strchr(p, ' ');
+		char *tail;
+		int i;
+
+		if (!end_p)
+			end_p = p + strlen(p);
+
+		if (suffix) {
+			tail = end_p - strlen(suffix);
+			if (!strncmp(tail, suffix, strlen(suffix)))
+				last = p;
+			continue;
+		}
+
+		/* no suffix wanted: reject entries carrying a known one */
+		for (i = 0; suffix_tbl[i]; i++) {
+			tail = end_p - strlen(suffix_tbl[i]);
+			if (!strncmp(tail, suffix_tbl[i],
+				     strlen(suffix_tbl[i])))
+				break;
+		}
+		if (!suffix_tbl[i])
+			last = p;
+	}
+
+	return last;
+}
+
+/*
+ * Common worker behind the parse_crashkernel*() entry points.
+ *
+ * Picks the last matching @name parameter in @cmdline and hands the text
+ * after @name to the suffix, extended (contains ':') or simple parser.
+ * *crash_size and *crash_base are reset to 0 first.
+ *
+ * Returns the parser's result, or -EINVAL when no matching parameter is
+ * found.
+ */
+static int __init __parse_crashkernel(char *cmdline,
+				      unsigned long long system_ram,
+				      unsigned long long *crash_size,
+				      unsigned long long *crash_base,
+				      const char *name,
+				      const char *suffix)
+{
+	char *ck_cmdline;
+	char *colon;
+
+	BUG_ON(!crash_size || !crash_base);
+	*crash_size = 0;
+	*crash_base = 0;
+
+	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
+	if (!ck_cmdline)
+		return -EINVAL;
+
+	/* step over the parameter name */
+	ck_cmdline += strlen(name);
+
+	if (suffix)
+		return parse_crashkernel_suffix(ck_cmdline, crash_size,
+						suffix);
+
+	/*
+	 * A ':' inside this parameter (that is, before the next blank)
+	 * means extended syntax; anything else is the classic syntax.
+	 */
+	colon = strchr(ck_cmdline, ':');
+	if (colon) {
+		char *blank = strchr(ck_cmdline, ' ');
+
+		if (!blank || colon < blank)
+			return parse_crashkernel_mem(ck_cmdline, system_ram,
+						     crash_size, crash_base);
+	}
+
+	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+/*
+ * This function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ *
+ * It handles "crashkernel=" parameters that carry no suffix; the result of
+ * __parse_crashkernel() is returned unchanged.
+ */
+int __init parse_crashkernel(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", NULL);
+}
+
+/*
+ * Parse the "crashkernel=size,high" parameter. Returns the result of
+ * __parse_crashkernel() unchanged.
+ */
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
+}
+
+/*
+ * Parse the "crashkernel=size,low" parameter. Returns the result of
+ * __parse_crashkernel() unchanged.
+ */
+int __init parse_crashkernel_low(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
+}
+
+/*
+ * Rebuild the vmcoreinfo ELF note from the current vmcoreinfo_data.
+ * Nothing is written while vmcoreinfo_size is still 0.
+ */
+static void update_vmcoreinfo_note(void)
+{
+	u32 *end;
+
+	if (vmcoreinfo_size == 0)
+		return;
+
+	end = append_elf_note(vmcoreinfo_note, VMCOREINFO_NOTE_NAME, 0,
+			      vmcoreinfo_data, vmcoreinfo_size);
+	final_note(end);
+}
+
+/*
+ * Append a CRASHTIME line with the current get_seconds() value to the
+ * vmcoreinfo data and regenerate the vmcoreinfo note.
+ */
+void crash_save_vmcoreinfo(void)
+{
+ vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
+ update_vmcoreinfo_note();
+}
+
+/*
+ * Format a string printf-style and append it to vmcoreinfo_data.
+ *
+ * The formatted text is limited to the 0x50 byte local buffer and is
+ * further truncated to the room left according to vmcoreinfo_max_size.
+ */
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+	char line[0x50];
+	va_list ap;
+	size_t len;
+
+	va_start(ap, fmt);
+	len = vscnprintf(line, sizeof(line), fmt, ap);
+	va_end(ap);
+
+	/* clamp to the space that is still available */
+	len = min(len, vmcoreinfo_max_size - vmcoreinfo_size);
+
+	memcpy(&vmcoreinfo_data[vmcoreinfo_size], line, len);
+	vmcoreinfo_size += len;
+}
+
+/*
+ * Hook for architecture specific vmcoreinfo entries.
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this.
+ */
+void __weak arch_crash_save_vmcoreinfo(void)
+{}
+
+/*
+ * Return the physical address of vmcoreinfo_note, computed with __pa().
+ * Weak, so architecture code may override it.
+ */
+unsigned long __weak paddr_vmcoreinfo_note(void)
+{
+ return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+/*
+ * Fill in the vmcoreinfo data at boot: OS release, page size, and the
+ * symbols, structure sizes, member offsets, array lengths and constants
+ * listed below. Architecture additions come last, then the vmcoreinfo
+ * note is generated. Registered as a subsys initcall; always returns 0.
+ */
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ /* symbols */
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
+ VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
+ VMCOREINFO_SYMBOL(_stext);
+ VMCOREINFO_SYMBOL(vmap_area_list);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+ /* structure sizes */
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ /* member offsets */
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(page, _mapcount);
+ VMCOREINFO_OFFSET(page, private);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
+ /* array lengths and numeric constants */
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ log_buf_kexec_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+ VMCOREINFO_NUMBER(PG_slab);
+#ifdef CONFIG_MEMORY_FAILURE
+ VMCOREINFO_NUMBER(PG_hwpoison);
+#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
+ VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+#ifdef CONFIG_X86
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+#endif
+#ifdef CONFIG_HUGETLBFS
+ VMCOREINFO_SYMBOL(free_huge_page);
+#endif
+
+ /* let the architecture add its own entries, then build the note */
+ arch_crash_save_vmcoreinfo();
+ update_vmcoreinfo_note();
+
+ return 0;
+}
+
+subsys_initcall(crash_save_vmcoreinfo_init);
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
+ *
+ * Returns -EBUSY when kexec_mutex is already held or processes cannot be
+ * frozen, -EINVAL when no image is loaded, or the error of the suspend
+ * step that failed. With CONFIG_KEXEC_JUMP and a preserve_context image
+ * the system is suspended first and resumed after machine_kexec() returns.
+ */
+int kernel_kexec(void)
+{
+ int error = 0;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+ if (!kexec_image) {
+ error = -EINVAL;
+ goto Unlock;
+ }
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ /*
+ * Suspend step by step; each failure jumps to the label below
+ * that undoes the steps already taken.
+ */
+ lock_system_sleep();
+ pm_prepare_console();
+ error = freeze_processes();
+ if (error) {
+ error = -EBUSY;
+ goto Restore_console;
+ }
+ suspend_console();
+ error = dpm_suspend_start(PMSG_FREEZE);
+ if (error)
+ goto Resume_console;
+ /* At this point, dpm_suspend_start() has been called,
+ * but *not* dpm_suspend_end(). We *must* call
+ * dpm_suspend_end() now. Otherwise, drivers for
+ * some devices (e.g. interrupt controllers) become
+ * desynchronized with the actual state of the
+ * hardware at resume time, and evil weirdness ensues.
+ */
+ error = dpm_suspend_end(PMSG_FREEZE);
+ if (error)
+ goto Resume_devices;
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Enable_cpus;
+ local_irq_disable();
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+ } else
+#endif
+ {
+ kexec_in_progress = true;
+ kernel_restart_prepare(NULL);
+ migrate_to_reboot_cpu();
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
+ pr_emerg("Starting new kernel\n");
+ machine_shutdown();
+ }
+
+ machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+ if (kexec_image->preserve_context) {
+ /*
+ * Resume in the reverse order of the suspend sequence above.
+ * The labels are also the targets of the error paths there.
+ */
+ syscore_resume();
+ Enable_irqs:
+ local_irq_enable();
+ Enable_cpus:
+ enable_nonboot_cpus();
+ dpm_resume_start(PMSG_RESTORE);
+ Resume_devices:
+ dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+ resume_console();
+ thaw_processes();
+ Restore_console:
+ pm_restore_console();
+ unlock_system_sleep();
+ }
+#endif
+
+ Unlock:
+ mutex_unlock(&kexec_mutex);
+ return error;
+}
+
+/*
+ * Add and remove page tables for crashkernel memory
+ *
+ * Provide an empty default implementation here -- architecture
+ * code may override this.
+ *
+ * crash_shrink_memory() calls the map hook before it frees the reserved
+ * pages.
+ */
+void __weak crash_map_reserved_pages(void)
+{}
+
+/*
+ * Empty default implementation; architecture code may override this.
+ * crash_shrink_memory() calls it after the resource tree has been updated.
+ */
+void __weak crash_unmap_reserved_pages(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644
index 000000000000..b70ada0028d2
--- /dev/null
+++ b/kernel/kexec_file.c
@@ -0,0 +1,1047 @@
+/*
+ * kexec: kexec_file_load system call
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <linux/syscalls.h>
+#include <linux/vmalloc.h>
+#include "kexec_internal.h"
+
+/*
+ * Declare these symbols weak so that if architecture provides a purgatory,
+ * these will be overridden. The defaults describe an empty purgatory:
+ * a zero-length array and a size of 0.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+/* Defined further down; needed by the kexec_file_load syscall. */
+static int kexec_calculate_store_digests(struct kimage *image);
+
+/*
+ * copy_file_from_fd - read the whole file behind @fd into a vmalloc() buffer.
+ * @fd:      open file descriptor to read from
+ * @buf:     on success, set to the newly allocated buffer; the caller owns
+ *           it and releases it with vfree()
+ * @buf_len: on success, set to the number of bytes read
+ *
+ * Returns 0 on success. Fails with -EBADF for a bad descriptor or a short
+ * read, -EFBIG for files larger than INT_MAX, -EINVAL for empty files,
+ * -ENOMEM when the buffer cannot be allocated, or with the error returned
+ * by vfs_getattr() / kernel_read().
+ *
+ * If reading fails after the buffer was allocated, the buffer is freed and
+ * *@buf is reset to NULL. Callers run a common cleanup that vfree()s the
+ * very same pointer, so leaving it dangling would cause a double free.
+ */
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
+{
+	struct fd f = fdget(fd);
+	int ret;
+	struct kstat stat;
+	loff_t pos;
+	ssize_t bytes = 0;
+
+	if (!f.file)
+		return -EBADF;
+
+	ret = vfs_getattr(&f.file->f_path, &stat);
+	if (ret)
+		goto out;
+
+	if (stat.size > INT_MAX) {
+		ret = -EFBIG;
+		goto out;
+	}
+
+	/* Don't hand 0 to vmalloc, it whines. */
+	if (stat.size == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	*buf = vmalloc(stat.size);
+	if (!*buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	pos = 0;
+	while (pos < stat.size) {
+		bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+				    stat.size - pos);
+		if (bytes < 0) {
+			ret = bytes;
+			goto out_free;
+		}
+
+		/* EOF before stat.size bytes were read */
+		if (bytes == 0)
+			break;
+		pos += bytes;
+	}
+
+	if (pos != stat.size) {
+		ret = -EBADF;
+		goto out_free;
+	}
+
+	/* ret is still 0 from vfs_getattr() here */
+	*buf_len = pos;
+	goto out;
+
+out_free:
+	vfree(*buf);
+	/* never leave a dangling pointer behind for the caller's cleanup */
+	*buf = NULL;
+out:
+	fdput(f);
+	return ret;
+}
+
+/*
+ * Architectures can provide this probe function.
+ * The weak default rejects every image with -ENOEXEC.
+ */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -ENOEXEC;
+}
+
+/*
+ * Architecture hook that loads the image. The weak default fails with
+ * ERR_PTR(-ENOEXEC).
+ */
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+ return ERR_PTR(-ENOEXEC);
+}
+
+/*
+ * Architecture hook called from kimage_file_post_load_cleanup(). The weak
+ * default returns -EINVAL; that caller ignores the return value.
+ */
+int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ return -EINVAL;
+}
+
+/*
+ * Architecture hook that verifies the kernel image signature. The weak
+ * default rejects every image with -EKEYREJECTED.
+ */
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -EKEYREJECTED;
+}
+
+/*
+ * Apply relocations of type RELA.
+ * The weak default logs an error and returns -ENOEXEC.
+ */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("RELA relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Apply relocations of type REL.
+ * The weak default logs an error and returns -ENOEXEC.
+ */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("REL relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by kernel, initrd, and command line. These are
+ * temporary allocations which are not needed any more after the buffers
+ * have been loaded into separate segments and have been copied elsewhere.
+ *
+ * Every pointer is reset to NULL after it has been freed, so calling this
+ * function more than once on the same image is harmless as far as these
+ * buffers are concerned.
+ */
+void kimage_file_post_load_cleanup(struct kimage *image)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ vfree(image->kernel_buf);
+ image->kernel_buf = NULL;
+
+ vfree(image->initrd_buf);
+ image->initrd_buf = NULL;
+
+ kfree(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
+ /* See if architecture has anything to cleanup post load */
+ arch_kimage_file_post_load_cleanup(image);
+
+ /*
+ * Above call should have called into bootloader to free up
+ * any data stored in kimage->image_loader_data. It should
+ * be ok now to free it up.
+ */
+ kfree(image->image_loader_data);
+ image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode the list of segments is prepared by the kernel. Copy the
+ * relevant data from user space, do error checking and prepare the
+ * segment list.
+ *
+ * Reads the kernel (and, unless KEXEC_FILE_NO_INITRAMFS is set, the initrd)
+ * from the given descriptors, copies the command line from user space and
+ * calls the architecture probe/verify/load handlers.
+ *
+ * Returns 0 on success or a negative error code. Apart from a failure to
+ * read the kernel file, which returns immediately, every error path frees
+ * the buffers through kimage_file_post_load_cleanup().
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ int ret = 0;
+ void *ldata;
+
+ ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+ &image->kernel_buf_len);
+ if (ret)
+ return ret;
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+ pr_debug("kernel signature verification failed.\n");
+ goto out;
+ }
+ pr_debug("kernel signature verification successful.\n");
+#endif
+ /* It is possible that no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+ &image->initrd_buf_len);
+ if (ret)
+ goto out;
+ }
+
+ if (cmdline_len) {
+ /* NOTE(review): cmdline_len is not bounded before this allocation */
+ image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+ if (!image->cmdline_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+ cmdline_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Call arch image load handlers */
+ ldata = arch_kexec_kernel_image_load(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+/*
+ * Allocate a kimage in file mode, prepare its segments from the given
+ * descriptors and command line, and allocate its control pages (plus a
+ * swap page unless KEXEC_FILE_ON_CRASH is set).
+ *
+ * On success the new image is stored in *rimage and 0 is returned. On
+ * failure everything allocated here is released and a negative error code
+ * is returned; *rimage is not written.
+ */
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+ int initrd_fd, const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->file_mode = 1;
+
+ if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+
+ /* On failure this call has already freed the buffers it allocated */
+ ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+ cmdline_ptr, cmdline_len, flags);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_post_load_bufs;
+
+ /* Both allocations below report failure as -ENOMEM */
+ ret = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
+ if (!image->control_code_page) {
+ pr_err("Could not allocate control_code_buffer\n");
+ goto out_free_post_load_bufs;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
+ }
+
+ *rimage = image;
+ return 0;
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+ kimage_file_post_load_cleanup(image);
+out_free_image:
+ kfree(image);
+ return ret;
+}
+
+/*
+ * kexec_file_load - load (or unload) a kexec image from file descriptors.
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_BOOT or when
+ * kexec_load_disabled is set, -EINVAL for unknown flag bits, -EBUSY when
+ * kexec_mutex is already held, or the error of the step that failed.
+ *
+ * With KEXEC_FILE_UNLOAD the currently installed image (normal or crash,
+ * depending on KEXEC_FILE_ON_CRASH) is replaced by NULL and freed.
+ */
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int ret = 0, i;
+ struct kimage **dest_image, *image;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_FILE_ON_CRASH)
+ dest_image = &kexec_crash_image;
+
+ /* image is still NULL here, so the exchange installs "no image" */
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+ * In case of crash, new kernel gets loaded in reserved region. It is
+ * same memory where old crash kernel might be loaded. Free any
+ * current crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after image has been loaded
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ /* Install the new image; 'image' now refers to the one it replaced */
+ image = xchg(dest_image, image);
+out:
+ mutex_unlock(&kexec_mutex);
+ /* Frees the replaced image on success, or the failed new one on error */
+ kimage_free(image);
+ return ret;
+}
+
+/*
+ * Search the RAM range [@start, @end] (both inclusive) from the top for a
+ * hole of kbuf->memsz bytes that is aligned to kbuf->buf_align, lies
+ * within [kbuf->buf_min, kbuf->buf_max] and does not overlap any existing
+ * segment of the image.
+ *
+ * Returns 1 and sets kbuf->mem when a hole is found, 0 otherwise so that
+ * the caller moves on to the next RAM range.
+ */
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+				    struct kexec_buf *kbuf)
+{
+	struct kimage *image = kbuf->image;
+	unsigned long temp_start, temp_end;
+
+	temp_end = min(end, kbuf->buf_max);
+
+	/*
+	 * The clamped window cannot hold the buffer at all. Bailing out
+	 * here also keeps the subtraction below from wrapping around.
+	 */
+	if (kbuf->memsz && temp_end < kbuf->memsz - 1)
+		return 0;
+
+	/*
+	 * temp_end is an inclusive address, so the highest candidate that
+	 * still fits starts at temp_end - memsz + 1.
+	 */
+	temp_start = temp_end - kbuf->memsz + 1;
+
+	do {
+		/* align down start */
+		temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+		if (temp_start < start || temp_start < kbuf->buf_min)
+			return 0;
+
+		temp_end = temp_start + kbuf->memsz - 1;
+
+		/*
+		 * Make sure this does not conflict with any of existing
+		 * segments
+		 */
+		if (kimage_is_destination_range(image, temp_start, temp_end)) {
+			/* stepping down another page would wrap below 0 */
+			if (temp_start < PAGE_SIZE)
+				return 0;
+			temp_start = temp_start - PAGE_SIZE;
+			continue;
+		}
+
+		/* We found a suitable memory range */
+		break;
+	} while (1);
+
+	/* If we are here, we found a suitable memory range */
+	kbuf->mem = temp_start;
+
+	/* Success, stop navigating through remaining System RAM ranges */
+	return 1;
+}
+
+/*
+ * Search the RAM range [@start, @end] from the bottom for a hole of
+ * kbuf->memsz bytes that is aligned to kbuf->buf_align, ends no higher
+ * than kbuf->buf_max and does not overlap any existing segment.
+ *
+ * Returns 1 and sets kbuf->mem when a hole is found, 0 otherwise so that
+ * the caller moves on to the next RAM range.
+ */
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+				     struct kexec_buf *kbuf)
+{
+	struct kimage *image = kbuf->image;
+	unsigned long lo, hi;
+
+	lo = max(start, kbuf->buf_min);
+
+	for (;;) {
+		lo = ALIGN(lo, kbuf->buf_align);
+		hi = lo + kbuf->memsz - 1;
+
+		/* ran out of room in this range */
+		if (hi > end || hi > kbuf->buf_max)
+			return 0;
+
+		/* free of existing segments: this is our hole */
+		if (!kimage_is_destination_range(image, lo, hi))
+			break;
+
+		/* conflict, retry one page further up */
+		lo = lo + PAGE_SIZE;
+	}
+
+	kbuf->mem = lo;
+
+	/* Success, stop navigating through remaining System RAM ranges */
+	return 1;
+}
+
+/*
+ * Resource walk callback: try to place the buffer described by @arg
+ * (a struct kexec_buf) inside the range [@start, @end].
+ *
+ * Returns 0 to make the walker continue with the next range, or the
+ * result of the top-down / bottom-up search for this range.
+ */
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+	struct kexec_buf *kbuf = arg;
+	unsigned long sz = end - start + 1;
+
+	/* range too small, or completely outside [buf_min, buf_max] */
+	if (sz < kbuf->memsz ||
+	    end < kbuf->buf_min || start > kbuf->buf_max)
+		return 0;
+
+	/* search from the top when asked to, otherwise from the bottom */
+	return kbuf->top_down ?
+		locate_mem_hole_top_down(start, end, kbuf) :
+		locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ *
+ * Finds a free memory range for @buffer, appends a segment describing it
+ * to @image and stores the chosen address in *load_addr. @memsz is rounded
+ * up to a multiple of PAGE_SIZE and the alignment is at least PAGE_SIZE.
+ *
+ * Returns 0 on success, -EINVAL if the image is not in file mode, already
+ * has KEXEC_SEGMENT_MAX segments or already has control pages, and
+ * -EADDRNOTAVAIL if no suitable memory range is found.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+ unsigned long memsz, unsigned long buf_align,
+ unsigned long buf_min, unsigned long buf_max,
+ bool top_down, unsigned long *load_addr)
+{
+
+ struct kexec_segment *ksegment;
+ struct kexec_buf buf, *kbuf;
+ int ret;
+
+ /* Currently adding segment this way is allowed only in file mode */
+ if (!image->file_mode)
+ return -EINVAL;
+
+ if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ /*
+ * Make sure we are not trying to add buffer after allocating
+ * control pages. All segments need to be placed first before
+ * any control pages are allocated. As control page allocation
+ * logic goes through list of segments to make sure there are
+ * no destination overlaps.
+ */
+ if (!list_empty(&image->control_pages)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ memset(&buf, 0, sizeof(struct kexec_buf));
+ kbuf = &buf;
+ kbuf->image = image;
+ kbuf->buffer = buffer;
+ kbuf->bufsz = bufsz;
+
+ kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+ kbuf->buf_align = max(buf_align, PAGE_SIZE);
+ kbuf->buf_min = buf_min;
+ kbuf->buf_max = buf_max;
+ kbuf->top_down = top_down;
+
+ /* Walk the RAM ranges and allocate a suitable range for the buffer */
+ if (image->type == KEXEC_TYPE_CRASH)
+ ret = walk_iomem_res("Crash kernel",
+ IORESOURCE_MEM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end, kbuf,
+ locate_mem_hole_callback);
+ else
+ ret = walk_system_ram_res(0, -1, kbuf,
+ locate_mem_hole_callback);
+ /* the callback returns 1 once it has stored a location in kbuf->mem */
+ if (ret != 1) {
+ /* A suitable memory range could not be found for buffer */
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Found a suitable memory range */
+ ksegment = &image->segment[image->nr_segments];
+ ksegment->kbuf = kbuf->buffer;
+ ksegment->bufsz = kbuf->bufsz;
+ ksegment->mem = kbuf->mem;
+ ksegment->memsz = kbuf->memsz;
+ image->nr_segments++;
+ *load_addr = ksegment->mem;
+ return 0;
+}
+
+/*
+ * Calculate and store the digest of segments.
+ *
+ * A single SHA-256 is computed over every segment except purgatory, with
+ * the tail of each segment (memsz - bufsz bytes) hashed as zeroes. The
+ * list of hashed regions and the resulting digest are then written into
+ * the purgatory symbols "sha_regions" and "sha256_digest".
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * reported by the crypto layer or by kexec_purgatory_get_set_symbol().
+ */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+	struct crypto_shash *tfm;
+	struct shash_desc *desc;
+	int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+	size_t desc_size, nullsz;
+	char *digest;
+	void *zero_buf;
+	struct kexec_sha_region *sha_regions;
+	struct purgatory_info *pi = &image->purgatory_info;
+
+	/* The shared zero page serves as the source of padding bytes */
+	zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+	zero_buf_sz = PAGE_SIZE;
+
+	tfm = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(tfm)) {
+		ret = PTR_ERR(tfm);
+		goto out;
+	}
+
+	desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+	desc = kzalloc(desc_size, GFP_KERNEL);
+	if (!desc) {
+		ret = -ENOMEM;
+		goto out_free_tfm;
+	}
+
+	sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+	sha_regions = vzalloc(sha_region_sz);
+	if (!sha_regions) {
+		/* Without this the failure would be reported as success */
+		ret = -ENOMEM;
+		goto out_free_desc;
+	}
+
+	desc->tfm = tfm;
+	desc->flags = 0;
+
+	ret = crypto_shash_init(desc);
+	if (ret < 0)
+		goto out_free_sha_regions;
+
+	digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+	if (!digest) {
+		ret = -ENOMEM;
+		goto out_free_sha_regions;
+	}
+
+	/* i walks all segments, j counts the ones that were hashed */
+	for (j = i = 0; i < image->nr_segments; i++) {
+		struct kexec_segment *ksegment;
+
+		ksegment = &image->segment[i];
+		/*
+		 * Skip purgatory as it will be modified once we put digest
+		 * info in purgatory.
+		 */
+		if (ksegment->kbuf == pi->purgatory_buf)
+			continue;
+
+		ret = crypto_shash_update(desc, ksegment->kbuf,
+					  ksegment->bufsz);
+		if (ret)
+			break;
+
+		/*
+		 * Assume rest of the buffer is filled with zero and
+		 * update digest accordingly.
+		 */
+		nullsz = ksegment->memsz - ksegment->bufsz;
+		while (nullsz) {
+			unsigned long bytes = nullsz;
+
+			/* Feed the padding at most one zero page at a time */
+			if (bytes > zero_buf_sz)
+				bytes = zero_buf_sz;
+			ret = crypto_shash_update(desc, zero_buf, bytes);
+			if (ret)
+				break;
+			nullsz -= bytes;
+		}
+
+		if (ret)
+			break;
+
+		sha_regions[j].start = ksegment->mem;
+		sha_regions[j].len = ksegment->memsz;
+		j++;
+	}
+
+	if (!ret) {
+		ret = crypto_shash_final(desc, digest);
+		if (ret)
+			goto out_free_digest;
+		ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+						sha_regions, sha_region_sz, 0);
+		if (ret)
+			goto out_free_digest;
+
+		ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+						digest, SHA256_DIGEST_SIZE, 0);
+		if (ret)
+			goto out_free_digest;
+	}
+
+out_free_digest:
+	kfree(digest);
+out_free_sha_regions:
+	vfree(sha_regions);
+out_free_desc:
+	kfree(desc);
+out_free_tfm:
+	/* Release through the crypto API, the counterpart of crypto_alloc_shash() */
+	crypto_free_shash(tfm);
+out:
+	return ret;
+}
+
+/*
+ * Actually load purgatory. Lot of code taken from kexec-tools.
+ *
+ * Copies the SHF_ALLOC sections of the purgatory object into a freshly
+ * allocated buffer, reserves a segment for that buffer (plus room for
+ * bss) via kexec_add_buffer() and records each section's final load
+ * address in a private copy of the section headers.
+ *
+ * On success, image->start is set to the relocated entry point, and
+ * pi->sechdrs and pi->purgatory_buf take ownership of the two
+ * allocations. On failure both allocations are freed here.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error
+ * returned by kexec_add_buffer().
+ */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+				  unsigned long max, int top_down)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+	unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+	unsigned char *buf_addr, *src;
+	int i, ret = 0, entry_sidx = -1;
+	const Elf_Shdr *sechdrs_c;
+	Elf_Shdr *sechdrs = NULL;
+	void *purgatory_buf = NULL;
+
+	/*
+	 * sechdrs_c points to section headers in purgatory and are read
+	 * only. No modifications allowed.
+	 */
+	sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+	/*
+	 * We can not modify sechdrs_c[] and its fields. It is read only.
+	 * Copy it over to a local copy where one can store some temporary
+	 * data and free it at the end. We need to modify ->sh_addr and
+	 * ->sh_offset fields to keep track of permanent and temporary
+	 * locations of sections.
+	 */
+	sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+	if (!sechdrs)
+		return -ENOMEM;
+
+	memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+	/*
+	 * We seem to have multiple copies of sections. First copy is which
+	 * is embedded in kernel in read only section. Some of these sections
+	 * will be copied to a temporary buffer and relocated. And these
+	 * sections will finally be copied to their final destination at
+	 * segment load time.
+	 *
+	 * Use ->sh_offset to reflect section address in memory. It will
+	 * point to original read only copy if section is not allocatable.
+	 * Otherwise it will point to temporary copy which will be relocated.
+	 *
+	 * Use ->sh_addr to contain final address of the section where it
+	 * will go during execution time.
+	 */
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		/* SHT_NOBITS sections have no bytes in the object to point at */
+		if (sechdrs[i].sh_type == SHT_NOBITS)
+			continue;
+
+		sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+						sechdrs[i].sh_offset;
+	}
+
+	/*
+	 * Identify entry point section and make entry relative to section
+	 * start.
+	 */
+	entry = pi->ehdr->e_entry;
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+
+		if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+			continue;
+
+		/* Make entry section relative */
+		if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+		    ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+		     pi->ehdr->e_entry)) {
+			entry_sidx = i;
+			entry -= sechdrs[i].sh_addr;
+			break;
+		}
+	}
+
+	/* Determine how much memory is needed to load relocatable object. */
+	buf_align = 1;
+	bss_align = 1;
+	buf_sz = 0;
+	bss_sz = 0;
+
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+
+		/*
+		 * An sh_addralign of 0 means "no alignment constraint", the
+		 * same as 1. ALIGN(x, 0) would evaluate to 0 and wipe out
+		 * the size accumulated so far, so normalize it first.
+		 */
+		align = sechdrs[i].sh_addralign;
+		if (!align)
+			align = 1;
+
+		if (sechdrs[i].sh_type != SHT_NOBITS) {
+			if (buf_align < align)
+				buf_align = align;
+			buf_sz = ALIGN(buf_sz, align);
+			buf_sz += sechdrs[i].sh_size;
+		} else {
+			/* bss section */
+			if (bss_align < align)
+				bss_align = align;
+			bss_sz = ALIGN(bss_sz, align);
+			bss_sz += sechdrs[i].sh_size;
+		}
+	}
+
+	/* Determine the bss padding required to align bss properly */
+	bss_pad = 0;
+	if (buf_sz & (bss_align - 1))
+		bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+	memsz = buf_sz + bss_pad + bss_sz;
+
+	/* Allocate buffer for purgatory */
+	purgatory_buf = vzalloc(buf_sz);
+	if (!purgatory_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* bss follows the data in the same segment, so honour its alignment */
+	if (buf_align < bss_align)
+		buf_align = bss_align;
+
+	/* Add buffer to segment list */
+	ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+				buf_align, min, max, top_down,
+				&pi->purgatory_load_addr);
+	if (ret)
+		goto out;
+
+	/* Load SHF_ALLOC sections */
+	buf_addr = purgatory_buf;
+	load_addr = curr_load_addr = pi->purgatory_load_addr;
+	bss_addr = load_addr + buf_sz + bss_pad;
+
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+
+		/* Same normalization as in the sizing pass above */
+		align = sechdrs[i].sh_addralign;
+		if (!align)
+			align = 1;
+
+		if (sechdrs[i].sh_type != SHT_NOBITS) {
+			curr_load_addr = ALIGN(curr_load_addr, align);
+			offset = curr_load_addr - load_addr;
+			/* We already modified ->sh_offset to keep src addr */
+			src = (unsigned char *)sechdrs[i].sh_offset;
+			memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+			/* Store load address and source address of section */
+			sechdrs[i].sh_addr = curr_load_addr;
+
+			/*
+			 * This section got copied to temporary buffer. Update
+			 * ->sh_offset accordingly.
+			 */
+			sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+			/* Advance to the next address */
+			curr_load_addr += sechdrs[i].sh_size;
+		} else {
+			bss_addr = ALIGN(bss_addr, align);
+			sechdrs[i].sh_addr = bss_addr;
+			bss_addr += sechdrs[i].sh_size;
+		}
+	}
+
+	/* Update entry point based on load address of text section */
+	if (entry_sidx >= 0)
+		entry += sechdrs[entry_sidx].sh_addr;
+
+	/* Make kernel jump to purgatory after shutdown */
+	image->start = entry;
+
+	/* Used later to get/set symbol values */
+	pi->sechdrs = sechdrs;
+
+	/*
+	 * Used later to identify which section is purgatory and skip it
+	 * from checksumming.
+	 */
+	pi->purgatory_buf = purgatory_buf;
+	return ret;
+out:
+	/* Nothing was published in *pi yet, so only the locals need freeing */
+	vfree(sechdrs);
+	vfree(purgatory_buf);
+	return ret;
+}
+
+/*
+ * Apply every SHT_RELA/SHT_REL relocation section of purgatory that
+ * targets an allocated (SHF_ALLOC) section, using the architecture's
+ * relocation handlers.
+ *
+ * Returns 0 on success, -ENOEXEC if a relocation section references a
+ * section index that is out of range, or the error returned by the
+ * architecture handler.
+ */
+static int kexec_apply_relocations(struct kimage *image)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	Elf_Shdr *sechdrs = pi->sechdrs;
+	int i, ret;
+
+	for (i = 0; i < pi->ehdr->e_shnum; i++) {
+		Elf_Shdr *relsec = &sechdrs[i];
+		Elf_Shdr *target, *symtab;
+		bool is_rela = relsec->sh_type == SHT_RELA;
+
+		/* Only relocation sections are of interest here */
+		if (!is_rela && relsec->sh_type != SHT_REL)
+			continue;
+
+		/*
+		 * For section of type SHT_RELA/SHT_REL,
+		 * ->sh_link contains section header index of associated
+		 * symbol table. And ->sh_info contains section header
+		 * index of section to which relocations apply.
+		 */
+		if (relsec->sh_info >= pi->ehdr->e_shnum ||
+		    relsec->sh_link >= pi->ehdr->e_shnum)
+			return -ENOEXEC;
+
+		target = &sechdrs[relsec->sh_info];
+		symtab = &sechdrs[relsec->sh_link];
+
+		/* Sections that are never loaded need no relocation */
+		if (!(target->sh_flags & SHF_ALLOC))
+			continue;
+
+		/*
+		 * symtab->sh_link contain section header index of associated
+		 * string table. Skip the section if that index is invalid.
+		 */
+		if (symtab->sh_link >= pi->ehdr->e_shnum)
+			continue;
+
+		/*
+		 * Respective architecture needs to provide support for applying
+		 * relocations of type SHT_RELA/SHT_REL.
+		 */
+		if (is_rela)
+			ret = arch_kexec_apply_relocations_add(pi->ehdr,
+							       sechdrs, i);
+		else
+			ret = arch_kexec_apply_relocations(pi->ehdr,
+							   sechdrs, i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Load relocatable purgatory object and relocate it appropriately.
+ *
+ * The embedded purgatory blob is accepted only if it is large enough for
+ * an ELF header, carries the ELF magic, is of type ET_REL for this
+ * architecture, uses section headers of the expected size and has its
+ * section header table fully inside the blob.
+ *
+ * On success the purgatory load address is stored in *@load_addr and 0 is
+ * returned. Returns -EINVAL if there is no purgatory, -ENOEXEC if the
+ * blob fails validation, or the error from loading/relocating it.
+ */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+			 unsigned long max, int top_down,
+			 unsigned long *load_addr)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	int ret;
+
+	if (kexec_purgatory_size <= 0)
+		return -EINVAL;
+
+	if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+		return -ENOEXEC;
+
+	pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+	if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+	    || pi->ehdr->e_type != ET_REL
+	    || !elf_check_arch(pi->ehdr)
+	    || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+		return -ENOEXEC;
+
+	/* The whole section header table must lie within the blob */
+	if (pi->ehdr->e_shoff >= kexec_purgatory_size
+	    || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+	    kexec_purgatory_size - pi->ehdr->e_shoff))
+		return -ENOEXEC;
+
+	ret = __kexec_load_purgatory(image, min, max, top_down);
+	if (ret)
+		return ret;
+
+	ret = kexec_apply_relocations(image);
+	if (ret)
+		goto out;
+
+	*load_addr = pi->purgatory_load_addr;
+	return 0;
+out:
+	/*
+	 * Clear the pointers after freeing so that any later cleanup of
+	 * purgatory_info does not free the same memory a second time.
+	 */
+	vfree(pi->sechdrs);
+	pi->sechdrs = NULL;
+
+	vfree(pi->purgatory_buf);
+	pi->purgatory_buf = NULL;
+	return ret;
+}
+
+/*
+ * Look up the global symbol @name in the symbol tables of the loaded
+ * purgatory.
+ *
+ * Returns the matching symbol, or NULL if purgatory is not loaded, no
+ * global symbol of that name exists, or the first match is undefined or
+ * carries a section index that is out of range.
+ */
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+					    const char *name)
+{
+	Elf_Shdr *sechdrs = pi->sechdrs;
+	Elf_Ehdr *ehdr = pi->ehdr;
+	int i, k;
+
+	if (!sechdrs || !ehdr)
+		return NULL;
+
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		Elf_Shdr *symsec = &sechdrs[i];
+		const char *strtab;
+		Elf_Sym *syms;
+
+		if (symsec->sh_type != SHT_SYMTAB)
+			continue;
+
+		/* Ignore symbol tables with an invalid strtab section number */
+		if (symsec->sh_link >= ehdr->e_shnum)
+			continue;
+
+		/* ->sh_offset holds the in-memory address of each section */
+		strtab = (char *)sechdrs[symsec->sh_link].sh_offset;
+		syms = (Elf_Sym *)symsec->sh_offset;
+
+		/* Go through symbols for a match */
+		for (k = 0; k < symsec->sh_size/sizeof(Elf_Sym); k++) {
+			Elf_Sym *sym = &syms[k];
+
+			if (ELF_ST_BIND(sym->st_info) != STB_GLOBAL ||
+			    strcmp(strtab + sym->st_name, name) != 0)
+				continue;
+
+			/* Name matches; the symbol must live in a real section */
+			if (sym->st_shndx == SHN_UNDEF ||
+			    sym->st_shndx >= ehdr->e_shnum) {
+				pr_debug("Symbol: %s has bad section index %d.\n",
+					 name, sym->st_shndx);
+				return NULL;
+			}
+
+			return sym;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the address at which purgatory symbol @name will finally be
+ * loaded after kexec_load_segment(), or ERR_PTR(-EINVAL) if the symbol
+ * cannot be found.
+ */
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	Elf_Sym *sym = kexec_purgatory_find_symbol(pi, name);
+
+	if (!sym)
+		return ERR_PTR(-EINVAL);
+
+	/* Final section address plus the symbol's offset within it */
+	return (void *)(pi->sechdrs[sym->st_shndx].sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set value of a symbol. If "get_value" is true, symbol value is
+ * returned in buf otherwise symbol value is set based on value in buf.
+ *
+ * @size must equal the symbol's st_size. Returns 0 on success, or -EINVAL
+ * if the symbol is not found, the size does not match, or the symbol
+ * lives in a bss (SHT_NOBITS) section, which has no backing buffer.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+				   void *buf, unsigned int size, bool get_value)
+{
+	struct purgatory_info *pi = &image->purgatory_info;
+	Elf_Shdr *sec;
+	Elf_Sym *sym;
+	void *sym_buf;
+
+	sym = kexec_purgatory_find_symbol(pi, name);
+	if (!sym)
+		return -EINVAL;
+
+	if (sym->st_size != size) {
+		pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+		       name, (unsigned long)sym->st_size, size);
+		return -EINVAL;
+	}
+
+	sec = &pi->sechdrs[sym->st_shndx];
+	if (sec->sh_type == SHT_NOBITS) {
+		pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+		       get_value ? "get" : "set");
+		return -EINVAL;
+	}
+
+	/* ->sh_offset holds the in-memory address of the section's data */
+	sym_buf = (unsigned char *)sec->sh_offset + sym->st_value;
+
+	/* Copy out of the symbol for a get, into it for a set */
+	memcpy(get_value ? buf : sym_buf, get_value ? sym_buf : buf, size);
+
+	return 0;
+}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644
index 000000000000..e4392a698ad4
--- /dev/null
+++ b/kernel/kexec_internal.h
@@ -0,0 +1,22 @@
+#ifndef LINUX_KEXEC_INTERNAL_H
+#define LINUX_KEXEC_INTERNAL_H
+
+/*
+ * Declarations shared between the kexec source files in kernel/.
+ * NOTE(review): presumably not meant for use outside kernel/ - confirm.
+ */
+
+#include <linux/kexec.h>
+
+struct kimage *do_kimage_alloc_init(void);
+int sanity_check_segment_list(struct kimage *image);
+void kimage_free_page_list(struct list_head *list);
+void kimage_free(struct kimage *image);
+int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
+void kimage_terminate(struct kimage *image);
+/*
+ * The memory-hole search used when placing file-mode buffers treats a
+ * non-zero return as "[start, end] conflicts with an existing segment".
+ */
+int kimage_is_destination_range(struct kimage *image,
+ unsigned long start, unsigned long end);
+
+/* kexec_add_buffer() documents that it expects this mutex to be held */
+extern struct mutex kexec_mutex;
+
+/*
+ * Without CONFIG_KEXEC_FILE the cleanup hook is an empty inline, so
+ * callers do not need their own #ifdef.
+ */
+#ifdef CONFIG_KEXEC_FILE
+void kimage_file_post_load_cleanup(struct kimage *image);
+#else /* CONFIG_KEXEC_FILE */
+static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
+#endif /* CONFIG_KEXEC_FILE */
+#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2777f40a9c7b..0277d1216f80 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
extern int max_threads;
-static struct workqueue_struct *khelper_wq;
-
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -114,10 +112,11 @@ out:
* @...: arguments as specified in the format string
*
* Load a module using the user mode module loader. The function returns
- * zero on success or a negative errno code on failure. Note that a
- * successful module load does not mean the module did not then unload
- * and exit on an error of its own. Callers must check that the service
- * they requested is now available not blindly invoke it.
+ * zero on success or a negative errno code or positive exit code from
+ * "modprobe" on failure. Note that a successful module load does not mean
+ * the module did not then unload and exit on an error of its own. Callers
+ * must check that the service they requested is now available not blindly
+ * invoke it.
*
* If module auto-loading support is disabled then this function
* becomes a no-operation.
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info)
/*
* This is the task which runs the usermode application
*/
-static int ____call_usermodehelper(void *data)
+static int call_usermodehelper_exec_async(void *data)
{
struct subprocess_info *sub_info = data;
struct cred *new;
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data)
flush_signal_handlers(current, 1);
spin_unlock_irq(&current->sighand->siglock);
- /* We can run anywhere, unlike our parent keventd(). */
- set_cpus_allowed_ptr(current, cpu_all_mask);
-
/*
- * Our parent is keventd, which runs with elevated scheduling priority.
- * Avoid propagating that into the userspace child.
+ * Our parent (unbound workqueue) runs with elevated scheduling
+ * priority. Avoid propagating that into the userspace child.
*/
set_user_nice(current, 0);
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data)
(const char __user *const __user *)sub_info->envp);
out:
sub_info->retval = retval;
- /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
+ /*
+ * call_usermodehelper_exec_sync() will call umh_complete
+ * if UMH_WAIT_PROC.
+ */
if (!(sub_info->wait & UMH_WAIT_PROC))
umh_complete(sub_info);
if (!retval)
@@ -266,15 +265,14 @@ out:
do_exit(0);
}
-/* Keventd can't block, but this (a child) can. */
-static int wait_for_helper(void *data)
+/* Handles UMH_WAIT_PROC. */
+static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
{
- struct subprocess_info *sub_info = data;
pid_t pid;
/* If SIGCLD is ignored sys_wait4 won't populate the status. */
kernel_sigaction(SIGCHLD, SIG_DFL);
- pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
if (pid < 0) {
sub_info->retval = pid;
} else {
@@ -282,44 +280,64 @@ static int wait_for_helper(void *data)
/*
* Normally it is bogus to call wait4() from in-kernel because
* wait4() wants to write the exit code to a userspace address.
- * But wait_for_helper() always runs as keventd, and put_user()
- * to a kernel address works OK for kernel threads, due to their
- * having an mm_segment_t which spans the entire address space.
+ * But call_usermodehelper_exec_sync() always runs as kernel
+ * thread (workqueue) and put_user() to a kernel address works
+ * OK for kernel threads, due to their having an mm_segment_t
+ * which spans the entire address space.
*
* Thus the __user pointer cast is valid here.
*/
sys_wait4(pid, (int __user *)&ret, 0, NULL);
/*
- * If ret is 0, either ____call_usermodehelper failed and the
- * real error code is already in sub_info->retval or
+ * If ret is 0, either call_usermodehelper_exec_async failed and
+ * the real error code is already in sub_info->retval or
* sub_info->retval is 0 anyway, so don't mess with it then.
*/
if (ret)
sub_info->retval = ret;
}
+ /* Restore default kernel sig handler */
+ kernel_sigaction(SIGCHLD, SIG_IGN);
+
umh_complete(sub_info);
- do_exit(0);
}
-/* This is run by khelper thread */
-static void __call_usermodehelper(struct work_struct *work)
+/*
+ * We need to create the usermodehelper kernel thread from a task that is affine
+ * to an optimized set of CPUs (or nohz housekeeping ones) such that they
+ * inherit a widest affinity irrespective of call_usermodehelper() callers with
+ * possibly reduced affinity (eg: per-cpu workqueues). We don't want
+ * usermodehelper targets to contend a busy CPU.
+ *
+ * Unbound workqueues provide such wide affinity and allow to block on
+ * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
+ *
+ * Besides, workqueues provide the privilege level that caller might not have
+ * to perform the usermodehelper request.
+ *
+ */
+static void call_usermodehelper_exec_work(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- pid_t pid;
- if (sub_info->wait & UMH_WAIT_PROC)
- pid = kernel_thread(wait_for_helper, sub_info,
- CLONE_FS | CLONE_FILES | SIGCHLD);
- else
- pid = kernel_thread(____call_usermodehelper, sub_info,
- SIGCHLD);
-
- if (pid < 0) {
- sub_info->retval = pid;
- umh_complete(sub_info);
+ if (sub_info->wait & UMH_WAIT_PROC) {
+ call_usermodehelper_exec_sync(sub_info);
+ } else {
+ pid_t pid;
+ /*
+ * Use CLONE_PARENT to reparent it to kthreadd; we do not
+ * want to pollute current->children, and we need a parent
+ * that always ignores SIGCHLD to ensure auto-reaping.
+ */
+ pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
+ CLONE_PARENT | SIGCHLD);
+ if (pid < 0) {
+ sub_info->retval = pid;
+ umh_complete(sub_info);
+ }
}
}
@@ -509,7 +527,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
if (!sub_info)
goto out;
- INIT_WORK(&sub_info->work, __call_usermodehelper);
+ INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
@@ -531,8 +549,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
* from interrupt context.
*
* Runs a user-space application. The application is started
- * asynchronously if wait is not set, and runs as a child of keventd.
- * (ie. it runs with full root capabilities).
+ * asynchronously if wait is not set, and runs as a child of system workqueues.
+ * (ie. it runs with full root capabilities and optimized affinity).
*/
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
@@ -544,7 +562,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
return -EINVAL;
}
helper_lock();
- if (!khelper_wq || usermodehelper_disabled) {
+ if (usermodehelper_disabled) {
retval = -EBUSY;
goto out;
}
@@ -556,7 +574,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
sub_info->wait = wait;
- queue_work(khelper_wq, &sub_info->work);
+ queue_work(system_unbound_wq, &sub_info->work);
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
@@ -686,9 +704,3 @@ struct ctl_table usermodehelper_table[] = {
},
{ }
};
-
-void __init usermodehelper_init(void)
-{
- khelper_wq = create_singlethread_workqueue("khelper");
- BUG_ON(!khelper_wq);
-}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c90e417bb963..d10ab6b9b5e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
addr < (unsigned long)__kprobes_text_end;
}
-static bool within_kprobe_blacklist(unsigned long addr)
+bool within_kprobe_blacklist(unsigned long addr)
{
struct kprobe_blacklist_entry *ent;
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..e83b26464061 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
KERNEL_ATTR_RW(profiling);
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(vmcoreinfo);
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
/* whether file capabilities are enabled */
static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
#ifdef CONFIG_PROFILING
&profiling_attr.attr,
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
&kexec_loaded_attr.attr,
&kexec_crash_loaded_attr.attr,
&kexec_crash_size_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fdea0bee7b5a..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
* kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
- * @node: memory node number.
+ * @node: task and thread structures for the thread are allocated on this node
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run().
+ * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
*
* If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
* standalone thread for which no one will call kthread_stop(), or
@@ -327,16 +328,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
+ unsigned long flags;
+
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
+
/* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+ __kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+ __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
}
/**
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c40ebcca0495..6e5344112419 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -348,8 +348,10 @@ static void klp_disable_func(struct klp_func *func)
{
struct klp_ops *ops;
- WARN_ON(func->state != KLP_ENABLED);
- WARN_ON(!func->old_addr);
+ if (WARN_ON(func->state != KLP_ENABLED))
+ return;
+ if (WARN_ON(!func->old_addr))
+ return;
ops = klp_find_ops(func->old_addr);
if (WARN_ON(!ops))
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 7dd5c9918e4c..8e96f6cc2a4a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -20,11 +20,9 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
-obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8acfbf773e06..deae3907ac1e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
return;
/* no reclaim without waiting on it */
- if (!(gfp_mask & __GFP_WAIT))
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return;
/* this guy won't enter reclaim */
@@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock);
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references)
+ int references, int pin_count)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->waittime_stamp = 0;
hlock->holdtime_stamp = lockstat_clock();
#endif
- hlock->pin_count = 0;
+ hlock->pin_count = pin_count;
if (check && !mark_irqflags(curr, hlock))
return 0;
@@ -3343,7 +3343,7 @@ found_it:
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references))
+ hlock->references, hlock->pin_count))
return 0;
}
@@ -3433,7 +3433,7 @@ found_it:
hlock_class(hlock)->subclass, hlock->trylock,
hlock->read, hlock->check, hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references))
+ hlock->references, hlock->pin_count))
return 0;
}
@@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
current->lockdep_recursion = 1;
trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0);
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 32244186f1f2..8ef1919d63b2 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -17,12 +17,14 @@
*
* Copyright (C) IBM Corporation, 2014
*
- * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ * Davidlohr Bueso <dave@stgolabs.net>
* Based on kernel/rcu/torture.c.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kthread.h>
+#include <linux/sched/rt.h>
#include <linux/spinlock.h>
#include <linux/rwlock.h>
#include <linux/mutex.h>
@@ -34,6 +36,7 @@
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/slab.h>
+#include <linux/percpu-rwsem.h>
#include <linux/torture.h>
MODULE_LICENSE("GPL");
@@ -91,11 +94,13 @@ struct lock_torture_ops {
void (*init)(void);
int (*writelock)(void);
void (*write_delay)(struct torture_random_state *trsp);
+ void (*task_boost)(struct torture_random_state *trsp);
void (*writeunlock)(void);
int (*readlock)(void);
void (*read_delay)(struct torture_random_state *trsp);
void (*readunlock)(void);
- unsigned long flags;
+
+ unsigned long flags; /* for irq spinlocks */
const char *name;
};
@@ -139,9 +144,15 @@ static void torture_lock_busted_write_unlock(void)
/* BUGGY, do not use in real life!!! */
}
+static void torture_boost_dummy(struct torture_random_state *trsp)
+{
+ /* Only rtmutexes care about priority */
+}
+
static struct lock_torture_ops lock_busted_ops = {
.writelock = torture_lock_busted_write_lock,
.write_delay = torture_lock_busted_write_delay,
+ .task_boost = torture_boost_dummy,
.writeunlock = torture_lock_busted_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -185,6 +196,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_ops = {
.writelock = torture_spin_lock_write_lock,
.write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_boost_dummy,
.writeunlock = torture_spin_lock_write_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -211,6 +223,7 @@ __releases(torture_spinlock)
static struct lock_torture_ops spin_lock_irq_ops = {
.writelock = torture_spin_lock_write_lock_irq,
.write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_boost_dummy,
.writeunlock = torture_lock_spin_write_unlock_irq,
.readlock = NULL,
.read_delay = NULL,
@@ -275,6 +288,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_ops = {
.writelock = torture_rwlock_write_lock,
.write_delay = torture_rwlock_write_delay,
+ .task_boost = torture_boost_dummy,
.writeunlock = torture_rwlock_write_unlock,
.readlock = torture_rwlock_read_lock,
.read_delay = torture_rwlock_read_delay,
@@ -315,6 +329,7 @@ __releases(torture_rwlock)
static struct lock_torture_ops rw_lock_irq_ops = {
.writelock = torture_rwlock_write_lock_irq,
.write_delay = torture_rwlock_write_delay,
+ .task_boost = torture_boost_dummy,
.writeunlock = torture_rwlock_write_unlock_irq,
.readlock = torture_rwlock_read_lock_irq,
.read_delay = torture_rwlock_read_delay,
@@ -354,6 +369,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex)
static struct lock_torture_ops mutex_lock_ops = {
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
+ .task_boost = torture_boost_dummy,
.writeunlock = torture_mutex_unlock,
.readlock = NULL,
.read_delay = NULL,
@@ -361,6 +377,90 @@ static struct lock_torture_ops mutex_lock_ops = {
.name = "mutex_lock"
};
+#ifdef CONFIG_RT_MUTEXES
+static DEFINE_RT_MUTEX(torture_rtmutex);
+
+static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
+{
+ rt_mutex_lock(&torture_rtmutex);
+ return 0;
+}
+
+static void torture_rtmutex_boost(struct torture_random_state *trsp)
+{
+ int policy;
+ struct sched_param param;
+ const unsigned int factor = 50000; /* yes, quite arbitrary */
+
+ if (!rt_task(current)) {
+ /*
+ * (1) Boost priority once every ~50k operations. When the
+ * task tries to take the lock, the rtmutex will account
+ * for the new priority, and do any corresponding pi-dance.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor))) {
+ policy = SCHED_FIFO;
+ param.sched_priority = MAX_RT_PRIO - 1;
+ } else /* common case, do nothing */
+ return;
+ } else {
+ /*
+ * The task will remain boosted for another ~100k operations,
+ * then restored back to its original prio, and so forth.
+ *
+ * When @trsp is nil, we want to force-reset the task for
+ * stopping the kthread.
+ */
+ if (!trsp || !(torture_random(trsp) %
+ (cxt.nrealwriters_stress * factor * 2))) {
+ policy = SCHED_NORMAL;
+ param.sched_priority = 0;
+ } else /* common case, do nothing */
+ return;
+ }
+
+ sched_setscheduler_nocheck(current, policy, &param);
+}
+
+static void torture_rtmutex_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_ms = 100;
+
+ /*
+ * We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+ mdelay(longdelay_ms);
+ if (!(torture_random(trsp) %
+ (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
+{
+ rt_mutex_unlock(&torture_rtmutex);
+}
+
+static struct lock_torture_ops rtmutex_lock_ops = {
+ .writelock = torture_rtmutex_lock,
+ .write_delay = torture_rtmutex_delay,
+ .task_boost = torture_rtmutex_boost,
+ .writeunlock = torture_rtmutex_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "rtmutex_lock"
+};
+#endif
+
static DECLARE_RWSEM(torture_rwsem);
static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
{
@@ -419,6 +519,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem)
static struct lock_torture_ops rwsem_lock_ops = {
.writelock = torture_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
+ .task_boost = torture_boost_dummy,
.writeunlock = torture_rwsem_up_write,
.readlock = torture_rwsem_down_read,
.read_delay = torture_rwsem_read_delay,
@@ -426,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = {
.name = "rwsem_lock"
};
+#include <linux/percpu-rwsem.h>
+static struct percpu_rw_semaphore pcpu_rwsem;
+
+void torture_percpu_rwsem_init(void)
+{
+ BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
+}
+
+static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
+{
+ percpu_down_write(&pcpu_rwsem);
+ return 0;
+}
+
+static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem)
+{
+ percpu_up_write(&pcpu_rwsem);
+}
+
+static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem)
+{
+ percpu_down_read(&pcpu_rwsem);
+ return 0;
+}
+
+static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
+{
+ percpu_up_read(&pcpu_rwsem);
+}
+
+static struct lock_torture_ops percpu_rwsem_lock_ops = {
+ .init = torture_percpu_rwsem_init,
+ .writelock = torture_percpu_rwsem_down_write,
+ .write_delay = torture_rwsem_write_delay,
+ .task_boost = torture_boost_dummy,
+ .writeunlock = torture_percpu_rwsem_up_write,
+ .readlock = torture_percpu_rwsem_down_read,
+ .read_delay = torture_rwsem_read_delay,
+ .readunlock = torture_percpu_rwsem_up_read,
+ .name = "percpu_rwsem_lock"
+};
+
/*
* Lock torture writer kthread. Repeatedly acquires and releases
* the lock, checking for duplicate acquisitions.
@@ -442,6 +585,7 @@ static int lock_torture_writer(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
+ cxt.cur_ops->task_boost(&rand);
cxt.cur_ops->writelock();
if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_lock_fail++;
@@ -456,6 +600,8 @@ static int lock_torture_writer(void *arg)
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
+
+ cxt.cur_ops->task_boost(NULL); /* reset prio */
torture_kthread_stopping("lock_torture_writer");
return 0;
}
@@ -642,7 +788,11 @@ static int __init lock_torture_init(void)
&spin_lock_ops, &spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
+#ifdef CONFIG_RT_MUTEXES
+ &rtmutex_lock_ops,
+#endif
&rwsem_lock_ops,
+ &percpu_rwsem_lock_ops,
};
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -661,11 +811,11 @@ static int __init lock_torture_init(void)
for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
pr_alert(" %s", torture_ops[i]->name);
pr_alert("\n");
- torture_init_end();
- return -EINVAL;
+ firsterr = -EINVAL;
+ goto unwind;
}
if (cxt.cur_ops->init)
- cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+ cxt.cur_ops->init();
if (nwriters_stress >= 0)
cxt.nrealwriters_stress = nwriters_stress;
@@ -676,6 +826,10 @@ static int __init lock_torture_init(void)
if (strncmp(torture_type, "mutex", 5) == 0)
cxt.debug_lock = true;
#endif
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+ if (strncmp(torture_type, "rtmutex", 7) == 0)
+ cxt.debug_lock = true;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
if ((strncmp(torture_type, "spin", 4) == 0) ||
(strncmp(torture_type, "rw_lock", 7) == 0))
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index fd91aaa4554c..5b9102a47ea5 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -67,7 +67,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;
- prev = xchg(lock, node);
+ prev = xchg_acquire(lock, node);
if (likely(prev == NULL)) {
/*
* Lock acquired, don't need to set node->locked to 1. Threads
@@ -98,7 +98,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
/*
* Release the lock by setting it to NULL
*/
- if (likely(cmpxchg(lock, node, NULL) == node))
+ if (likely(cmpxchg_release(lock, node, NULL) == node))
return;
/* Wait until the next pointer is set */
while (!(next = READ_ONCE(node->next)))
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4cccea6b8934..0551c219c40e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -277,7 +277,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
static inline bool mutex_try_to_acquire(struct mutex *lock)
{
return !mutex_is_locked(lock) &&
- (atomic_cmpxchg(&lock->count, 1, 0) == 1);
+ (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1);
}
/*
@@ -529,7 +529,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* Once more, try to acquire the lock. Only try-lock the mutex if
* it is unlocked to reduce unnecessary xchg() operations.
*/
- if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
+ if (!mutex_is_locked(lock) &&
+ (atomic_xchg_acquire(&lock->count, 0) == 1))
goto skip_wait;
debug_mutex_lock_common(lock, &waiter);
@@ -553,7 +554,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* non-negative in order to avoid unnecessary xchg operations:
*/
if (atomic_read(&lock->count) >= 0 &&
- (atomic_xchg(&lock->count, -1) == 1))
+ (atomic_xchg_acquire(&lock->count, -1) == 1))
break;
/*
@@ -867,7 +868,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
spin_lock_mutex(&lock->wait_lock, flags);
- prev = atomic_xchg(&lock->count, -1);
+ prev = atomic_xchg_acquire(&lock->count, -1);
if (likely(prev == 1)) {
mutex_set_owner(lock);
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index dc85ee23a26f..d092a0c9c2d4 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -50,7 +50,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
for (;;) {
if (atomic_read(&lock->tail) == curr &&
- atomic_cmpxchg(&lock->tail, curr, old) == curr) {
+ atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
@@ -92,7 +92,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
node->next = NULL;
node->cpu = curr;
- old = atomic_xchg(&lock->tail, curr);
+ /*
+ * ACQUIRE semantics, pairs with corresponding RELEASE
+ * in unlock() uncontended, or fastpath.
+ */
+ old = atomic_xchg_acquire(&lock->tail, curr);
if (old == OSQ_UNLOCKED_VAL)
return true;
@@ -184,7 +188,8 @@ void osq_unlock(struct optimistic_spin_queue *lock)
/*
* Fast path for the uncontended case.
*/
- if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
+ if (likely(atomic_cmpxchg_release(&lock->tail, curr,
+ OSQ_UNLOCKED_VAL) == curr))
return;
/*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 652a8ee8efe9..f231e0bb311c 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
__init_rwsem(&brw->rw_sem, name, rwsem_key);
- atomic_set(&brw->write_ctr, 0);
+ rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
atomic_set(&brw->slow_read_ctr, 0);
init_waitqueue_head(&brw->write_waitq);
return 0;
}
+EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
{
+ /*
+ * XXX: temporary kludge. The error path in alloc_super()
+ * assumes that percpu_free_rwsem() is safe after kzalloc().
+ */
+ if (!brw->fast_read_ctr)
+ return;
+
+ rcu_sync_dtor(&brw->rss);
free_percpu(brw->fast_read_ctr);
brw->fast_read_ctr = NULL; /* catch use after free bugs */
}
/*
- * This is the fast-path for down_read/up_read, it only needs to ensure
- * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
- * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
- * serialize with the preempt-disabled section below.
- *
- * The nontrivial part is that we should guarantee acquire/release semantics
- * in case when
- *
- * R_W: down_write() comes after up_read(), the writer should see all
- * changes done by the reader
- * or
- * W_R: down_read() comes after up_write(), the reader should see all
- * changes done by the writer
+ * This is the fast-path for down_read/up_read. If it succeeds we rely
+ * on the barriers provided by rcu_sync_enter/exit; see the comments in
+ * percpu_down_write() and percpu_up_write().
*
* If this helper fails the callers rely on the normal rw_semaphore and
* atomic_dec_and_test(), so in this case we have the necessary barriers.
- *
- * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
- * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
- * reader inside the critical section. See the comments in down_write and
- * up_write below.
*/
static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
{
- bool success = false;
+ bool success;
preempt_disable();
- if (likely(!atomic_read(&brw->write_ctr))) {
+ success = rcu_sync_is_idle(&brw->rss);
+ if (likely(success))
__this_cpu_add(*brw->fast_read_ctr, val);
- success = true;
- }
preempt_enable();
return success;
@@ -77,16 +70,30 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
void percpu_down_read(struct percpu_rw_semaphore *brw)
{
might_sleep();
- if (likely(update_fast_ctr(brw, +1))) {
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+ rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+ if (likely(update_fast_ctr(brw, +1)))
return;
- }
- down_read(&brw->rw_sem);
+ /* Avoid rwsem_acquire_read() and rwsem_release() */
+ __down_read(&brw->rw_sem);
atomic_inc(&brw->slow_read_ctr);
- /* avoid up_read()->rwsem_release() */
__up_read(&brw->rw_sem);
}
+EXPORT_SYMBOL_GPL(percpu_down_read);
+
+int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
+{
+ if (unlikely(!update_fast_ctr(brw, +1))) {
+ if (!__down_read_trylock(&brw->rw_sem))
+ return 0;
+ atomic_inc(&brw->slow_read_ctr);
+ __up_read(&brw->rw_sem);
+ }
+
+ rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+ return 1;
+}
void percpu_up_read(struct percpu_rw_semaphore *brw)
{
@@ -99,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw)
if (atomic_dec_and_test(&brw->slow_read_ctr))
wake_up_all(&brw->write_waitq);
}
+EXPORT_SYMBOL_GPL(percpu_up_read);
static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
{
@@ -113,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
return sum;
}
-/*
- * A writer increments ->write_ctr to force the readers to switch to the
- * slow mode, note the atomic_read() check in update_fast_ctr().
- *
- * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
- * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
- * counter it represents the number of active readers.
- *
- * Finally the writer takes ->rw_sem for writing and blocks the new readers,
- * then waits until the slow counter becomes zero.
- */
void percpu_down_write(struct percpu_rw_semaphore *brw)
{
- /* tell update_fast_ctr() there is a pending writer */
- atomic_inc(&brw->write_ctr);
/*
- * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
- * so that update_fast_ctr() can't succeed.
- *
- * 2. Ensures we see the result of every previous this_cpu_add() in
- * update_fast_ctr().
+ * Make rcu_sync_is_idle() == F and thus disable the fast-path in
+ * percpu_down_read() and percpu_up_read(), and wait for gp pass.
*
- * 3. Ensures that if any reader has exited its critical section via
- * fast-path, it executes a full memory barrier before we return.
- * See R_W case in the comment above update_fast_ctr().
+ * The latter synchronises us with the preceding readers which used
+ * the fast-path, so we can not miss the result of __this_cpu_add()
+ * or anything else inside their critical sections.
*/
- synchronize_sched_expedited();
+ rcu_sync_enter(&brw->rss);
/* exclude other writers, and block the new readers completely */
down_write(&brw->rw_sem);
@@ -150,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw)
/* wait for all readers to complete their percpu_up_read() */
wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
}
+EXPORT_SYMBOL_GPL(percpu_down_write);
void percpu_up_write(struct percpu_rw_semaphore *brw)
{
/* release the lock, but the readers can't use the fast-path */
up_write(&brw->rw_sem);
/*
- * Insert the barrier before the next fast-path in down_read,
- * see W_R case in the comment above update_fast_ctr().
+ * Enable the fast-path in percpu_down_read() and percpu_up_read()
+ * but only after another gp pass; this adds the necessary barrier
+ * to ensure the reader can't miss the changes done by us.
*/
- synchronize_sched_expedited();
- /* the last writer unblocks update_fast_ctr() */
- atomic_dec(&brw->write_ctr);
+ rcu_sync_exit(&brw->rss);
}
+EXPORT_SYMBOL_GPL(percpu_up_write);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 6c5da483966b..fec082338668 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
while ((cnts & _QW_WMASK) == _QW_LOCKED) {
cpu_relax_lowlatency();
- cnts = smp_load_acquire((u32 *)&lock->cnts);
+ cnts = atomic_read_acquire(&lock->cnts);
}
}
/**
- * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
+ * @cnts: Current qrwlock lock value
*/
-void queue_read_lock_slowpath(struct qrwlock *lock)
+void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
{
- u32 cnts;
-
/*
* Readers come here when they cannot get the lock without waiting
*/
if (unlikely(in_interrupt())) {
/*
- * Readers in interrupt context will spin until the lock is
- * available without waiting in the queue.
+ * Readers in interrupt context will get the lock immediately
+ * if the writer is just waiting (not holding the lock yet).
+ * The rspin_until_writer_unlock() function returns immediately
+ * in this case. Otherwise, they will spin (with ACQUIRE
+ * semantics) until the lock is available without waiting in
+ * the queue.
*/
- cnts = smp_load_acquire((u32 *)&lock->cnts);
rspin_until_writer_unlock(lock, cnts);
return;
}
@@ -84,42 +86,37 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
/*
* Put the reader into the wait queue
*/
- arch_spin_lock(&lock->lock);
+ arch_spin_lock(&lock->wait_lock);
/*
- * At the head of the wait queue now, wait until the writer state
- * goes to 0 and then try to increment the reader count and get
- * the lock. It is possible that an incoming writer may steal the
- * lock in the interim, so it is necessary to check the writer byte
- * to make sure that the write lock isn't taken.
+ * The ACQUIRE semantics of the following spinning code ensure
+ * that accesses can't leak upwards out of our subsequent critical
+ * section in the case that the lock is currently held for write.
*/
- while (atomic_read(&lock->cnts) & _QW_WMASK)
- cpu_relax_lowlatency();
-
- cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
rspin_until_writer_unlock(lock, cnts);
/*
* Signal the next one in queue to become queue head
*/
- arch_spin_unlock(&lock->lock);
+ arch_spin_unlock(&lock->wait_lock);
}
-EXPORT_SYMBOL(queue_read_lock_slowpath);
+EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * queued_write_lock_slowpath - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-void queue_write_lock_slowpath(struct qrwlock *lock)
+void queued_write_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;
/* Put the writer into the wait queue */
- arch_spin_lock(&lock->lock);
+ arch_spin_lock(&lock->wait_lock);
/* Try to acquire the lock directly if no reader is present */
if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
/*
@@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
struct __qrwlock *l = (struct __qrwlock *)lock;
if (!READ_ONCE(l->wmode) &&
- (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
+ (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
break;
cpu_relax_lowlatency();
@@ -140,13 +137,13 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
for (;;) {
cnts = atomic_read(&lock->cnts);
if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
+ (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) == _QW_WAITING))
break;
cpu_relax_lowlatency();
}
unlock:
- arch_spin_unlock(&lock->lock);
+ arch_spin_unlock(&lock->wait_lock);
}
-EXPORT_SYMBOL(queue_write_lock_slowpath);
+EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38c49202d532..87e9ce6a63c5 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
-
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_head(struct qspinlock *lock,
struct mcs_spinlock *node) { }
@@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (pv_enabled())
goto queue;
- if (virt_queued_spin_lock(lock))
+ if (virt_spin_lock(lock))
return;
/*
@@ -440,7 +440,7 @@ queue:
cpu_relax();
arch_mcs_spin_unlock_contended(&next->locked);
- pv_kick_node(next);
+ pv_kick_node(lock, next);
release:
/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 04ab18151cc8..f0450ff4829b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -4,6 +4,7 @@
#include <linux/hash.h>
#include <linux/bootmem.h>
+#include <linux/debug_locks.h>
/*
* Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
@@ -21,9 +22,14 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
enum vcpu_state {
vcpu_running = 0,
- vcpu_halted,
+ vcpu_halted, /* Used only in pv_wait_node */
+ vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
};
struct pv_node {
@@ -152,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
/*
* Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
*/
static void pv_wait_node(struct mcs_spinlock *node)
{
@@ -171,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
*
* [S] pn->state = vcpu_halted [S] next->locked = 1
* MB MB
- * [L] pn->locked [RmW] pn->state = vcpu_running
+ * [L] pn->locked [RmW] pn->state = vcpu_hashed
*
- * Matches the xchg() from pv_kick_node().
+ * Matches the cmpxchg() from pv_kick_node().
*/
smp_store_mb(pn->state, vcpu_halted);
@@ -181,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
pv_wait(&pn->state, vcpu_halted);
/*
- * Reset the vCPU state to avoid unncessary CPU kicking
+ * If pv_kick_node() changed us to vcpu_hashed, retain that value
+ * so that pv_wait_head() knows to not also try to hash this lock.
*/
- WRITE_ONCE(pn->state, vcpu_running);
+ cmpxchg(&pn->state, vcpu_halted, vcpu_running);
/*
* If the locked flag is still not set after wakeup, it is a
@@ -193,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* MCS lock will be released soon.
*/
}
+
/*
* By now our node->locked should be 1 and our caller will not actually
* spin-wait for it. We do however rely on our caller to do a
@@ -201,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
}
/*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state such
+ * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
*/
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct __qspinlock *l = (void *)lock;
/*
- * Note that because node->locked is already set, this actual
- * mcs_spinlock entry could be re-used already.
+ * If the vCPU is indeed halted, advance its state to match that of
+ * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+ * observe its next->locked value and advance itself.
*
- * This should be fine however, kicking people for no reason is
- * harmless.
+ * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ */
+ if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ return;
+
+ /*
+ * Put the lock into the hash table and set the _Q_SLOW_VAL.
*
- * See the comment in pv_wait_node().
+ * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+ * the hash table later on at unlock time, no atomic instruction is
+ * needed.
*/
- if (xchg(&pn->state, vcpu_running) == vcpu_halted)
- pv_kick(pn->cpu);
+ WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+ (void)pv_hash(lock, pn);
}
/*
@@ -232,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
struct qspinlock **lp = NULL;
int loop;
+ /*
+ * If pv_kick_node() already advanced our state, we don't need to
+ * insert ourselves into the hash table anymore.
+ */
+ if (READ_ONCE(pn->state) == vcpu_hashed)
+ lp = (struct qspinlock **)1;
+
for (;;) {
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (!READ_ONCE(l->locked))
@@ -239,17 +266,19 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
cpu_relax();
}
- WRITE_ONCE(pn->state, vcpu_halted);
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
+
/*
- * lp must be set before setting _Q_SLOW_VAL
+ * We must hash before setting _Q_SLOW_VAL, such that
+ * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+ * we'll be sure to be able to observe our hash entry.
*
- * [S] lp = lock [RmW] l = l->locked = 0
- * MB MB
- * [S] l->locked = _Q_SLOW_VAL [L] lp
+ * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL
+ * MB RMB
+ * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
*
- * Matches the cmpxchg() in __pv_queued_spin_unlock().
+ * Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
/*
@@ -286,14 +315,32 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
+ u8 locked;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL))
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
+ return;
+
+ if (unlikely(locked != _Q_SLOW_VAL)) {
+ WARN(!debug_locks_silent,
+ "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+ (unsigned long)lock, atomic_read(&lock->val));
return;
+ }
+
+ /*
+ * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+ * so we need a barrier to order the read of the node data in
+ * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+ *
+ * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ */
+ smp_rmb();
/*
* Since the above failed to release, this must be the SLOW path.
@@ -310,9 +357,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
/*
* At this point the memory pointed at by lock can be freed/reused,
* however we can still use the pv_node to kick the CPU.
+ * The other vCPU may not really be halted, but kicking an active
+ * vCPU is harmless other than the additional latency in completing
+ * the unlock.
*/
- if (READ_ONCE(node->state) == vcpu_halted)
- pv_kick(node->cpu);
+ pv_kick(node->cpu);
}
/*
* Include the architecture specific callee-save thunk of the
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
deleted file mode 100644
index 1d96dd0d93c1..000000000000
--- a/kernel/locking/rtmutex-tester.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * RT-Mutex-tester: scriptable tester for rt mutexes
- *
- * started by Thomas Gleixner:
- *
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- */
-#include <linux/device.h>
-#include <linux/kthread.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/freezer.h>
-#include <linux/stat.h>
-
-#include "rtmutex.h"
-
-#define MAX_RT_TEST_THREADS 8
-#define MAX_RT_TEST_MUTEXES 8
-
-static spinlock_t rttest_lock;
-static atomic_t rttest_event;
-
-struct test_thread_data {
- int opcode;
- int opdata;
- int mutexes[MAX_RT_TEST_MUTEXES];
- int event;
- struct device dev;
-};
-
-static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
-static struct task_struct *threads[MAX_RT_TEST_THREADS];
-static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
-
-enum test_opcodes {
- RTTEST_NOP = 0,
- RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
- RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
- RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
- RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
- RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
- RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
- /* 9, 10 - reserved for BKL commemoration */
- RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
- RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
- RTTEST_RESET = 99, /* 99 Reset all pending operations */
-};
-
-static int handle_op(struct test_thread_data *td, int lockwakeup)
-{
- int i, id, ret = -EINVAL;
-
- switch(td->opcode) {
-
- case RTTEST_NOP:
- return 0;
-
- case RTTEST_LOCKCONT:
- td->mutexes[td->opdata] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return 0;
-
- case RTTEST_RESET:
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
- if (td->mutexes[i] == 4) {
- rt_mutex_unlock(&mutexes[i]);
- td->mutexes[i] = 0;
- }
- }
- return 0;
-
- case RTTEST_RESETEVENT:
- atomic_set(&rttest_event, 0);
- return 0;
-
- default:
- if (lockwakeup)
- return ret;
- }
-
- switch(td->opcode) {
-
- case RTTEST_LOCK:
- case RTTEST_LOCKNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_lock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 4;
- return 0;
-
- case RTTEST_LOCKINT:
- case RTTEST_LOCKINTNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = ret ? 0 : 4;
- return ret ? -EINTR : 0;
-
- case RTTEST_UNLOCK:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
- return ret;
-
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_unlock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 0;
- return 0;
-
- default:
- break;
- }
- return ret;
-}
-
-/*
- * Schedule replacement for rtsem_down(). Only called for threads with
- * PF_MUTEX_TESTER set.
- *
- * This allows us to have finegrained control over the event flow.
- *
- */
-void schedule_rt_mutex_test(struct rt_mutex *mutex)
-{
- int tid, op, dat;
- struct test_thread_data *td;
-
- /* We have to lookup the task */
- for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
- if (threads[tid] == current)
- break;
- }
-
- BUG_ON(tid == MAX_RT_TEST_THREADS);
-
- td = &thread_data[tid];
-
- op = td->opcode;
- dat = td->opdata;
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- break;
-
- if (td->mutexes[dat] != 1)
- break;
-
- td->mutexes[dat] = 2;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- default:
- break;
- }
-
- schedule();
-
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 3;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return;
-
- default:
- return;
- }
-
- td->opcode = 0;
-
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- int ret;
-
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 1);
- set_current_state(TASK_INTERRUPTIBLE);
- if (td->opcode == RTTEST_LOCKCONT)
- break;
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- }
-
- /* Restore previous command and data */
- td->opcode = op;
- td->opdata = dat;
-}
-
-static int test_func(void *data)
-{
- struct test_thread_data *td = data;
- int ret;
-
- current->flags |= PF_MUTEX_TESTER;
- set_freezable();
- allow_signal(SIGHUP);
-
- for(;;) {
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 0);
- set_current_state(TASK_INTERRUPTIBLE);
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- try_to_freeze();
-
- if (signal_pending(current))
- flush_signals(current);
-
- if(kthread_should_stop())
- break;
- }
- return 0;
-}
-
-/**
- * sysfs_test_command - interface for test commands
- * @dev: thread reference
- * @buf: command for actual step
- * @count: length of buffer
- *
- * command syntax:
- *
- * opcode:data
- */
-static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct sched_param schedpar;
- struct test_thread_data *td;
- char cmdbuf[32];
- int op, dat, tid, ret;
-
- td = container_of(dev, struct test_thread_data, dev);
- tid = td->dev.id;
-
- /* strings from sysfs write are not 0 terminated! */
- if (count >= sizeof(cmdbuf))
- return -EINVAL;
-
- /* strip of \n: */
- if (buf[count-1] == '\n')
- count--;
- if (count < 1)
- return -EINVAL;
-
- memcpy(cmdbuf, buf, count);
- cmdbuf[count] = 0;
-
- if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
- return -EINVAL;
-
- switch (op) {
- case RTTEST_SCHEDOT:
- schedpar.sched_priority = 0;
- ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
- if (ret)
- return ret;
- set_user_nice(current, 0);
- break;
-
- case RTTEST_SCHEDRT:
- schedpar.sched_priority = dat;
- ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
- if (ret)
- return ret;
- break;
-
- case RTTEST_SIGNAL:
- send_sig(SIGHUP, threads[tid], 0);
- break;
-
- default:
- if (td->opcode > 0)
- return -EBUSY;
- td->opdata = dat;
- td->opcode = op;
- wake_up_process(threads[tid]);
- }
-
- return count;
-}
-
-/**
- * sysfs_test_status - sysfs interface for rt tester
- * @dev: thread to query
- * @buf: char buffer to be filled with thread status info
- */
-static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct test_thread_data *td;
- struct task_struct *tsk;
- char *curr = buf;
- int i;
-
- td = container_of(dev, struct test_thread_data, dev);
- tsk = threads[td->dev.id];
-
- spin_lock(&rttest_lock);
-
- curr += sprintf(curr,
- "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
- td->opcode, td->event, tsk->state,
- (MAX_RT_PRIO - 1) - tsk->prio,
- (MAX_RT_PRIO - 1) - tsk->normal_prio,
- tsk->pi_blocked_on);
-
- for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
- curr += sprintf(curr, "%d", td->mutexes[i]);
-
- spin_unlock(&rttest_lock);
-
- curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
- mutexes[td->dev.id].owner);
-
- return curr - buf;
-}
-
-static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
-
-static struct bus_type rttest_subsys = {
- .name = "rttest",
- .dev_name = "rttest",
-};
-
-static int init_test_thread(int id)
-{
- thread_data[id].dev.bus = &rttest_subsys;
- thread_data[id].dev.id = id;
-
- threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
- if (IS_ERR(threads[id]))
- return PTR_ERR(threads[id]);
-
- return device_register(&thread_data[id].dev);
-}
-
-static int init_rttest(void)
-{
- int ret, i;
-
- spin_lock_init(&rttest_lock);
-
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
- rt_mutex_init(&mutexes[i]);
-
- ret = subsys_system_register(&rttest_subsys, NULL);
- if (ret)
- return ret;
-
- for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
- ret = init_test_thread(i);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
- if (ret)
- break;
- }
-
- printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
-
- return ret;
-}
-
-device_initcall(init_rttest);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5674b073473c..8251e75dd9c0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -74,14 +74,23 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
* set up.
*/
#ifndef CONFIG_DEBUG_RT_MUTEXES
-# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (cmpxchg_relaxed(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (cmpxchg_acquire(&l->owner, c, n) == c)
+# define rt_mutex_cmpxchg_release(l,c,n) (cmpxchg_release(&l->owner, c, n) == c)
+
+/*
+ * Callers must hold the ->wait_lock -- which is the whole purpose as we force
+ * all future threads that attempt to [Rmw] the lock to the slowpath. As such
+ * relaxed semantics suffice.
+ */
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
unsigned long owner, *p = (unsigned long *) &lock->owner;
do {
owner = *p;
- } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+ } while (cmpxchg_relaxed(p, owner,
+ owner | RT_MUTEX_HAS_WAITERS) != owner);
}
/*
@@ -121,11 +130,14 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
* lock(wait_lock);
* acquire(lock);
*/
- return rt_mutex_cmpxchg(lock, owner, NULL);
+ return rt_mutex_cmpxchg_release(lock, owner, NULL);
}
#else
-# define rt_mutex_cmpxchg(l,c,n) (0)
+# define rt_mutex_cmpxchg_relaxed(l,c,n) (0)
+# define rt_mutex_cmpxchg_acquire(l,c,n) (0)
+# define rt_mutex_cmpxchg_release(l,c,n) (0)
+
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
{
lock->owner = (struct task_struct *)
@@ -158,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
* then right waiter has a dl_prio() too.
*/
if (dl_prio(left->prio))
- return (left->task->dl.deadline < right->task->dl.deadline);
+ return dl_time_before(left->task->dl.deadline,
+ right->task->dl.deadline);
return 0;
}
@@ -1120,7 +1133,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
debug_rt_mutex_print_deadlock(waiter);
- schedule_rt_mutex(lock);
+ schedule();
raw_spin_lock(&lock->wait_lock);
set_current_state(state);
@@ -1321,7 +1334,7 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk))
{
- if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
@@ -1337,7 +1350,7 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
@@ -1348,7 +1361,7 @@ static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 1;
}
@@ -1362,7 +1375,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
{
WAKE_Q(wake_q);
- if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
} else {
@@ -1484,7 +1497,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
struct wake_q_head *wqh)
{
- if (likely(rt_mutex_cmpxchg(lock, current, NULL))) {
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
rt_mutex_deadlock_account_unlock(current);
return false;
}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7844f8f0e639..4f5f83c7d2d3 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -15,28 +15,6 @@
#include <linux/rtmutex.h>
/*
- * The rtmutex in kernel tester is independent of rtmutex debugging. We
- * call schedule_rt_mutex_test() instead of schedule() for the tasks which
- * belong to the tester. That way we can delay the wakeup path of those
- * threads to provoke lock stealing and testing of complex boosting scenarios.
- */
-#ifdef CONFIG_RT_MUTEX_TESTER
-
-extern void schedule_rt_mutex_test(struct rt_mutex *lock);
-
-#define schedule_rt_mutex(_lock) \
- do { \
- if (!(current->flags & PF_MUTEX_TESTER)) \
- schedule(); \
- else \
- schedule_rt_mutex_test(_lock); \
- } while (0)
-
-#else
-# define schedule_rt_mutex(_lock) schedule()
-#endif
-
-/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 0f189714e457..a4d4de05b2d1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
* to reduce unnecessary expensive cmpxchg() operations.
*/
if (count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
if (!list_is_singular(&sem->wait_list))
rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
@@ -285,7 +285,8 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
- old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+ old = cmpxchg_acquire(&sem->count, count,
+ count + RWSEM_ACTIVE_WRITE_BIAS);
if (old == count) {
rwsem_set_owner(sem);
return true;
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
new file mode 100644
index 000000000000..536c727a56e9
--- /dev/null
+++ b/kernel/membarrier.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+
+/*
+ * Bitmask made from an "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, or if the command argument is invalid,
+ * this system call returns -EINVAL. For a given command, with flags argument
+ * set to 0, this system call is guaranteed to always return the same value
+ * until reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * are guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ * barrier() smp_mb() sys_membarrier()
+ * barrier() X X O
+ * smp_mb() X O O
+ * sys_membarrier() O O O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_QUERY:
+ return MEMBARRIER_CMD_BITMASK;
+ case MEMBARRIER_CMD_SHARED:
+ if (num_online_cpus() > 1)
+ synchronize_sched();
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
diff --git a/kernel/memremap.c b/kernel/memremap.c
new file mode 100644
index 000000000000..9d6b55587eaa
--- /dev/null
+++ b/kernel/memremap.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/memory_hotplug.h>
+
+#ifndef ioremap_cache
+/* temporary while we convert existing ioremap_cache users to memremap */
+__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
+{
+ return ioremap(offset, size);
+}
+#endif
+
+static void *try_ram_remap(resource_size_t offset, size_t size)
+{
+ struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
+
+ /* In the simple case just return the existing linear address */
+ if (!PageHighMem(page))
+ return __va(offset);
+ return NULL; /* fallback to ioremap_cache */
+}
+
+/**
+ * memremap() - remap an iomem_resource as cacheable memory
+ * @offset: iomem resource start address
+ * @size: size of remap
+ * @flags: either MEMREMAP_WB or MEMREMAP_WT
+ *
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable.
+ *
+ * MEMREMAP_WB - matches the default mapping for "System RAM" on
+ * the architecture. This is usually a read-allocate write-back cache.
+ * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * memremap() will bypass establishing a new mapping and instead return
+ * a pointer into the direct map.
+ *
+ * MEMREMAP_WT - establish a mapping whereby writes either bypass the
+ * cache or are written through to memory and never exist in a
+ * cache-dirty state with respect to program visibility. Attempts to
+ * map "System RAM" with this mapping type will fail.
+ */
+void *memremap(resource_size_t offset, size_t size, unsigned long flags)
+{
+ int is_ram = region_intersects(offset, size, "System RAM");
+ void *addr = NULL;
+
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ /* Try all mapping types requested until one returns non-NULL */
+ if (flags & MEMREMAP_WB) {
+ flags &= ~MEMREMAP_WB;
+ /*
+ * MEMREMAP_WB is special in that it can be satisfied
+ * from the direct map. Some archs depend on the
+ * capability of memremap() to autodetect cases where
+ * the requested range is potentially in "System RAM"
+ */
+ if (is_ram == REGION_INTERSECTS)
+ addr = try_ram_remap(offset, size);
+ if (!addr)
+ addr = ioremap_cache(offset, size);
+ }
+
+ /*
+ * If we don't have a mapping yet and more request flags are
+ * pending then we will be attempting to establish a new virtual
+ * address mapping. Enforce that this mapping is not aliasing
+ * "System RAM"
+ */
+ if (!addr && is_ram == REGION_INTERSECTS && flags) {
+ WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ if (!addr && (flags & MEMREMAP_WT)) {
+ flags &= ~MEMREMAP_WT;
+ addr = ioremap_wt(offset, size);
+ }
+
+ return addr;
+}
+EXPORT_SYMBOL(memremap);
+
+void memunmap(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ iounmap((void __iomem *) addr);
+}
+EXPORT_SYMBOL(memunmap);
+
+static void devm_memremap_release(struct device *dev, void *res)
+{
+ memunmap(res);
+}
+
+static int devm_memremap_match(struct device *dev, void *res, void *match_data)
+{
+ return *(void **)res == match_data;
+}
+
+void *devm_memremap(struct device *dev, resource_size_t offset,
+ size_t size, unsigned long flags)
+{
+ void **ptr, *addr;
+
+ ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return NULL;
+
+ addr = memremap(offset, size, flags);
+ if (addr) {
+ *ptr = addr;
+ devres_add(dev, ptr);
+ } else
+ devres_free(ptr);
+
+ return addr;
+}
+EXPORT_SYMBOL(devm_memremap);
+
+void devm_memunmap(struct device *dev, void *addr)
+{
+ WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match,
+ addr));
+ memunmap(addr);
+}
+EXPORT_SYMBOL(devm_memunmap);
+
+#ifdef CONFIG_ZONE_DEVICE
+struct page_map {
+ struct resource res;
+};
+
+static void devm_memremap_pages_release(struct device *dev, void *res)
+{
+ struct page_map *page_map = res;
+
+ /* pages are dead and unused, undo the arch mapping */
+ arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+}
+
+void *devm_memremap_pages(struct device *dev, struct resource *res)
+{
+ int is_ram = region_intersects(res->start, resource_size(res),
+ "System RAM");
+ struct page_map *page_map;
+ int error, nid;
+
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "%s attempted on mixed region %pr\n",
+ __func__, res);
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (is_ram == REGION_INTERSECTS)
+ return __va(res->start);
+
+ page_map = devres_alloc(devm_memremap_pages_release,
+ sizeof(*page_map), GFP_KERNEL);
+ if (!page_map)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(&page_map->res, res, sizeof(*res));
+
+ nid = dev_to_node(dev);
+ if (nid < 0)
+ nid = 0;
+
+ error = arch_add_memory(nid, res->start, resource_size(res), true);
+ if (error) {
+ devres_free(page_map);
+ return ERR_PTR(error);
+ }
+
+ devres_add(dev, page_map);
+ return __va(res->start);
+}
+EXPORT_SYMBOL(devm_memremap_pages);
+#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/module.c b/kernel/module.c
index b86b7bf1be38..8f051a106676 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1063,11 +1063,15 @@ void symbol_put_addr(void *addr)
if (core_kernel_text(a))
return;
- /* module_text_address is safe here: we're supposed to have reference
- * to module from symbol_get, so it can't go away. */
+ /*
+ * Even though we hold a reference on the module; we still need to
+ * disable preemption in order to safely traverse the data structure.
+ */
+ preempt_disable();
modaddr = __module_text_address(a);
BUG_ON(!modaddr);
module_put(modaddr);
+ preempt_enable();
}
EXPORT_SYMBOL_GPL(symbol_put_addr);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index be5b8fac4bd0..6528a79d998d 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -10,11 +10,9 @@
*/
#include <linux/kernel.h>
-#include <linux/err.h>
-#include <crypto/public_key.h>
-#include <crypto/hash.h>
-#include <keys/asymmetric-type.h>
+#include <linux/errno.h>
#include <keys/system_keyring.h>
+#include <crypto/public_key.h>
#include "module-internal.h"
/*
@@ -28,170 +26,22 @@
* - Information block
*/
struct module_signature {
- u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
- u8 hash; /* Digest algorithm [enum hash_algo] */
- u8 id_type; /* Key identifier type [enum pkey_id_type] */
- u8 signer_len; /* Length of signer's name */
- u8 key_id_len; /* Length of key identifier */
+ u8 algo; /* Public-key crypto algorithm [0] */
+ u8 hash; /* Digest algorithm [0] */
+ u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */
+ u8 signer_len; /* Length of signer's name [0] */
+ u8 key_id_len; /* Length of key identifier [0] */
u8 __pad[3];
__be32 sig_len; /* Length of signature data */
};
/*
- * Digest the module contents.
- */
-static struct public_key_signature *mod_make_digest(enum hash_algo hash,
- const void *mod,
- unsigned long modlen)
-{
- struct public_key_signature *pks;
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- size_t digest_size, desc_size;
- int ret;
-
- pr_devel("==>%s()\n", __func__);
-
- /* Allocate the hashing algorithm we're going to need and find out how
- * big the hash operational data will be.
- */
- tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
- if (IS_ERR(tfm))
- return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- digest_size = crypto_shash_digestsize(tfm);
-
- /* We allocate the hash operational data storage on the end of our
- * context data and the digest output buffer on the end of that.
- */
- ret = -ENOMEM;
- pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
- if (!pks)
- goto error_no_pks;
-
- pks->pkey_hash_algo = hash;
- pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
- pks->digest_size = digest_size;
-
- desc = (void *)pks + sizeof(*pks);
- desc->tfm = tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto error;
-
- ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
- if (ret < 0)
- goto error;
-
- crypto_free_shash(tfm);
- pr_devel("<==%s() = ok\n", __func__);
- return pks;
-
-error:
- kfree(pks);
-error_no_pks:
- crypto_free_shash(tfm);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ERR_PTR(ret);
-}
-
-/*
- * Extract an MPI array from the signature data. This represents the actual
- * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
- * size of the MPI in bytes.
- *
- * RSA signatures only have one MPI, so currently we only read one.
- */
-static int mod_extract_mpi_array(struct public_key_signature *pks,
- const void *data, size_t len)
-{
- size_t nbytes;
- MPI mpi;
-
- if (len < 3)
- return -EBADMSG;
- nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
- data += 2;
- len -= 2;
- if (len != nbytes)
- return -EBADMSG;
-
- mpi = mpi_read_raw_data(data, nbytes);
- if (!mpi)
- return -ENOMEM;
- pks->mpi[0] = mpi;
- pks->nr_mpi = 1;
- return 0;
-}
-
-/*
- * Request an asymmetric key.
- */
-static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
- const u8 *key_id, size_t key_id_len)
-{
- key_ref_t key;
- size_t i;
- char *id, *q;
-
- pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
-
- /* Construct an identifier. */
- id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
- if (!id)
- return ERR_PTR(-ENOKEY);
-
- memcpy(id, signer, signer_len);
-
- q = id + signer_len;
- *q++ = ':';
- *q++ = ' ';
- for (i = 0; i < key_id_len; i++) {
- *q++ = hex_asc[*key_id >> 4];
- *q++ = hex_asc[*key_id++ & 0x0f];
- }
-
- *q = 0;
-
- pr_debug("Look up: \"%s\"\n", id);
-
- key = keyring_search(make_key_ref(system_trusted_keyring, 1),
- &key_type_asymmetric, id);
- if (IS_ERR(key))
- pr_warn("Request for unknown module key '%s' err %ld\n",
- id, PTR_ERR(key));
- kfree(id);
-
- if (IS_ERR(key)) {
- switch (PTR_ERR(key)) {
- /* Hide some search errors */
- case -EACCES:
- case -ENOTDIR:
- case -EAGAIN:
- return ERR_PTR(-ENOKEY);
- default:
- return ERR_CAST(key);
- }
- }
-
- pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
- return key_ref_to_ptr(key);
-}
-
-/*
* Verify the signature on a module.
*/
int mod_verify_sig(const void *mod, unsigned long *_modlen)
{
- struct public_key_signature *pks;
struct module_signature ms;
- struct key *key;
- const void *sig;
size_t modlen = *_modlen, sig_len;
- int ret;
pr_devel("==>%s(,%zu)\n", __func__, modlen);
@@ -205,46 +55,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
if (sig_len >= modlen)
return -EBADMSG;
modlen -= sig_len;
- if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
- return -EBADMSG;
- modlen -= (size_t)ms.signer_len + ms.key_id_len;
-
*_modlen = modlen;
- sig = mod + modlen;
-
- /* For the moment, only support RSA and X.509 identifiers */
- if (ms.algo != PKEY_ALGO_RSA ||
- ms.id_type != PKEY_ID_X509)
- return -ENOPKG;
- if (ms.hash >= PKEY_HASH__LAST ||
- !hash_algo_name[ms.hash])
+ if (ms.id_type != PKEY_ID_PKCS7) {
+ pr_err("Module is not signed with expected PKCS#7 message\n");
return -ENOPKG;
-
- key = request_asymmetric_key(sig, ms.signer_len,
- sig + ms.signer_len, ms.key_id_len);
- if (IS_ERR(key))
- return PTR_ERR(key);
-
- pks = mod_make_digest(ms.hash, mod, modlen);
- if (IS_ERR(pks)) {
- ret = PTR_ERR(pks);
- goto error_put_key;
}
- ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
- sig_len);
- if (ret < 0)
- goto error_free_pks;
-
- ret = verify_signature(key, pks);
- pr_devel("verify_signature() = %d\n", ret);
+ if (ms.algo != 0 ||
+ ms.hash != 0 ||
+ ms.signer_len != 0 ||
+ ms.key_id_len != 0 ||
+ ms.__pad[0] != 0 ||
+ ms.__pad[1] != 0 ||
+ ms.__pad[2] != 0) {
+ pr_err("PKCS#7 signature info has unexpected non-zero params\n");
+ return -EBADMSG;
+ }
-error_free_pks:
- mpi_free(pks->rsa.s);
- kfree(pks);
-error_put_key:
- key_put(key);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ret;
+ return system_verify_data(mod, modlen, mod + modlen, sig_len,
+ VERIFYING_MODULE_SIGNATURE);
}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index ae9fc7cc360e..fd2c9acbcc19 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str,
.signr = sig,
};
+ RCU_LOCKDEP_WARN(!rcu_is_watching(),
+ "notify_die called but RCU thinks we're quiescent");
return atomic_notifier_call_chain(&die_chain, val, &args);
}
NOKPROBE_SYMBOL(notify_die);
diff --git a/kernel/panic.c b/kernel/panic.c
index 04e91ff7560b..4579dbb7ed87 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,7 @@
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
+#include <linux/console.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -147,6 +148,15 @@ void panic(const char *fmt, ...)
bust_spinlocks(0);
+ /*
+ * We may have ended up stopping the CPU holding the lock (in
+ * smp_send_stop()) while still having some valuable data in the console
+ * buffer. Try to acquire the lock then release it regardless of the
+ * result. The release will also print the buffers out.
+ */
+ console_trylock();
+ console_unlock();
+
if (!panic_blink)
panic_blink = no_blink;
diff --git a/kernel/params.c b/kernel/params.c
index ed1e0a1cffa7..a6d6149c0fe6 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -326,10 +326,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_get_charp);
-static void param_free_charp(void *arg)
+void param_free_charp(void *arg)
{
maybe_kfree_parameter(*((char **)arg));
}
+EXPORT_SYMBOL(param_free_charp);
const struct kernel_param_ops param_ops_charp = {
.set = param_set_charp,
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fd07d5b7baf..ca368793808e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task);
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
- rcu_lockdep_assert(rcu_read_lock_held(),
- "find_task_by_pid_ns() needs rcu_read_lock()"
- " protection");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+ "find_task_by_pid_ns() needs rcu_read_lock() protection");
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9e302315e33d..02e8dfaa1ce2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,16 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config SUSPEND_SKIP_SYNC
+ bool "Skip kernel's sys_sync() on suspend to RAM/standby"
+ depends on SUSPEND
+ depends on EXPERT
+ help
+ Skip the kernel sys_sync() before freezing user processes.
+ Some systems prefer not to pay this cost on every invocation
+ of suspend, or they are content with invoking sync() from
+ user-space before invoking suspend. Say Y if that's your case.
+
config HIBERNATE_CALLBACKS
bool
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 690f78f210f2..b7342a24f559 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -733,7 +733,7 @@ int hibernate(void)
* contents of memory is restored from the saved image.
*
* If this is successful, control reappears in the restored target kernel in
- * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine
+ * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
* attempts to recover gracefully and make the kernel return to the normal mode
* of operation.
*/
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 63d395b5df93..b2dd4d999900 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -272,6 +272,22 @@ static inline void pm_print_times_init(void)
{
pm_print_times_enabled = !!initcall_debug;
}
+
+static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
+}
+
+static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ return -EINVAL;
+}
+power_attr(pm_wakeup_irq);
+
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
@@ -604,6 +620,7 @@ static struct attribute * g[] = {
#endif
#ifdef CONFIG_PM_SLEEP_DEBUG
&pm_print_times_attr.attr,
+ &pm_wakeup_irq_attr.attr,
#endif
#endif
#ifdef CONFIG_FREEZER
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5235dd4e1e2f..3a970604308f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
while (to_alloc-- > 0) {
struct page *page;
- page = alloc_image_page(__GFP_HIGHMEM);
+ page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
memory_bm_set_bit(bm, page_to_pfn(page));
}
return nr_highmem;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 53266b729fd9..f9fe133c13e2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -35,6 +35,9 @@
const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
const char *pm_states[PM_SUSPEND_MAX];
+unsigned int pm_suspend_global_flags;
+EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
+
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_freeze_ops *freeze_ops;
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
@@ -484,13 +487,16 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_FREEZE)
freeze_begin();
+#ifndef CONFIG_SUSPEND_SKIP_SYNC
trace_suspend_resume(TPS("sync_filesystems"), 0, true);
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+#endif
pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
+ pm_suspend_clear_flags();
error = suspend_prepare(state);
if (error)
goto Unlock;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2f30ca91e4fa..12cd989dadf6 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -227,27 +227,23 @@ static void hib_init_batch(struct hib_bio_batch *hb)
hb->error = 0;
}
-static void hib_end_io(struct bio *bio, int error)
+static void hib_end_io(struct bio *bio)
{
struct hib_bio_batch *hb = bio->bi_private;
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
- if (!uptodate || error) {
+ if (bio->bi_error) {
printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
imajor(bio->bi_bdev->bd_inode),
iminor(bio->bi_bdev->bd_inode),
(unsigned long long)bio->bi_iter.bi_sector);
-
- if (!error)
- error = -EIO;
}
if (bio_data_dir(bio) == WRITE)
put_page(page);
- if (error && !hb->error)
- hb->error = error;
+ if (bio->bi_error && !hb->error)
+ hb->error = bio->bi_error;
if (atomic_dec_and_test(&hb->count))
wake_up(&hb->wait);
@@ -261,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+ bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = hib_resume_bdev;
@@ -360,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
return -ENOSPC;
if (hb) {
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
+ src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
__GFP_NORETRY);
if (src) {
copy_page(src, buf);
@@ -368,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
ret = hib_wait_io(hb); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_WAIT |
+ src = (void *)__get_free_page(__GFP_RECLAIM |
__GFP_NOWARN |
__GFP_NORETRY);
if (src) {
@@ -676,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
if (!page) {
printk(KERN_ERR "PM: Failed to allocate LZO page\n");
ret = -ENOMEM;
@@ -979,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
last = tmp;
tmp->map = (struct swap_map_page *)
- __get_free_page(__GFP_WAIT | __GFP_HIGH);
+ __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
if (!tmp->map) {
release_swap_reader(handle);
return -ENOMEM;
@@ -1246,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
- __GFP_WAIT | __GFP_HIGH :
- __GFP_WAIT | __GFP_NOWARN |
- __GFP_NORETRY);
+ __GFP_RECLAIM | __GFP_HIGH :
+ __GFP_RECLAIM | __GFP_NOWARN |
+ __GFP_NORETRY);
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 019069c84ff6..1896386e16bb 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -17,6 +17,7 @@
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include "power.h"
@@ -83,7 +84,9 @@ static inline void decrement_wakelocks_number(void) {}
#define WL_GC_COUNT_MAX 100
#define WL_GC_TIME_SEC 300
+static void __wakelocks_gc(struct work_struct *work);
static LIST_HEAD(wakelocks_lru_list);
+static DECLARE_WORK(wakelock_work, __wakelocks_gc);
static unsigned int wakelocks_gc_count;
static inline void wakelocks_lru_add(struct wakelock *wl)
@@ -96,13 +99,12 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl)
list_move(&wl->lru, &wakelocks_lru_list);
}
-static void wakelocks_gc(void)
+static void __wakelocks_gc(struct work_struct *work)
{
struct wakelock *wl, *aux;
ktime_t now;
- if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
- return;
+ mutex_lock(&wakelocks_lock);
now = ktime_get();
list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
@@ -127,6 +129,16 @@ static void wakelocks_gc(void)
}
}
wakelocks_gc_count = 0;
+
+ mutex_unlock(&wakelocks_lock);
+}
+
+static void wakelocks_gc(void)
+{
+ if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
+ return;
+
+ schedule_work(&wakelock_work);
}
#else /* !CONFIG_PM_WAKELOCKS_GC */
static inline void wakelocks_lru_add(struct wakelock *wl) {}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c24203368..2ce8826f1053 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -269,6 +269,9 @@ static u32 clear_idx;
#define PREFIX_MAX 32
#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+#define LOG_LEVEL(v) ((v) & 0x07)
+#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
+
/* record buffer */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
#define LOG_ALIGN 4
@@ -517,6 +520,7 @@ int check_syslog_permissions(int type, int source)
ok:
return security_syslog(type);
}
+EXPORT_SYMBOL_GPL(check_syslog_permissions);
static void append_char(char **pp, char *e, char c)
{
@@ -611,7 +615,6 @@ struct devkmsg_user {
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
char *buf, *line;
- int i;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
size_t len = iov_iter_count(from);
@@ -641,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
line = buf;
if (line[0] == '<') {
char *endp = NULL;
+ unsigned int u;
- i = simple_strtoul(line+1, &endp, 10);
+ u = simple_strtoul(line + 1, &endp, 10);
if (endp && endp[0] == '>') {
- level = i & 7;
- if (i >> 3)
- facility = i >> 3;
+ level = LOG_LEVEL(u);
+ if (LOG_FACILITY(u) != 0)
+ facility = LOG_FACILITY(u);
endp++;
len -= endp - line;
line = endp;
@@ -835,7 +839,7 @@ const struct file_operations kmsg_fops = {
.release = devkmsg_release,
};
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* This appends the listed symbols to /proc/vmcore
*
diff --git a/kernel/profile.c b/kernel/profile.c
index a7bcd28d6e9f..99513e1160e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
node = cpu_to_mem(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -543,14 +543,14 @@ static int create_hash_tables(void)
int node = cpu_to_mem(cpu);
struct page *page;
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e050a36a..b760bae64cf1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
if (data & ~(unsigned long)PTRACE_O_MASK)
return -EINVAL;
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
+ !config_enabled(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
@@ -1003,6 +1016,11 @@ int ptrace_request(struct task_struct *child, long request,
break;
}
#endif
+
+ case PTRACE_SECCOMP_GET_FILTER:
+ ret = seccomp_get_filter(child, addr, datavp);
+ break;
+
default:
break;
}
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 50a808424b06..61a16569ffbf 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,4 @@
-obj-y += update.o
+obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59e32684c23b..d89328e260df 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -252,7 +252,7 @@ struct rcu_torture_ops {
void (*exp_sync)(void);
unsigned long (*get_state)(void);
void (*cond_sync)(unsigned long oldstate);
- void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+ call_rcu_func_t call;
void (*cb_barrier)(void);
void (*fqs)(void);
void (*stats)(void);
@@ -448,7 +448,7 @@ static void synchronize_rcu_busted(void)
}
static void
-call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+call_rcu_busted(struct rcu_head *head, rcu_callback_t func)
{
/* This is a deliberate bug for testing purposes only! */
func(head);
@@ -523,7 +523,7 @@ static void srcu_torture_synchronize(void)
}
static void srcu_torture_call(struct rcu_head *head,
- void (*func)(struct rcu_head *head))
+ rcu_callback_t func)
{
call_srcu(srcu_ctlp, head, func);
}
@@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = {
.deferred_free = rcu_sched_torture_deferred_free,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
+ .get_state = get_state_synchronize_sched,
+ .cond_sync = cond_synchronize_sched,
.call = call_rcu_sched,
.cb_barrier = rcu_barrier_sched,
.fqs = rcu_sched_force_quiescent_state,
@@ -684,10 +686,20 @@ static struct rcu_torture_ops tasks_ops = {
#define RCUTORTURE_TASKS_OPS &tasks_ops,
+static bool __maybe_unused torturing_tasks(void)
+{
+ return cur_ops == &tasks_ops;
+}
+
#else /* #ifdef CONFIG_TASKS_RCU */
#define RCUTORTURE_TASKS_OPS
+static bool __maybe_unused torturing_tasks(void)
+{
+ return false;
+}
+
#endif /* #else #ifdef CONFIG_TASKS_RCU */
/*
@@ -756,7 +768,6 @@ static int rcu_torture_boost(void *arg)
}
call_rcu_time = jiffies;
}
- cond_resched_rcu_qs();
stutter_wait("rcu_torture_boost");
if (torture_must_stop())
goto checkwait;
@@ -823,9 +834,7 @@ rcu_torture_cbflood(void *arg)
}
if (err) {
VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
- while (!torture_must_stop())
- schedule_timeout_interruptible(HZ);
- return 0;
+ goto wait_for_stop;
}
VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
do {
@@ -844,6 +853,7 @@ rcu_torture_cbflood(void *arg)
stutter_wait("rcu_torture_cbflood");
} while (!torture_must_stop());
vfree(rhp);
+wait_for_stop:
torture_kthread_stopping("rcu_torture_cbflood");
return 0;
}
@@ -1088,7 +1098,8 @@ static void rcu_torture_timer(unsigned long unused)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp));
+ srcu_read_lock_held(srcu_ctlp) ||
+ torturing_tasks());
if (p == NULL) {
/* Leave because rcu_torture_writer is not yet underway */
cur_ops->readunlock(idx);
@@ -1162,7 +1173,8 @@ rcu_torture_reader(void *arg)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp));
+ srcu_read_lock_held(srcu_ctlp) ||
+ torturing_tasks());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
cur_ops->readunlock(idx);
@@ -1195,7 +1207,6 @@ rcu_torture_reader(void *arg)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
- cond_resched_rcu_qs();
stutter_wait("rcu_torture_reader");
} while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable) {
@@ -1507,7 +1518,7 @@ static int rcu_torture_barrier_init(void)
int i;
int ret;
- if (n_barrier_cbs == 0)
+ if (n_barrier_cbs <= 0)
return 0;
if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
pr_alert("%s" TORTURE_FLAG
@@ -1729,15 +1740,15 @@ rcu_torture_init(void)
for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
pr_alert(" %s", torture_ops[i]->name);
pr_alert("\n");
- torture_init_end();
- return -EINVAL;
+ firsterr = -EINVAL;
+ goto unwind;
}
if (cur_ops->fqs == NULL && fqs_duration != 0) {
pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
if (cur_ops->init)
- cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+ cur_ops->init();
if (nreaders >= 0) {
nrealreaders = nreaders;
@@ -1786,12 +1797,15 @@ rcu_torture_init(void)
writer_task);
if (firsterr)
goto unwind;
- fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
- GFP_KERNEL);
- if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
+ if (nfakewriters > 0) {
+ fakewriter_tasks = kzalloc(nfakewriters *
+ sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
}
for (i = 0; i < nfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
@@ -1818,7 +1832,7 @@ rcu_torture_init(void)
if (firsterr)
goto unwind;
}
- if (test_no_idle_hz) {
+ if (test_no_idle_hz && shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval * HZ);
if (firsterr)
goto unwind;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index fb33d35ee0b7..a63a1ea5a41b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
}
/**
- * srcu_readers_active - returns approximate number of readers.
+ * srcu_readers_active - returns true if there are readers, and false
+ * otherwise.
* @sp: which srcu_struct to count active readers (holding srcu_read_lock).
*
* Note that this is not an atomic primitive, and can therefore suffer
* severe errors when invoked on an active srcu_struct. That said, it
* can be useful as an error check at cleanup time.
*/
-static int srcu_readers_active(struct srcu_struct *sp)
+static bool srcu_readers_active(struct srcu_struct *sp)
{
int cpu;
unsigned long sum = 0;
@@ -297,11 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
int idx;
idx = READ_ONCE(sp->completed) & 0x1;
- preempt_disable();
__this_cpu_inc(sp->per_cpu_ref->c[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
__this_cpu_inc(sp->per_cpu_ref->seq[idx]);
- preempt_enable();
return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -386,7 +385,7 @@ static void srcu_flip(struct srcu_struct *sp)
* srcu_struct structure.
*/
void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
- void (*func)(struct rcu_head *head))
+ rcu_callback_t func)
{
unsigned long flags;
@@ -414,11 +413,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
struct rcu_head *head = &rcu.head;
bool done = false;
- rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
- !lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
might_sleep();
init_completion(&rcu.completion);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
new file mode 100644
index 000000000000..be922c9f3d37
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,223 @@
+/*
+ * RCU-based infrastructure for lightweight reader-writer locking
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (c) 2015, Red Hat, Inc.
+ *
+ * Author: Oleg Nesterov <oleg@redhat.com>
+ */
+
+#include <linux/rcu_sync.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_PROVE_RCU
+#define __INIT_HELD(func) .held = func,
+#else
+#define __INIT_HELD(func)
+#endif
+
+static const struct {
+ void (*sync)(void);
+ void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+ void (*wait)(void);
+#ifdef CONFIG_PROVE_RCU
+ int (*held)(void);
+#endif
+} gp_ops[] = {
+ [RCU_SYNC] = {
+ .sync = synchronize_rcu,
+ .call = call_rcu,
+ .wait = rcu_barrier,
+ __INIT_HELD(rcu_read_lock_held)
+ },
+ [RCU_SCHED_SYNC] = {
+ .sync = synchronize_sched,
+ .call = call_rcu_sched,
+ .wait = rcu_barrier_sched,
+ __INIT_HELD(rcu_read_lock_sched_held)
+ },
+ [RCU_BH_SYNC] = {
+ .sync = synchronize_rcu_bh,
+ .call = call_rcu_bh,
+ .wait = rcu_barrier_bh,
+ __INIT_HELD(rcu_read_lock_bh_held)
+ },
+};
+
+enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
+enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+
+#define rss_lock gp_wait.lock
+
+#ifdef CONFIG_PROVE_RCU
+void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
+{
+ RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
+ "suspicious rcu_sync_is_idle() usage");
+}
+#endif
+
+/**
+ * rcu_sync_init() - Initialize an rcu_sync structure
+ * @rsp: Pointer to rcu_sync structure to be initialized
+ * @type: Flavor of RCU with which to synchronize rcu_sync structure
+ */
+void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
+{
+ memset(rsp, 0, sizeof(*rsp));
+ init_waitqueue_head(&rsp->gp_wait);
+ rsp->gp_type = type;
+}
+
+/**
+ * rcu_sync_enter() - Force readers onto slowpath
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who need readers to make use of
+ * a slowpath during the update. After this function returns, all
+ * subsequent calls to rcu_sync_is_idle() will return false, which
+ * tells readers to stay off their fastpaths. A later call to
+ * rcu_sync_exit() re-enables reader fastpaths.
+ *
+ * When called in isolation, rcu_sync_enter() must wait for a grace
+ * period, however, closely spaced calls to rcu_sync_enter() can
+ * optimize away the grace-period wait via a state machine implemented
+ * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
+ */
+void rcu_sync_enter(struct rcu_sync *rsp)
+{
+ bool need_wait, need_sync;
+
+ spin_lock_irq(&rsp->rss_lock);
+ need_wait = rsp->gp_count++;
+ need_sync = rsp->gp_state == GP_IDLE;
+ if (need_sync)
+ rsp->gp_state = GP_PENDING;
+ spin_unlock_irq(&rsp->rss_lock);
+
+ BUG_ON(need_wait && need_sync);
+
+ if (need_sync) {
+ gp_ops[rsp->gp_type].sync();
+ rsp->gp_state = GP_PASSED;
+ wake_up_all(&rsp->gp_wait);
+ } else if (need_wait) {
+ wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
+ } else {
+ /*
+ * Possible when there's a pending CB from a rcu_sync_exit().
+ * Nobody has yet been allowed the 'fast' path and thus we can
+ * avoid doing any sync(). The callback will get 'dropped'.
+ */
+ BUG_ON(rsp->gp_state != GP_PASSED);
+ }
+}
+
+/**
+ * rcu_sync_func() - Callback function managing reader access to fastpath
+ * @rcu: Pointer to the rcu_head (cb_head) embedded in the rcu_sync structure
+ *
+ * This function is passed to one of the call_rcu() functions by
+ * rcu_sync_exit(), so that it is invoked after a grace period following
+ * that invocation of rcu_sync_exit(). It takes action based on events that
+ * have taken place in the meantime, so that closely spaced rcu_sync_enter()
+ * and rcu_sync_exit() pairs need not wait for a grace period.
+ *
+ * If another rcu_sync_enter() is invoked before the grace period
+ * ended, reset state to allow the next rcu_sync_exit() to let the
+ * readers back onto their fastpaths (after a grace period). If both
+ * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked
+ * before the grace period ended, re-invoke call_rcu() on behalf of that
+ * rcu_sync_exit(). Otherwise, set all state back to idle so that readers
+ * can again use their fastpaths.
+ */
+static void rcu_sync_func(struct rcu_head *rcu)
+{
+ struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+ unsigned long flags;
+
+ BUG_ON(rsp->gp_state != GP_PASSED);
+ BUG_ON(rsp->cb_state == CB_IDLE);
+
+ spin_lock_irqsave(&rsp->rss_lock, flags);
+ if (rsp->gp_count) {
+ /*
+ * A new rcu_sync_enter() has happened; drop the callback.
+ */
+ rsp->cb_state = CB_IDLE;
+ } else if (rsp->cb_state == CB_REPLAY) {
+ /*
+ * A new rcu_sync_exit() has happened; requeue the callback
+ * to catch a later GP.
+ */
+ rsp->cb_state = CB_PENDING;
+ gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
+ } else {
+ /*
+ * We're at least a GP after rcu_sync_exit(); everybody will now
+ * have observed the write side critical section. Let 'em rip!
+ */
+ rsp->cb_state = CB_IDLE;
+ rsp->gp_state = GP_IDLE;
+ }
+ spin_unlock_irqrestore(&rsp->rss_lock, flags);
+}
+
+/**
+ * rcu_sync_exit() - Allow readers back onto fast path after grace period
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who have completed, and can therefore
+ * now allow readers to make use of their fastpaths after a grace period
+ * has elapsed. After this grace period has completed, all subsequent
+ * calls to rcu_sync_is_idle() will return true, which tells readers that
+ * they can once again use their fastpaths.
+ */
+void rcu_sync_exit(struct rcu_sync *rsp)
+{
+ spin_lock_irq(&rsp->rss_lock);
+ if (!--rsp->gp_count) {
+ if (rsp->cb_state == CB_IDLE) {
+ rsp->cb_state = CB_PENDING;
+ gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
+ } else if (rsp->cb_state == CB_PENDING) {
+ rsp->cb_state = CB_REPLAY;
+ }
+ }
+ spin_unlock_irq(&rsp->rss_lock);
+}
+
+/**
+ * rcu_sync_dtor() - Clean up an rcu_sync structure
+ * @rsp: Pointer to rcu_sync structure to be cleaned up
+ */
+void rcu_sync_dtor(struct rcu_sync *rsp)
+{
+ int cb_state;
+
+ BUG_ON(rsp->gp_count);
+
+ spin_lock_irq(&rsp->rss_lock);
+ if (rsp->cb_state == CB_REPLAY)
+ rsp->cb_state = CB_PENDING;
+ cb_state = rsp->cb_state;
+ spin_unlock_irq(&rsp->rss_lock);
+
+ if (cb_state != CB_IDLE) {
+ gp_ops[rsp->gp_type].wait();
+ BUG_ON(rsp->cb_state != CB_IDLE);
+ }
+}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c291bd65d2cb..944b1b491ed8 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -44,7 +44,7 @@ struct rcu_ctrlblk;
static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
static void rcu_process_callbacks(struct softirq_action *unused);
static void __call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu),
+ rcu_callback_t func,
struct rcu_ctrlblk *rcp);
#include "tiny_plugin.h"
@@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
void synchronize_sched(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU read-side critical section");
cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
@@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
* Helper function for call_rcu() and call_rcu_bh().
*/
static void __call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu),
+ rcu_callback_t func,
struct rcu_ctrlblk *rcp)
{
unsigned long flags;
@@ -229,7 +229,7 @@ static void __call_rcu(struct rcu_head *head,
* period. But since we have but one CPU, that would be after any
* quiescent state.
*/
-void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, &rcu_sched_ctrlblk);
}
@@ -239,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
* Post an RCU bottom-half callback to be invoked after any subsequent
* quiescent state.
*/
-void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, &rcu_bh_ctrlblk);
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 65137bc28b2b..f07343b54fe5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree");
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
/*
* In order to export the rcu_state name to the tracing tools, it
@@ -96,7 +97,7 @@ struct rcu_state sname##_state = { \
.level = { &sname##_state.node[0] }, \
.rda = &sname##_data, \
.call = cr, \
- .fqs_state = RCU_GP_IDLE, \
+ .gp_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \
.completed = 0UL - 300UL, \
.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
@@ -124,13 +125,8 @@ module_param(rcu_fanout_exact, bool, 0444);
static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
-static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
- NUM_RCU_LVL_0,
- NUM_RCU_LVL_1,
- NUM_RCU_LVL_2,
- NUM_RCU_LVL_3,
- NUM_RCU_LVL_4,
-};
+/* Number of rcu_nodes at specified level. */
+static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/*
@@ -164,6 +160,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+static void rcu_report_exp_rdp(struct rcu_state *rsp,
+ struct rcu_data *rdp, bool wake);
/* rcuc/rcub kthread realtime priority */
#ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -248,21 +246,33 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
*/
void rcu_sched_qs(void)
{
- if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
+ unsigned long flags;
+
+ if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_sched"),
__this_cpu_read(rcu_sched_data.gpnum),
TPS("cpuqs"));
- __this_cpu_write(rcu_sched_data.passed_quiesce, 1);
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
+ if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+ return;
+ local_irq_save(flags);
+ if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(&rcu_sched_data),
+ true);
+ }
+ local_irq_restore(flags);
}
}
void rcu_bh_qs(void)
{
- if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
+ if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_bh"),
__this_cpu_read(rcu_bh_data.gpnum),
TPS("cpuqs"));
- __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
+ __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
}
}
@@ -340,12 +350,14 @@ static void rcu_momentary_dyntick_idle(void)
*/
void rcu_note_context_switch(void)
{
+ barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs();
rcu_preempt_note_context_switch();
if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
rcu_momentary_dyntick_idle();
trace_rcu_utilization(TPS("End context switch"));
+ barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -356,12 +368,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
* RCU flavors in desperate need of a quiescent state, which will normally
* be none of them). Either way, do a lightweight quiescent state for
* all RCU flavors.
+ *
+ * The barrier() calls are redundant in the common case when this is
+ * called externally, but are kept just in case this is ever called
+ * from within this file.
+ *
*/
void rcu_all_qs(void)
{
+ barrier(); /* Avoid RCU read-side critical sections leaking down. */
if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
rcu_momentary_dyntick_idle();
this_cpu_inc(rcu_qs_ctr);
+ barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_all_qs);
@@ -649,12 +668,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
* It is illegal to enter an extended quiescent state while
* in an RCU read-side critical section.
*/
- rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
- "Illegal idle entry in RCU read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
- "Illegal idle entry in RCU-bh read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
- "Illegal idle entry in RCU-sched read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
+ "Illegal idle entry in RCU read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
+ "Illegal idle entry in RCU-bh read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
+ "Illegal idle entry in RCU-sched read-side critical section.");
}
/*
@@ -701,7 +720,7 @@ void rcu_idle_enter(void)
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
/**
* rcu_user_enter - inform RCU that we are resuming userspace.
*
@@ -714,7 +733,7 @@ void rcu_user_enter(void)
{
rcu_eqs_enter(1);
}
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
/**
* rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -828,7 +847,7 @@ void rcu_idle_exit(void)
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
/**
* rcu_user_exit - inform RCU that we are exiting userspace.
*
@@ -839,7 +858,7 @@ void rcu_user_exit(void)
{
rcu_eqs_exit(1);
}
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
/**
* rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -978,9 +997,9 @@ bool notrace rcu_is_watching(void)
{
bool ret;
- preempt_disable();
+ preempt_disable_notrace();
ret = __rcu_is_watching();
- preempt_enable();
+ preempt_enable_notrace();
return ret;
}
EXPORT_SYMBOL_GPL(rcu_is_watching);
@@ -1178,9 +1197,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
rsp->name, j - gpa,
- rsp->gpnum, rsp->completed, rsp->gp_flags);
+ rsp->gpnum, rsp->completed,
+ rsp->gp_flags, rsp->gp_state,
+ rsp->gp_kthread ? rsp->gp_kthread->state : 0);
}
/*
@@ -1745,9 +1766,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
*/
rdp->gpnum = rnp->gpnum;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
- rdp->passed_quiesce = 0;
+ rdp->cpu_no_qs.b.norm = true;
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
- rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
+ rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
}
@@ -1906,18 +1927,37 @@ static int rcu_gp_init(struct rcu_state *rsp)
}
/*
+ * Helper function for wait_event_interruptible_timeout() wakeup
+ * at force-quiescent-state time.
+ */
+static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Someone like call_rcu() requested a force-quiescent-state scan. */
+ *gfp = READ_ONCE(rsp->gp_flags);
+ if (*gfp & RCU_GP_FLAG_FQS)
+ return true;
+
+ /* The current grace period has completed. */
+ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+ return true;
+
+ return false;
+}
+
+/*
* Do one round of quiescent-state forcing.
*/
-static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
+static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
{
- int fqs_state = fqs_state_in;
bool isidle = false;
unsigned long maxj;
struct rcu_node *rnp = rcu_get_root(rsp);
WRITE_ONCE(rsp->gp_activity, jiffies);
rsp->n_force_qs++;
- if (fqs_state == RCU_SAVE_DYNTICK) {
+ if (first_time) {
/* Collect dyntick-idle snapshots. */
if (is_sysidle_rcu_state(rsp)) {
isidle = true;
@@ -1926,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
force_qs_rnp(rsp, dyntick_save_progress_counter,
&isidle, &maxj);
rcu_sysidle_report_gp(rsp, isidle, maxj);
- fqs_state = RCU_FORCE_QS;
} else {
/* Handle dyntick-idle and offline CPUs. */
isidle = true;
@@ -1940,7 +1979,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq(&rnp->lock);
}
- return fqs_state;
}
/*
@@ -2004,7 +2042,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
/* Declare grace period done. */
WRITE_ONCE(rsp->completed, rsp->gpnum);
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
- rsp->fqs_state = RCU_GP_IDLE;
+ rsp->gp_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
/* Advance CBs to reduce false positives below. */
needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
@@ -2022,7 +2060,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
*/
static int __noreturn rcu_gp_kthread(void *arg)
{
- int fqs_state;
+ bool first_gp_fqs;
int gf;
unsigned long j;
int ret;
@@ -2041,6 +2079,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
wait_event_interruptible(rsp->gp_wq,
READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
+ rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
@@ -2053,7 +2092,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
/* Handle quiescent-state forcing. */
- fqs_state = RCU_SAVE_DYNTICK;
+ first_gp_fqs = true;
j = jiffies_till_first_fqs;
if (j > HZ) {
j = HZ;
@@ -2068,11 +2107,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
- ((gf = READ_ONCE(rsp->gp_flags)) &
- RCU_GP_FLAG_FQS) ||
- (!READ_ONCE(rnp->qsmask) &&
- !rcu_preempt_blocked_readers_cgp(rnp)),
- j);
+ rcu_gp_fqs_check_wake(rsp, &gf), j);
+ rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
if (!READ_ONCE(rnp->qsmask) &&
@@ -2084,7 +2120,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum),
TPS("fqsstart"));
- fqs_state = rcu_gp_fqs(rsp, fqs_state);
+ rcu_gp_fqs(rsp, first_gp_fqs);
+ first_gp_fqs = false;
trace_rcu_grace_period(rsp->name,
READ_ONCE(rsp->gpnum),
TPS("fqsend"));
@@ -2110,7 +2147,9 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
/* Handle grace-period end. */
+ rsp->gp_state = RCU_GP_CLEANUP;
rcu_gp_cleanup(rsp);
+ rsp->gp_state = RCU_GP_CLEANED;
}
}
@@ -2318,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
- if ((rdp->passed_quiesce == 0 &&
+ if ((rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
rdp->gpwrap) {
@@ -2329,7 +2368,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
* We will instead need a new quiescent state that lies
* within the current grace period.
*/
- rdp->passed_quiesce = 0; /* need qs for new gp. */
+ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
@@ -2338,7 +2377,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
} else {
- rdp->qs_pending = 0;
+ rdp->core_needs_qs = 0;
/*
* This GP can't end until cpu checks in, so all of our
@@ -2369,14 +2408,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Does this CPU still need to do its part for current grace period?
* If no, return and let the other CPUs do their part as well.
*/
- if (!rdp->qs_pending)
+ if (!rdp->core_needs_qs)
return;
/*
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
- if (!rdp->passed_quiesce &&
+ if (rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
return;
@@ -2998,7 +3037,7 @@ static void rcu_leak_callback(struct rcu_head *rhp)
* is expected to specify a CPU.
*/
static void
-__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+__call_rcu(struct rcu_head *head, rcu_callback_t func,
struct rcu_state *rsp, int cpu, bool lazy)
{
unsigned long flags;
@@ -3069,7 +3108,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
/*
* Queue an RCU-sched callback for invocation after a grace period.
*/
-void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_sched(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, &rcu_sched_state, -1, 0);
}
@@ -3078,7 +3117,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
/*
* Queue an RCU callback for invocation after a quicker grace period.
*/
-void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, &rcu_bh_state, -1, 0);
}
@@ -3092,7 +3131,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
* function may only be called from __kfree_rcu().
*/
void kfree_call_rcu(struct rcu_head *head,
- void (*func)(struct rcu_head *rcu))
+ rcu_callback_t func)
{
__call_rcu(head, func, rcu_state_p, -1, 1);
}
@@ -3161,10 +3200,10 @@ static inline int rcu_blocking_is_gp(void)
*/
void synchronize_sched(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU-sched read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU-sched read-side critical section");
if (rcu_blocking_is_gp())
return;
if (rcu_gp_is_expedited())
@@ -3188,10 +3227,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
*/
void synchronize_rcu_bh(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
if (rcu_blocking_is_gp())
return;
if (rcu_gp_is_expedited())
@@ -3253,190 +3292,574 @@ void cond_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
-static int synchronize_sched_expedited_cpu_stop(void *data)
+/**
+ * get_state_synchronize_sched - Snapshot current RCU-sched state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_sched()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_sched(void)
{
/*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
+ * Any prior manipulation of RCU-protected data must happen
+ * before the load from ->gpnum.
*/
- smp_mb(); /* See above comment block. */
- return 0;
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Make sure this load happens before the purportedly
+ * time-consuming work between get_state_synchronize_sched()
+ * and cond_synchronize_sched().
+ */
+ return smp_load_acquire(&rcu_sched_state.gpnum);
}
+EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
/**
- * synchronize_sched_expedited - Brute-force RCU-sched grace period
- *
- * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code. In fact,
- * if you are using synchronize_sched_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_sched() instead.
+ * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
*
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word. Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs. If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period. We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
+ * @oldstate: return value from earlier call to get_state_synchronize_sched()
*
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
+ * If a full RCU-sched grace period has elapsed since the earlier call to
+ * get_state_synchronize_sched(), just return. Otherwise, invoke
+ * synchronize_sched() to wait for a full grace period.
*
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * Yes, this function does not take counter wrap into account. But
+ * counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
*/
-void synchronize_sched_expedited(void)
+void cond_synchronize_sched(unsigned long oldstate)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
- struct rcu_state *rsp = &rcu_sched_state;
+ unsigned long newstate;
/*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
+ * Ensure that this load happens before any RCU-destructive
+ * actions the caller might carry out after we return.
*/
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
+ newstate = smp_load_acquire(&rcu_sched_state.completed);
+ if (ULONG_CMP_GE(oldstate, newstate))
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_sched);
+
+/* Adjust sequence number for start of update-side operation. */
+static void rcu_seq_start(unsigned long *sp)
+{
+ WRITE_ONCE(*sp, *sp + 1);
+ smp_mb(); /* Ensure update-side operation after counter increment. */
+ WARN_ON_ONCE(!(*sp & 0x1));
+}
+
+/* Adjust sequence number for end of update-side operation. */
+static void rcu_seq_end(unsigned long *sp)
+{
+ smp_mb(); /* Ensure update-side operation before counter increment. */
+ WRITE_ONCE(*sp, *sp + 1);
+ WARN_ON_ONCE(*sp & 0x1);
+}
+
+/* Take a snapshot of the update side's sequence number. */
+static unsigned long rcu_seq_snap(unsigned long *sp)
+{
+ unsigned long s;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = (READ_ONCE(*sp) + 3) & ~0x1;
+ smp_mb(); /* Above access must not bleed into critical section. */
+ return s;
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred.
+ */
+static bool rcu_seq_done(unsigned long *sp, unsigned long s)
+{
+ return ULONG_CMP_GE(READ_ONCE(*sp), s);
+}
+
+/* Wrapper functions for expedited grace periods. */
+static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
+{
+ rcu_seq_start(&rsp->expedited_sequence);
+}
+static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
+{
+ rcu_seq_end(&rsp->expedited_sequence);
+ smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
+{
+ return rcu_seq_snap(&rsp->expedited_sequence);
+}
+static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
+{
+ return rcu_seq_done(&rsp->expedited_sequence, s);
+}
+
+/*
+ * Reset the ->expmaskinit values in the rcu_node tree to reflect any
+ * recent CPU-online activity. Note that these masks are not cleared
+ * when CPUs go offline, so they reflect the union of all CPUs that have
+ * ever been online. This means that this function normally takes its
+ * no-work-to-do fastpath.
+ */
+static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
+{
+ bool done;
+ unsigned long flags;
+ unsigned long mask;
+ unsigned long oldmask;
+ int ncpus = READ_ONCE(rsp->ncpus);
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_up;
+
+ /* If no new CPUs onlined since last time, nothing to do. */
+ if (likely(ncpus == rsp->ncpus_snap))
return;
- }
+ rsp->ncpus_snap = ncpus;
/*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
+ * Each pass through the following loop propagates newly onlined
+ * CPUs for the current rcu_node structure up the rcu_node tree.
*/
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, fall back to normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- return;
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (rnp->expmaskinit == rnp->expmaskinitnext) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ continue; /* No new CPUs, nothing to do. */
+ }
+
+ /* Update this node's mask, track old value for propagation. */
+ oldmask = rnp->expmaskinit;
+ rnp->expmaskinit = rnp->expmaskinitnext;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+ /* If the old mask was already nonzero, nothing to propagate. */
+ if (oldmask)
+ continue;
+
+ /* Propagate the new CPU up the tree. */
+ mask = rnp->grpmask;
+ rnp_up = rnp->parent;
+ done = false;
+ while (rnp_up) {
+ raw_spin_lock_irqsave(&rnp_up->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (rnp_up->expmaskinit)
+ done = true;
+ rnp_up->expmaskinit |= mask;
+ raw_spin_unlock_irqrestore(&rnp_up->lock, flags);
+ if (done)
+ break;
+ mask = rnp_up->grpmask;
+ rnp_up = rnp_up->parent;
+ }
}
- WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
-
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+}
+
+/*
+ * Reset the ->expmask values in the rcu_node tree in preparation for
+ * a new expedited grace period.
+ */
+static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_node *rnp;
+
+ sync_exp_reset_tree_hotplug(rsp);
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ WARN_ON_ONCE(rnp->expmask);
+ rnp->expmask = rnp->expmaskinit;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+}
+
+/*
+ * Return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period. Works only for preemptible
+ * RCU -- other RCU implementations use other means.
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+ return rnp->exp_tasks == NULL &&
+ READ_ONCE(rnp->expmask) == 0;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period. This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree. (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex and the
+ * specified rcu_node structure's ->lock.
+ */
+static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake, unsigned long flags)
+ __releases(rnp->lock)
+{
+ unsigned long mask;
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
+ for (;;) {
+ if (!sync_rcu_preempt_exp_done(rnp)) {
+ if (!rnp->expmask)
+ rcu_initiate_boost(rnp, flags);
+ else
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ break;
}
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
+ if (rnp->parent == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ if (wake) {
+ smp_mb(); /* EGP done before wake_up(). */
+ wake_up(&rsp->expedited_wq);
+ }
+ break;
+ }
+ mask = rnp->grpmask;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+ rnp = rnp->parent;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled */
+ smp_mb__after_unlock_lock();
+ WARN_ON_ONCE(!(rnp->expmask & mask));
+ rnp->expmask &= ~mask;
+ }
+}
+
+/*
+ * Report expedited quiescent state for specified node. This is a
+ * lock-acquisition wrapper function for __rcu_report_exp_rnp().
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
+ struct rcu_node *rnp, bool wake)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ __rcu_report_exp_rnp(rsp, rnp, wake, flags);
+}
+
+/*
+ * Report expedited quiescent state for multiple CPUs, all covered by the
+ * specified leaf rcu_node structure. Caller must hold the root
+ * rcu_node's exp_funnel_mutex.
+ */
+static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
+ unsigned long mask, bool wake)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+ if (!(rnp->expmask & mask)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ rnp->expmask &= ~mask;
+ __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
+}
+
+/*
+ * Report expedited quiescent state for specified rcu_data (CPU).
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
+ bool wake)
+{
+ rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
+}
+
+/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
+static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp,
+ atomic_long_t *stat, unsigned long s)
+{
+ if (rcu_exp_gp_seq_done(rsp, s)) {
+ if (rnp)
+ mutex_unlock(&rnp->exp_funnel_mutex);
+ else if (rdp)
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ /* Ensure test happens before caller kfree(). */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(stat);
+ return true;
}
+ return false;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods. Returns a
+ * pointer to the root rcu_node structure, or NULL if some other
+ * task did the expedited grace period for us.
+ */
+static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp0;
+ struct rcu_node *rnp1 = NULL;
/*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
+ * First try directly acquiring the root lock in order to reduce
+ * latency in the common case where expedited grace periods are
+ * rare. We check mutex_is_locked() to avoid pathological levels of
+ * memory contention on ->exp_funnel_mutex in the heavy-load case.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
+ rnp0 = rcu_get_root(rsp);
+ if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
+ if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
+ if (sync_exp_work_done(rsp, rnp0, NULL,
+ &rsp->expedited_workdone0, s))
+ return NULL;
+ return rnp0;
}
+ }
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /*
+ * Each pass through the following loop works its way
+ * up the rcu_node tree, returning if others have done the
+ * work; otherwise, the loop falls through holding the root rnp's
+ * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
+ * can be inexact, as it is just promoting locality and is not
+ * strictly needed for correctness.
+ */
+ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+ if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
+ return NULL;
+ mutex_lock(&rdp->exp_funnel_mutex);
+ rnp0 = rdp->mynode;
+ for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone2, s))
+ return NULL;
+ mutex_lock(&rnp0->exp_funnel_mutex);
+ if (rnp1)
+ mutex_unlock(&rnp1->exp_funnel_mutex);
+ else
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ rnp1 = rnp0;
+ }
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone3, s))
+ return NULL;
+ return rnp1;
+}
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
+/* Invoked on each online non-idle CPU for expedited quiescent state. */
+static void sync_sched_exp_handler(void *data)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp = data;
+
+ rdp = this_cpu_ptr(rsp->rda);
+ rnp = rdp->mynode;
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+ __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+ return;
+ __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
+ resched_cpu(smp_processor_id());
+}
+
+/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
+static void sync_sched_exp_online_cleanup(int cpu)
+{
+ struct rcu_data *rdp;
+ int ret;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp = &rcu_sched_state;
+
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
+ return;
+ ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
+ WARN_ON_ONCE(ret);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+ smp_call_func_t func)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long mask;
+ unsigned long mask_ofl_test;
+ unsigned long mask_ofl_ipi;
+ int ret;
+ struct rcu_node *rnp;
+
+ sync_exp_reset_tree(rsp);
+ rcu_for_each_leaf_node(rsp, rnp) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ smp_mb__after_unlock_lock();
+
+ /* Each pass checks a CPU for identity, offline, and idle. */
+ mask_ofl_test = 0;
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ mask_ofl_test |= rdp->grpmask;
}
+ mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
/*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
+ * Need to wait for any blocked tasks as well. Note that
+ * additional blocking tasks will also block the expedited
+ * GP until such time as the ->expmask bits are cleared.
*/
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
+ if (rcu_preempt_has_tasks(rnp))
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+ /* IPI the remaining CPUs for expedited quiescent state. */
+ mask = 1;
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+ if (!(mask_ofl_ipi & mask))
+ continue;
+retry_ipi:
+ ret = smp_call_function_single(cpu, func, rsp, 0);
+ if (!ret) {
+ mask_ofl_ipi &= ~mask;
+ } else {
+ /* Failed, raced with offline. */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (cpu_online(cpu) &&
+ (rnp->expmask & mask)) {
+ raw_spin_unlock_irqrestore(&rnp->lock,
+ flags);
+ schedule_timeout_uninterruptible(1);
+ if (cpu_online(cpu) &&
+ (rnp->expmask & mask))
+ goto retry_ipi;
+ raw_spin_lock_irqsave(&rnp->lock,
+ flags);
+ }
+ if (!(rnp->expmask & mask))
+ mask_ofl_ipi &= ~mask;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
}
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* Report quiescent states for those that went offline. */
+ mask_ofl_test |= mask_ofl_ipi;
+ if (mask_ofl_test)
+ rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
+}
-all_cpus_idle:
- free_cpumask_var(cm);
+static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long jiffies_stall;
+ unsigned long jiffies_start;
+ unsigned long mask;
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_root = rcu_get_root(rsp);
+ int ret;
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
+ jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_start = jiffies;
+
+ for (;;) {
+ ret = wait_event_interruptible_timeout(
+ rsp->expedited_wq,
+ sync_rcu_preempt_exp_done(rnp_root),
+ jiffies_stall);
+ if (ret > 0)
+ return;
+ if (ret < 0) {
+ /* Hit a signal, disable CPU stall warnings. */
+ wait_event(rsp->expedited_wq,
+ sync_rcu_preempt_exp_done(rnp_root));
+ return;
+ }
+ pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
+ rsp->name);
+ rcu_for_each_leaf_node(rsp, rnp) {
+ (void)rcu_print_task_exp_stall(rnp);
+ mask = 1;
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+ struct rcu_data *rdp;
+
+ if (!(rnp->expmask & mask))
+ continue;
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ pr_cont(" %d-%c%c%c", cpu,
+ "O."[cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rnp->expmaskinit)],
+ "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+ }
+ mask <<= 1;
}
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+ pr_cont(" } %lu jiffies s: %lu\n",
+ jiffies - jiffies_start, rsp->expedited_sequence);
+ rcu_for_each_leaf_node(rsp, rnp) {
+ mask = 1;
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+ if (!(rnp->expmask & mask))
+ continue;
+ dump_cpu_task(cpu);
+ }
+ }
+ jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ }
+}
- put_online_cpus();
+/**
+ * synchronize_sched_expedited - Brute-force RCU-sched grace period
+ *
+ * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
+ * approach to force the grace period to end quickly. This consumes
+ * significant time on all CPUs and is unfriendly to real-time workloads,
+ * so is thus not recommended for any sort of common-case code. In fact,
+ * if you are using synchronize_sched_expedited() in a loop, please
+ * restructure your code to batch your updates, and then use a single
+ * synchronize_sched() instead.
+ *
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
+ */
+void synchronize_sched_expedited(void)
+{
+ unsigned long s;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp = &rcu_sched_state;
+
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap(rsp);
+
+ rnp = exp_funnel_lock(rsp, s);
+ if (rnp == NULL)
+ return; /* Someone else did our work for us. */
+
+ rcu_exp_gp_seq_start(rsp);
+ sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
+ synchronize_sched_expedited_wait(rsp);
+
+ rcu_exp_gp_seq_end(rsp);
+ mutex_unlock(&rnp->exp_funnel_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
@@ -3462,11 +3885,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active &&
- rdp->qs_pending && !rdp->passed_quiesce &&
+ rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
- rdp->n_rp_qs_pending++;
- } else if (rdp->qs_pending &&
- (rdp->passed_quiesce ||
+ rdp->n_rp_core_needs_qs++;
+ } else if (rdp->core_needs_qs &&
+ (!rdp->cpu_no_qs.b.norm ||
rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
rdp->n_rp_report_qs++;
return 1;
@@ -3571,10 +3994,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
}
}
@@ -3586,7 +4009,7 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
atomic_inc(&rsp->barrier_cpu_count);
rsp->call(&rdp->barrier_head, rcu_barrier_callback);
}
@@ -3599,55 +4022,24 @@ static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
struct rcu_data *rdp;
- unsigned long snap = READ_ONCE(rsp->n_barrier_done);
- unsigned long snap_done;
+ unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, snap);
+ _rcu_barrier_trace(rsp, "Begin", -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rsp->barrier_mutex);
- /*
- * Ensure that all prior references, including to ->n_barrier_done,
- * are ordered before the _rcu_barrier() machinery.
- */
- smp_mb(); /* See above block comment. */
-
- /*
- * Recheck ->n_barrier_done to see if others did our work for us.
- * This means checking ->n_barrier_done for an even-to-odd-to-even
- * transition. The "if" expression below therefore rounds the old
- * value up to the next even number and adds two before comparing.
- */
- snap_done = rsp->n_barrier_done;
- _rcu_barrier_trace(rsp, "Check", -1, snap_done);
-
- /*
- * If the value in snap is odd, we needed to wait for the current
- * rcu_barrier() to complete, then wait for the next one, in other
- * words, we need the value of snap_done to be three larger than
- * the value of snap. On the other hand, if the value in snap is
- * even, we only had to wait for the next rcu_barrier() to complete,
- * in other words, we need the value of snap_done to be only two
- * greater than the value of snap. The "(snap + 3) & ~0x1" computes
- * this for us (thank you, Linus!).
- */
- if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+ /* Did someone else do our work for us? */
+ if (rcu_seq_done(&rsp->barrier_sequence, s)) {
+ _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
return;
}
- /*
- * Increment ->n_barrier_done to avoid duplicate work. Use
- * WRITE_ONCE() to prevent the compiler from speculating
- * the increment to precede the early-exit check.
- */
- WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
- smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
+ /* Mark the start of the barrier operation. */
+ rcu_seq_start(&rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
/*
* Initialize the count to one rather than to zero in order to
@@ -3671,10 +4063,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
if (rcu_is_nocb_cpu(cpu)) {
if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
_rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
} else {
_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
__call_rcu(&rdp->barrier_head,
@@ -3682,11 +4074,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
}
} else if (READ_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
} else {
_rcu_barrier_trace(rsp, "OnlineNQ", cpu,
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
}
}
put_online_cpus();
@@ -3698,16 +4090,13 @@ static void _rcu_barrier(struct rcu_state *rsp)
if (atomic_dec_and_test(&rsp->barrier_cpu_count))
complete(&rsp->barrier_completion);
- /* Increment ->n_barrier_done to prevent duplicate work. */
- smp_mb(); /* Keep increment after above mechanism. */
- WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
- WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
- smp_mb(); /* Keep increment before caller's subsequent code. */
-
/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
wait_for_completion(&rsp->barrier_completion);
+ /* Mark the end of the barrier operation. */
+ _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+ rcu_seq_end(&rsp->barrier_sequence);
+
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rsp->barrier_mutex);
}
@@ -3770,6 +4159,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
rdp->cpu = cpu;
rdp->rsp = rsp;
+ mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -3790,7 +4180,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
- rdp->beenonline = 1; /* We have now been online. */
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
@@ -3812,11 +4201,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
smp_mb__after_unlock_lock();
rnp->qsmaskinitnext |= mask;
+ rnp->expmaskinitnext |= mask;
+ if (!rdp->beenonline)
+ WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
+ rdp->beenonline = true; /* We have now been online. */
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
- rdp->passed_quiesce = false;
+ rdp->cpu_no_qs.b.norm = true;
rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
- rdp->qs_pending = false;
+ rdp->core_needs_qs = false;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -3849,6 +4242,7 @@ int rcu_cpu_notify(struct notifier_block *self,
break;
case CPU_ONLINE:
case CPU_DOWN_FAILED:
+ sync_sched_exp_online_cleanup(cpu);
rcu_boost_kthread_setaffinity(rnp, -1);
break;
case CPU_DOWN_PREPARE:
@@ -3860,6 +4254,12 @@ int rcu_cpu_notify(struct notifier_block *self,
rcu_cleanup_dying_cpu(rsp);
break;
case CPU_DYING_IDLE:
+ /* QS for any half-done expedited RCU-sched GP. */
+ preempt_disable();
+ rcu_report_exp_rdp(&rcu_sched_state,
+ this_cpu_ptr(rcu_sched_state.rda), true);
+ preempt_enable();
+
for_each_rcu_flavor(rsp) {
rcu_cleanup_dying_idle_cpu(cpu, rsp);
}
@@ -3961,22 +4361,22 @@ void rcu_scheduler_starting(void)
* Compute the per-level fanout, either using the exact fanout specified
* or balancing the tree, depending on the rcu_fanout_exact boot parameter.
*/
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
+static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
{
int i;
if (rcu_fanout_exact) {
- rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+ levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
for (i = rcu_num_lvls - 2; i >= 0; i--)
- rsp->levelspread[i] = RCU_FANOUT;
+ levelspread[i] = RCU_FANOUT;
} else {
int ccur;
int cprv;
cprv = nr_cpu_ids;
for (i = rcu_num_lvls - 1; i >= 0; i--) {
- ccur = rsp->levelcnt[i];
- rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+ ccur = levelcnt[i];
+ levelspread[i] = (cprv + ccur - 1) / ccur;
cprv = ccur;
}
}
@@ -3988,23 +4388,19 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static const char * const buf[] = {
- "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static const char * const fqs[] = {
- "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const buf[] = RCU_NODE_NAME_INIT;
+ static const char * const fqs[] = RCU_FQS_NAME_INIT;
+ static const char * const exp[] = RCU_EXP_NAME_INIT;
static u8 fl_mask = 0x1;
+
+ int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
+ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
int cpustride = 1;
int i;
int j;
struct rcu_node *rnp;
- BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
+ BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
/* Silence gcc 4.8 false positive about array index out of range. */
if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
@@ -4013,19 +4409,19 @@ static void __init rcu_init_one(struct rcu_state *rsp,
/* Initialize the level-tracking arrays. */
for (i = 0; i < rcu_num_lvls; i++)
- rsp->levelcnt[i] = num_rcu_lvl[i];
+ levelcnt[i] = num_rcu_lvl[i];
for (i = 1; i < rcu_num_lvls; i++)
- rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
- rcu_init_levelspread(rsp);
+ rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
+ rcu_init_levelspread(levelspread, levelcnt);
rsp->flavor_mask = fl_mask;
fl_mask <<= 1;
/* Initialize the elements themselves, starting from the leaves. */
for (i = rcu_num_lvls - 1; i >= 0; i--) {
- cpustride *= rsp->levelspread[i];
+ cpustride *= levelspread[i];
rnp = rsp->level[i];
- for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+ for (j = 0; j < levelcnt[i]; j++, rnp++) {
raw_spin_lock_init(&rnp->lock);
lockdep_set_class_and_name(&rnp->lock,
&rcu_node_class[i], buf[i]);
@@ -4045,18 +4441,22 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->grpmask = 0;
rnp->parent = NULL;
} else {
- rnp->grpnum = j % rsp->levelspread[i - 1];
+ rnp->grpnum = j % levelspread[i - 1];
rnp->grpmask = 1UL << rnp->grpnum;
rnp->parent = rsp->level[i - 1] +
- j / rsp->levelspread[i - 1];
+ j / levelspread[i - 1];
}
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
+ mutex_init(&rnp->exp_funnel_mutex);
+ lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
+ &rcu_exp_class[i], exp[i]);
}
}
init_waitqueue_head(&rsp->gp_wq);
+ init_waitqueue_head(&rsp->expedited_wq);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
@@ -4076,9 +4476,7 @@ static void __init rcu_init_geometry(void)
{
ulong d;
int i;
- int j;
- int n = nr_cpu_ids;
- int rcu_capacity[MAX_RCU_LVLS + 1];
+ int rcu_capacity[RCU_NUM_LVLS];
/*
* Initialize any unspecified boot parameters.
@@ -4101,47 +4499,51 @@ static void __init rcu_init_geometry(void)
rcu_fanout_leaf, nr_cpu_ids);
/*
+ * The boot-time rcu_fanout_leaf parameter must be at least two
+ * and cannot exceed the number of bits in the rcu_node masks.
+ * Complain and fall back to the compile-time values if either
+ * limit is violated.
+ */
+ if (rcu_fanout_leaf < 2 ||
+ rcu_fanout_leaf > sizeof(unsigned long) * 8) {
+ rcu_fanout_leaf = RCU_FANOUT_LEAF;
+ WARN_ON(1);
+ return;
+ }
+
+ /*
* Compute number of nodes that can be handled an rcu_node tree
- * with the given number of levels. Setting rcu_capacity[0] makes
- * some of the arithmetic easier.
+ * with the given number of levels.
*/
- rcu_capacity[0] = 1;
- rcu_capacity[1] = rcu_fanout_leaf;
- for (i = 2; i <= MAX_RCU_LVLS; i++)
+ rcu_capacity[0] = rcu_fanout_leaf;
+ for (i = 1; i < RCU_NUM_LVLS; i++)
rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
/*
- * The boot-time rcu_fanout_leaf parameter is only permitted
- * to increase the leaf-level fanout, not decrease it. Of course,
- * the leaf-level fanout cannot exceed the number of bits in
- * the rcu_node masks. Finally, the tree must be able to accommodate
- * the configured number of CPUs. Complain and fall back to the
- * compile-time values if these limits are exceeded.
+ * The tree must be able to accommodate the configured number of CPUs.
+ * If this limit is exceeded, fall back to the compile-time values.
*/
- if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
- rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
- n > rcu_capacity[MAX_RCU_LVLS]) {
+ if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
+ rcu_fanout_leaf = RCU_FANOUT_LEAF;
WARN_ON(1);
return;
}
+ /* Calculate the number of levels in the tree. */
+ for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
+ }
+ rcu_num_lvls = i + 1;
+
/* Calculate the number of rcu_nodes at each level of the tree. */
- for (i = 1; i <= MAX_RCU_LVLS; i++)
- if (n <= rcu_capacity[i]) {
- for (j = 0; j <= i; j++)
- num_rcu_lvl[j] =
- DIV_ROUND_UP(n, rcu_capacity[i - j]);
- rcu_num_lvls = i;
- for (j = i + 1; j <= MAX_RCU_LVLS; j++)
- num_rcu_lvl[j] = 0;
- break;
- }
+ for (i = 0; i < rcu_num_lvls; i++) {
+ int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
+ num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
+ }
/* Calculate the total number of rcu_node structures. */
rcu_num_nodes = 0;
- for (i = 0; i <= MAX_RCU_LVLS; i++)
+ for (i = 0; i < rcu_num_lvls; i++)
rcu_num_nodes += num_rcu_lvl[i];
- rcu_num_nodes -= n;
}
/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4adb7ca0bf47..9fb4e238d4dc 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/stop_machine.h>
/*
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -36,8 +37,6 @@
* Of course, your mileage may vary.
*/
-#define MAX_RCU_LVLS 4
-
#ifdef CONFIG_RCU_FANOUT
#define RCU_FANOUT CONFIG_RCU_FANOUT
#else /* #ifdef CONFIG_RCU_FANOUT */
@@ -66,38 +65,45 @@
#if NR_CPUS <= RCU_FANOUT_1
# define RCU_NUM_LVLS 1
# define NUM_RCU_LVL_0 1
-# define NUM_RCU_LVL_1 (NR_CPUS)
-# define NUM_RCU_LVL_2 0
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES NUM_RCU_LVL_0
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
#elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_2 (NR_CPUS)
-# define NUM_RCU_LVL_3 0
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
#elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_3 (NR_CPUS)
-# define NUM_RCU_LVL_4 0
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
#elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
-# define NUM_RCU_LVL_4 (NR_CPUS)
+# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
+# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
+# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
-#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
-#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
-
extern int rcu_num_lvls;
extern int rcu_num_nodes;
@@ -157,16 +163,21 @@ struct rcu_node {
/* an rcu_data structure, otherwise, each */
/* bit corresponds to a child rcu_node */
/* structure. */
- unsigned long expmask; /* Groups that have ->blkd_tasks */
- /* elements that need to drain to allow the */
- /* current expedited grace period to */
- /* complete (only for PREEMPT_RCU). */
unsigned long qsmaskinit;
- /* Per-GP initial value for qsmask & expmask. */
+ /* Per-GP initial value for qsmask. */
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext;
/* Online CPUs for next grace period. */
+ unsigned long expmask; /* CPUs or groups that need to check in */
+ /* to allow the current expedited GP */
+ /* to complete. */
+ unsigned long expmaskinit;
+ /* Per-GP initial values for expmask. */
+ /* Initialized from ->expmaskinitnext at the */
+ /* beginning of each expedited GP. */
+ unsigned long expmaskinitnext;
+ /* Online CPUs for next expedited GP. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
@@ -236,6 +247,8 @@ struct rcu_node {
int need_future_gp[2];
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+
+ struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
/*
@@ -265,6 +278,18 @@ struct rcu_node {
for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \
(rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++)
+/*
+ * Union to allow "aggregate OR" operation on the need for a quiescent
+ * state by the normal and expedited grace periods.
+ */
+union rcu_noqs {
+ struct {
+ u8 norm;
+ u8 exp;
+ } b; /* Bits. */
+ u16 s; /* Set of bits, aggregate OR here. */
+};
+
/* Index values for nxttail array in struct rcu_data. */
#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
@@ -281,18 +306,16 @@ struct rcu_data {
/* is aware of having started. */
unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */
/* for rcu_all_qs() invocations. */
- bool passed_quiesce; /* User-mode/idle loop etc. */
- bool qs_pending; /* Core waits for quiesc state. */
+ union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */
+ bool core_needs_qs; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible gpnum/completed wrap. */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned long ticks_this_gp; /* The number of scheduling-clock */
/* ticks this CPU has handled */
/* during and after the last grace */
/* period it is aware of. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
/* 2) batch handling */
/*
@@ -346,7 +369,7 @@ struct rcu_data {
/* 5) __rcu_pending() statistics. */
unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
- unsigned long n_rp_qs_pending;
+ unsigned long n_rp_core_needs_qs;
unsigned long n_rp_report_qs;
unsigned long n_rp_cb_ready;
unsigned long n_rp_cpu_needs_gp;
@@ -355,11 +378,12 @@ struct rcu_data {
unsigned long n_rp_nocb_defer_wakeup;
unsigned long n_rp_need_nothing;
- /* 6) _rcu_barrier() and OOM callbacks. */
+ /* 6) _rcu_barrier(), OOM callbacks, and expediting. */
struct rcu_head barrier_head;
#ifdef CONFIG_RCU_FAST_NO_HZ
struct rcu_head oom_head;
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+ struct mutex exp_funnel_mutex;
/* 7) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
@@ -387,21 +411,12 @@ struct rcu_data {
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
unsigned int softirq_snap; /* Snapshot of softirq activity. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
int cpu;
struct rcu_state *rsp;
};
-/* Values for fqs_state field in struct rcu_state. */
-#define RCU_GP_IDLE 0 /* No grace period in progress. */
-#define RCU_GP_INIT 1 /* Grace period being initialized. */
-#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
-#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
-#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
-
/* Values for nocb_defer_wakeup field in struct rcu_data. */
#define RCU_NOGP_WAKE_NOT 0
#define RCU_NOGP_WAKE 1
@@ -442,19 +457,18 @@ do { \
*/
struct rcu_state {
struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
- struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */
- u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
- u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
+ struct rcu_node *level[RCU_NUM_LVLS + 1];
+ /* Hierarchy levels (+1 to */
+ /* shut bogus gcc warning) */
u8 flavor_mask; /* bit in flavor mask. */
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
- void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
- void (*func)(struct rcu_head *head));
+ call_rcu_func_t call; /* call_rcu() flavor. */
+ int ncpus; /* # CPUs seen so far. */
/* The following fields are guarded by the root rcu_node's lock. */
- u8 fqs_state ____cacheline_internodealigned_in_smp;
- /* Force QS state. */
- u8 boost; /* Subject to priority boost. */
+ u8 boost ____cacheline_internodealigned_in_smp;
+ /* Subject to priority boost. */
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */
@@ -479,21 +493,19 @@ struct rcu_state {
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
- unsigned long n_barrier_done; /* ++ at start and end of */
+ unsigned long barrier_sequence; /* ++ at start and end of */
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
- atomic_long_t expedited_tryfail; /* # acquisition failures. */
+ unsigned long expedited_sequence; /* Take a ticket. */
+ atomic_long_t expedited_workdone0; /* # done by others #0. */
atomic_long_t expedited_workdone1; /* # done by others #1. */
atomic_long_t expedited_workdone2; /* # done by others #2. */
+ atomic_long_t expedited_workdone3; /* # done by others #3. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+ atomic_t expedited_need_qs; /* # CPUs left to check in. */
+ wait_queue_head_t expedited_wq; /* Wait for check-ins. */
+ int ncpus_snap; /* # CPUs seen last time. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
@@ -524,10 +536,14 @@ struct rcu_state {
#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
-/* Values for rcu_state structure's gp_flags field. */
-#define RCU_GP_WAIT_INIT 0 /* Initial state. */
+/* Values for rcu_state structure's gp_state field. */
+#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */
#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
-#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */
+#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
+#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
+#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */
+#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
+#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
extern struct list_head rcu_struct_flavors;
@@ -564,9 +580,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
+static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_preempt_check_callbacks(void);
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -635,3 +652,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
}
#endif /* #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Place this after a lock-acquisition primitive to guarantee that
+ * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies
+ * if the UNLOCK and LOCK are executed by the same CPU or if the
+ * UNLOCK and LOCK operate on the same lock variable.
+ */
+#ifdef CONFIG_PPC
+#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
+#else /* #ifdef CONFIG_PPC */
+#define smp_mb__after_unlock_lock() do { } while (0)
+#endif /* #else #ifdef CONFIG_PPC */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 013485fb2b06..630c19772630 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -82,10 +82,8 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU lockdep checking is enabled.\n");
if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
pr_info("\tRCU torture testing starts during boot.\n");
- if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
- pr_info("\tAdditional per-CPU info printed with stalls.\n");
- if (NUM_RCU_LVL_4 != 0)
- pr_info("\tFour-level hierarchy is enabled.\n");
+ if (RCU_NUM_LVLS >= 4)
+ pr_info("\tFour(or more)-level hierarchy is enabled.\n");
if (RCU_FANOUT_LEAF != 16)
pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
RCU_FANOUT_LEAF);
@@ -103,7 +101,6 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *const rcu_state_p = &rcu_preempt_state;
static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data;
-static int rcu_preempted_readers_exp(struct rcu_node *rnp);
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake);
@@ -116,6 +113,147 @@ static void __init rcu_bootup_announce(void)
rcu_bootup_announce_oddness();
}
+/* Flags for rcu_preempt_ctxt_queue() decision table. */
+#define RCU_GP_TASKS 0x8
+#define RCU_EXP_TASKS 0x4
+#define RCU_GP_BLKD 0x2
+#define RCU_EXP_BLKD 0x1
+
+/*
+ * Queues a task preempted within an RCU-preempt read-side critical
+ * section into the appropriate location within the ->blkd_tasks list,
+ * depending on the states of any ongoing normal and expedited grace
+ * periods. The ->gp_tasks pointer indicates which element the normal
+ * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
+ * indicates which element the expedited grace period is waiting on (again,
+ * NULL if none). If a grace period is waiting on a given element in the
+ * ->blkd_tasks list, it also waits on all subsequent elements. Thus,
+ * adding a task to the tail of the list blocks any grace period that is
+ * already waiting on one of the elements. In contrast, adding a task
+ * to the head of the list won't block any grace period that is already
+ * waiting on one of the elements.
+ *
+ * This queuing is imprecise, and can sometimes make an ongoing grace
+ * period wait for a task that is not strictly speaking blocking it.
+ * Given the choice, we needlessly block a normal grace period rather than
+ * blocking an expedited grace period.
+ *
+ * Note that an endless sequence of expedited grace periods still cannot
+ * indefinitely postpone a normal grace period. Eventually, all of the
+ * fixed number of preempted tasks blocking the normal grace period that are
+ * not also blocking the expedited grace period will resume and complete
+ * their RCU read-side critical sections. At that point, the ->gp_tasks
+ * pointer will equal the ->exp_tasks pointer, at which point the end of
+ * the corresponding expedited grace period will also be the end of the
+ * normal grace period.
+ */
+static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long flags) __releases(rnp->lock)
+{
+ int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
+ (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
+ (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
+ (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
+ struct task_struct *t = current;
+
+ /*
+ * Decide where to queue the newly blocked task. In theory,
+ * this could be an if-statement. In practice, when I tried
+ * that, it was quite messy.
+ */
+ switch (blkd_state) {
+ case 0:
+ case RCU_EXP_TASKS:
+ case RCU_EXP_TASKS + RCU_GP_BLKD:
+ case RCU_GP_TASKS:
+ case RCU_GP_TASKS + RCU_EXP_TASKS:
+
+ /*
+ * Blocking neither GP, or first task blocking the normal
+ * GP but not blocking the already-waiting expedited GP.
+ * Queue at the head of the list to avoid unnecessarily
+ * blocking the already-waiting GPs.
+ */
+ list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
+ break;
+
+ case RCU_EXP_BLKD:
+ case RCU_GP_BLKD:
+ case RCU_GP_BLKD + RCU_EXP_BLKD:
+ case RCU_GP_TASKS + RCU_EXP_BLKD:
+ case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
+ case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
+
+ /*
+ * First task arriving that blocks either GP, or first task
+ * arriving that blocks the expedited GP (with the normal
+ * GP already waiting), or a task arriving that blocks
+ * both GPs with both GPs already waiting. Queue at the
+ * tail of the list to avoid any GP waiting on any of the
+ * already queued tasks that are not blocking it.
+ */
+ list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
+ break;
+
+ case RCU_EXP_TASKS + RCU_EXP_BLKD:
+ case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
+ case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
+
+ /*
+ * Second or subsequent task blocking the expedited GP.
+ * The task either does not block the normal GP, or is the
+ * first task blocking the normal GP. Queue just after
+ * the first task blocking the expedited GP.
+ */
+ list_add(&t->rcu_node_entry, rnp->exp_tasks);
+ break;
+
+ case RCU_GP_TASKS + RCU_GP_BLKD:
+ case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
+
+ /*
+ * Second or subsequent task blocking the normal GP.
+ * The task does not block the expedited GP. Queue just
+ * after the first task blocking the normal GP.
+ */
+ list_add(&t->rcu_node_entry, rnp->gp_tasks);
+ break;
+
+ default:
+
+ /* Yet another exercise in excessive paranoia. */
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ /*
+ * We have now queued the task. If it was the first one to
+ * block either grace period, update the ->gp_tasks and/or
+ * ->exp_tasks pointers, respectively, to reference the newly
+ * blocked task.
+ */
+ if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD))
+ rnp->gp_tasks = &t->rcu_node_entry;
+ if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
+ rnp->exp_tasks = &t->rcu_node_entry;
+ raw_spin_unlock(&rnp->lock);
+
+ /*
+ * Report the quiescent state for the expedited GP. This expedited
+ * GP should not be able to end until we report, so there should be
+ * no need to check for a subsequent expedited GP. (Though we are
+ * still in a quiescent state in any case.)
+ */
+ if (blkd_state & RCU_EXP_BLKD &&
+ t->rcu_read_unlock_special.b.exp_need_qs) {
+ t->rcu_read_unlock_special.b.exp_need_qs = false;
+ rcu_report_exp_rdp(rdp->rsp, rdp, true);
+ } else {
+ WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
+ }
+ local_irq_restore(flags);
+}
+
/*
* Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
@@ -127,11 +265,11 @@ static void __init rcu_bootup_announce(void)
*/
static void rcu_preempt_qs(void)
{
- if (!__this_cpu_read(rcu_data_p->passed_quiesce)) {
+ if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) {
trace_rcu_grace_period(TPS("rcu_preempt"),
__this_cpu_read(rcu_data_p->gpnum),
TPS("cpuqs"));
- __this_cpu_write(rcu_data_p->passed_quiesce, 1);
+ __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false);
barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
current->rcu_read_unlock_special.b.need_qs = false;
}
@@ -169,42 +307,18 @@ static void rcu_preempt_note_context_switch(void)
t->rcu_blocked_node = rnp;
/*
- * If this CPU has already checked in, then this task
- * will hold up the next grace period rather than the
- * current grace period. Queue the task accordingly.
- * If the task is queued for the current grace period
- * (i.e., this CPU has not yet passed through a quiescent
- * state for the current grace period), then as long
- * as that task remains queued, the current grace period
- * cannot end. Note that there is some uncertainty as
- * to exactly when the current grace period started.
- * We take a conservative approach, which can result
- * in unnecessarily waiting on tasks that started very
- * slightly after the current grace period began. C'est
- * la vie!!!
- *
- * But first, note that the current CPU must still be
- * on line!
+ * Verify the CPU's sanity, trace the preemption, and
+ * then queue the task as required based on the states
+ * of any ongoing and expedited grace periods.
*/
WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
- if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
- list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
- rnp->gp_tasks = &t->rcu_node_entry;
- if (IS_ENABLED(CONFIG_RCU_BOOST) &&
- rnp->boost_tasks != NULL)
- rnp->boost_tasks = rnp->gp_tasks;
- } else {
- list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
- if (rnp->qsmask & rdp->grpmask)
- rnp->gp_tasks = &t->rcu_node_entry;
- }
trace_rcu_preempt_task(rdp->rsp->name,
t->pid,
(rnp->qsmask & rdp->grpmask)
? rnp->gpnum
: rnp->gpnum + 1);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rcu_preempt_ctxt_queue(rnp, rdp, flags);
} else if (t->rcu_read_lock_nesting < 0 &&
t->rcu_read_unlock_special.s) {
@@ -274,6 +388,7 @@ void rcu_read_unlock_special(struct task_struct *t)
unsigned long flags;
struct list_head *np;
bool drop_boost_mutex = false;
+ struct rcu_data *rdp;
struct rcu_node *rnp;
union rcu_special special;
@@ -284,8 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
/*
- * If RCU core is waiting for this CPU to exit critical section,
- * let it know that we have done so. Because irqs are disabled,
+ * If RCU core is waiting for this CPU to exit its critical section,
+ * report the fact that it has exited. Because irqs are disabled,
* t->rcu_read_unlock_special cannot change.
*/
special = t->rcu_read_unlock_special;
@@ -298,13 +413,32 @@ void rcu_read_unlock_special(struct task_struct *t)
}
}
+ /*
+ * Respond to a request for an expedited grace period, but only if
+ * we were not preempted, meaning that we were running on the same
+ * CPU throughout. If we were preempted, the exp_need_qs flag
+ * would have been cleared at the time of the first preemption,
+ * and the quiescent state would be reported when we were dequeued.
+ */
+ if (special.b.exp_need_qs) {
+ WARN_ON_ONCE(special.b.blocked);
+ t->rcu_read_unlock_special.b.exp_need_qs = false;
+ rdp = this_cpu_ptr(rcu_state_p->rda);
+ rcu_report_exp_rdp(rcu_state_p, rdp, true);
+ if (!t->rcu_read_unlock_special.s) {
+ local_irq_restore(flags);
+ return;
+ }
+ }
+
/* Hardware IRQ handlers cannot block, complain if they get here. */
if (in_irq() || in_serving_softirq()) {
lockdep_rcu_suspicious(__FILE__, __LINE__,
"rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
- pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
+ pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n",
t->rcu_read_unlock_special.s,
t->rcu_read_unlock_special.b.blocked,
+ t->rcu_read_unlock_special.b.exp_need_qs,
t->rcu_read_unlock_special.b.need_qs);
local_irq_restore(flags);
return;
@@ -331,7 +465,7 @@ void rcu_read_unlock_special(struct task_struct *t)
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
- empty_exp = !rcu_preempted_readers_exp(rnp);
+ empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
@@ -355,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t)
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
* so we must take a snapshot of the expedited state.
*/
- empty_exp_now = !rcu_preempted_readers_exp(rnp);
+ empty_exp_now = sync_rcu_preempt_exp_done(rnp);
if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
rnp->gpnum,
@@ -418,8 +552,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
rcu_print_detail_task_stall_rnp(rnp);
}
-#ifdef CONFIG_RCU_CPU_STALL_INFO
-
static void rcu_print_task_stall_begin(struct rcu_node *rnp)
{
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
@@ -431,18 +563,6 @@ static void rcu_print_task_stall_end(void)
pr_cont("\n");
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-static void rcu_print_task_stall_begin(struct rcu_node *rnp)
-{
-}
-
-static void rcu_print_task_stall_end(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
/*
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each.
@@ -466,6 +586,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
}
/*
+ * Scan the current list of tasks blocked within RCU read-side critical
+ * sections, printing out the tid of each that is blocking the current
+ * expedited grace period.
+ */
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+ int ndetected = 0; /* number of task pids printed so far */
+
+ if (!rnp->exp_tasks) /* ->exp_tasks is NULL: nothing recorded on this node */
+ return 0;
+ t = list_entry(rnp->exp_tasks->prev,
+ struct task_struct, rcu_node_entry); /* entry just before ->exp_tasks */
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { /* resumes after t, i.e. at ->exp_tasks */
+ pr_cont(" P%d", t->pid);
+ ndetected++;
+ }
+ return ndetected; /* count of pids emitted via pr_cont() */
+}
+
+/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
@@ -499,8 +640,8 @@ static void rcu_preempt_check_callbacks(void)
return;
}
if (t->rcu_read_lock_nesting > 0 &&
- __this_cpu_read(rcu_data_p->qs_pending) &&
- !__this_cpu_read(rcu_data_p->passed_quiesce))
+ __this_cpu_read(rcu_data_p->core_needs_qs) &&
+ __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm))
t->rcu_read_unlock_special.b.need_qs = true;
}
@@ -516,7 +657,7 @@ static void rcu_preempt_do_callbacks(void)
/*
* Queue a preemptible-RCU callback for invocation after a grace period.
*/
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, rcu_state_p, -1, 0);
}
@@ -538,10 +679,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
*/
void synchronize_rcu(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu() in RCU read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu() in RCU read-side critical section");
if (!rcu_scheduler_active)
return;
if (rcu_gp_is_expedited())
@@ -551,157 +692,41 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
-static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static unsigned long sync_rcu_preempt_exp_count;
-static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
-
-/*
- * Return non-zero if there are any tasks in RCU read-side critical
- * sections blocking the current preemptible-RCU expedited grace period.
- * If there is no preemptible-RCU expedited grace period currently in
- * progress, returns zero unconditionally.
- */
-static int rcu_preempted_readers_exp(struct rcu_node *rnp)
-{
- return rnp->exp_tasks != NULL;
-}
-
-/*
- * return non-zero if there is no RCU expedited grace period in progress
- * for the specified rcu_node structure, in other words, if all CPUs and
- * tasks covered by the specified rcu_node structure have done their bit
- * for the current expedited grace period. Works only for preemptible
- * RCU -- other RCU implementation use other means.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
- */
-static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
-{
- return !rcu_preempted_readers_exp(rnp) &&
- READ_ONCE(rnp->expmask) == 0;
-}
-
-/*
- * Report the exit from RCU read-side critical section for the last task
- * that queued itself during or before the current expedited preemptible-RCU
- * grace period. This event is reported either to the rcu_node structure on
- * which the task was queued or to one of that rcu_node structure's ancestors,
- * recursively up the tree. (Calm down, calm down, we do the recursion
- * iteratively!)
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
- */
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
- bool wake)
-{
- unsigned long flags;
- unsigned long mask;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- for (;;) {
- if (!sync_rcu_preempt_exp_done(rnp)) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- break;
- }
- if (rnp->parent == NULL) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (wake) {
- smp_mb(); /* EGP done before wake_up(). */
- wake_up(&sync_rcu_preempt_exp_wq);
- }
- break;
- }
- mask = rnp->grpmask;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
- rnp = rnp->parent;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- smp_mb__after_unlock_lock();
- rnp->expmask &= ~mask;
- }
-}
-
-/*
- * Snapshot the tasks blocking the newly started preemptible-RCU expedited
- * grace period for the specified rcu_node structure, phase 1. If there
- * are such tasks, set the ->expmask bits up the rcu_node tree and also
- * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
- * that work is needed here.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
- */
-static void
-sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
-{
- unsigned long flags;
- unsigned long mask;
- struct rcu_node *rnp_up;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- WARN_ON_ONCE(rnp->expmask);
- WARN_ON_ONCE(rnp->exp_tasks);
- if (!rcu_preempt_has_tasks(rnp)) {
- /* No blocked tasks, nothing to do. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- /* Call for Phase 2 and propagate ->expmask bits up the tree. */
- rnp->expmask = 1;
- rnp_up = rnp;
- while (rnp_up->parent) {
- mask = rnp_up->grpmask;
- rnp_up = rnp_up->parent;
- if (rnp_up->expmask & mask)
- break;
- raw_spin_lock(&rnp_up->lock); /* irqs already off */
- smp_mb__after_unlock_lock();
- rnp_up->expmask |= mask;
- raw_spin_unlock(&rnp_up->lock); /* irqs still off */
- }
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
-}
-
/*
- * Snapshot the tasks blocking the newly started preemptible-RCU expedited
- * grace period for the specified rcu_node structure, phase 2. If the
- * leaf rcu_node structure has its ->expmask field set, check for tasks.
- * If there are some, clear ->expmask and set ->exp_tasks accordingly,
- * then initiate RCU priority boosting. Otherwise, clear ->expmask and
- * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
- * enabling rcu_read_unlock_special() to do the bit-clearing.
- *
- * Caller must hold sync_rcu_preempt_exp_mutex.
+ * Remote handler for smp_call_function_single(). If there is an
+ * RCU read-side critical section in effect, request that the
+ * next rcu_read_unlock() record the quiescent state up the
+ * ->expmask fields in the rcu_node tree. Otherwise, immediately
+ * report the quiescent state.
*/
-static void
-sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
+static void sync_rcu_exp_handler(void *info)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rnp->lock, flags);
- smp_mb__after_unlock_lock();
- if (!rnp->expmask) {
- /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
-
- /* Phase 1 is over. */
- rnp->expmask = 0;
+ struct rcu_data *rdp;
+ struct rcu_state *rsp = info;
+ struct task_struct *t = current;
/*
- * If there are still blocked tasks, set up ->exp_tasks so that
- * rcu_read_unlock_special() will wake us and then boost them.
+ * Within an RCU read-side critical section, request that the next
+ * rcu_read_unlock() report. Unless this RCU read-side critical
+ * section has already blocked, in which case it is already set
+ * up for the expedited grace period to wait on it.
*/
- if (rcu_preempt_has_tasks(rnp)) {
- rnp->exp_tasks = rnp->blkd_tasks.next;
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
+ if (t->rcu_read_lock_nesting > 0 &&
+ !t->rcu_read_unlock_special.b.blocked) {
+ t->rcu_read_unlock_special.b.exp_need_qs = true;
return;
}
- /* No longer any blocked tasks, so undo bit setting. */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rcu_report_exp_rnp(rsp, rnp, false);
+ /*
+ * We are either exiting an RCU read-side critical section (negative
+ * values of t->rcu_read_lock_nesting) or are not in one at all
+ * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU
+ * read-side critical section that blocked before this expedited
+ * grace period started. Either way, we can immediately report
+ * the quiescent state.
+ */
+ rdp = this_cpu_ptr(rsp->rda);
+ rcu_report_exp_rdp(rsp, rdp, true);
}
/**
@@ -719,80 +744,28 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
void synchronize_rcu_expedited(void)
{
struct rcu_node *rnp;
+ struct rcu_node *rnp_unlock;
struct rcu_state *rsp = rcu_state_p;
- unsigned long snap;
- int trycount = 0;
+ unsigned long s;
- smp_mb(); /* Caller's modifications seen first by other CPUs. */
- snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
- smp_mb(); /* Above access cannot bleed into critical section. */
+ s = rcu_exp_gp_seq_snap(rsp);
- /*
- * Block CPU-hotplug operations. This means that any CPU-hotplug
- * operation that finds an rcu_node structure with tasks in the
- * process of being boosted will know that all tasks blocking
- * this expedited grace period will already be in the process of
- * being boosted. This simplifies the process of moving tasks
- * from leaf to root rcu_node structures.
- */
- if (!try_get_online_cpus()) {
- /* CPU-hotplug operation in flight, fall back to normal GP. */
- wait_rcu_gp(call_rcu);
- return;
- }
+ rnp_unlock = exp_funnel_lock(rsp, s);
+ if (rnp_unlock == NULL)
+ return; /* Someone else did our work for us. */
- /*
- * Acquire lock, falling back to synchronize_rcu() if too many
- * lock-acquisition failures. Of course, if someone does the
- * expedited grace period for us, just leave.
- */
- while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
- if (ULONG_CMP_LT(snap,
- READ_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto mb_ret; /* Others did our work for us. */
- }
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- put_online_cpus();
- wait_rcu_gp(call_rcu);
- return;
- }
- }
- if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
- put_online_cpus();
- goto unlock_mb_ret; /* Others did our work for us. */
- }
+ rcu_exp_gp_seq_start(rsp);
- /* force all RCU readers onto ->blkd_tasks lists. */
- synchronize_sched_expedited();
-
- /*
- * Snapshot current state of ->blkd_tasks lists into ->expmask.
- * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
- * to start clearing them. Doing this in one phase leads to
- * strange races between setting and clearing bits, so just say "no"!
- */
- rcu_for_each_leaf_node(rsp, rnp)
- sync_rcu_preempt_exp_init1(rsp, rnp);
- rcu_for_each_leaf_node(rsp, rnp)
- sync_rcu_preempt_exp_init2(rsp, rnp);
-
- put_online_cpus();
+ /* Initialize the rcu_node tree in preparation for the wait. */
+ sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
/* Wait for snapshotted ->blkd_tasks lists to drain. */
rnp = rcu_get_root(rsp);
- wait_event(sync_rcu_preempt_exp_wq,
- sync_rcu_preempt_exp_done(rnp));
+ synchronize_sched_expedited_wait(rsp);
/* Clean up and exit. */
- smp_mb(); /* ensure expedited GP seen before counter increment. */
- WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1);
-unlock_mb_ret:
- mutex_unlock(&sync_rcu_preempt_exp_mutex);
-mb_ret:
- smp_mb(); /* ensure subsequent action seen after grace period. */
+ rcu_exp_gp_seq_end(rsp);
+ mutex_unlock(&rnp_unlock->exp_funnel_mutex);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -893,6 +866,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
}
/*
+ * Because preemptible RCU does not exist, we never have to check for
+ * tasks blocked within RCU read-side critical sections that are
+ * blocking the current expedited grace period.
+ */
+static int rcu_print_task_exp_stall(struct rcu_node *rnp)
+{
+ return 0;
+}
+
+/*
* Because there is no preemptible RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
@@ -1061,8 +1044,7 @@ static int rcu_boost(struct rcu_node *rnp)
}
/*
- * Priority-boosting kthread. One per leaf rcu_node and one for the
- * root rcu_node.
+ * Priority-boosting kthread, one per leaf rcu_node.
*/
static int rcu_boost_kthread(void *arg)
{
@@ -1680,12 +1662,10 @@ static int rcu_oom_notify(struct notifier_block *self,
*/
atomic_set(&oom_callback_count, 1);
- get_online_cpus();
for_each_online_cpu(cpu) {
smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
cond_resched_rcu_qs();
}
- put_online_cpus();
/* Unconditionally decrement: no need to wake ourselves up. */
atomic_dec(&oom_callback_count);
@@ -1706,8 +1686,6 @@ early_initcall(rcu_register_oom_notifier);
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_CPU_STALL_INFO
-
#ifdef CONFIG_RCU_FAST_NO_HZ
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
@@ -1765,8 +1743,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
- cpu, ticks_value, ticks_title,
+ pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+ cpu,
+ "O."[!!cpu_online(cpu)],
+ "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
+ "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+ ticks_value, ticks_title,
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
@@ -1796,33 +1778,6 @@ static void increment_cpu_stall_ticks(void)
raw_cpu_inc(rsp->rda->ticks_this_gp);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
-static void print_cpu_stall_info_begin(void)
-{
- pr_cont(" {");
-}
-
-static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
-{
- pr_cont(" %d", cpu);
-}
-
-static void print_cpu_stall_info_end(void)
-{
- pr_cont("} ");
-}
-
-static void zero_cpu_stall_ticks(struct rcu_data *rdp)
-{
-}
-
-static void increment_cpu_stall_ticks(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
-
#ifdef CONFIG_RCU_NOCB_CPU
/*
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3ea7ffc7d5c4..ef7093cc9b5c 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v)
static int show_rcubarrier(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "bcc: %d nbd: %lu\n",
+ seq_printf(m, "bcc: %d bseq: %lu\n",
atomic_read(&rsp->barrier_cpu_count),
- rsp->n_barrier_done);
+ rsp->barrier_sequence);
return 0;
}
@@ -117,13 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
if (!rdp->beenonline)
return;
- seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d",
+ seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
ulong2long(rdp->completed), ulong2long(rdp->gpnum),
- rdp->passed_quiesce,
+ rdp->cpu_no_qs.b.norm,
rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu),
- rdp->qs_pending);
+ rdp->core_needs_qs);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
@@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
- atomic_long_read(&rsp->expedited_start),
- atomic_long_read(&rsp->expedited_done),
- atomic_long_read(&rsp->expedited_wrap),
- atomic_long_read(&rsp->expedited_tryfail),
+ seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+ rsp->expedited_sequence,
+ atomic_long_read(&rsp->expedited_workdone0),
atomic_long_read(&rsp->expedited_workdone1),
atomic_long_read(&rsp->expedited_workdone2),
+ atomic_long_read(&rsp->expedited_workdone3),
atomic_long_read(&rsp->expedited_normal),
- atomic_long_read(&rsp->expedited_stoppedcpus),
- atomic_long_read(&rsp->expedited_done_tries),
- atomic_long_read(&rsp->expedited_done_lost),
- atomic_long_read(&rsp->expedited_done_exit));
+ atomic_read(&rsp->expedited_need_qs),
+ rsp->expedited_sequence / 2);
return 0;
}
@@ -271,7 +268,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum;
seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ",
ulong2long(rsp->completed), ulong2long(gpnum),
- rsp->fqs_state,
+ rsp->gp_state,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff));
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
@@ -364,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
cpu_is_offline(rdp->cpu) ? '!' : ' ',
rdp->n_rcu_pending);
seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ",
- rdp->n_rp_qs_pending,
+ rdp->n_rp_core_needs_qs,
rdp->n_rp_report_qs,
rdp->n_rp_cb_ready,
rdp->n_rp_cpu_needs_gp);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index afaecb7a799a..5f748c5a40f0 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate");
module_param(rcu_expedited, int, 0);
+#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
+/**
+ * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * RCU-sched read-side critical section. In absence of
+ * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
+ * critical section unless it can prove otherwise. Note that disabling
+ * of preemption (including disabling irqs) counts as an RCU-sched
+ * read-side critical section. This is useful for debug checks in functions
+ * that required that they be called within an RCU-sched read-side
+ * critical section.
+ *
+ * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
+ * and while lockdep is disabled.
+ *
+ * Note that if the CPU is in the idle loop from an RCU point of
+ * view (ie: that we are in the section between rcu_idle_enter() and
+ * rcu_idle_exit()) then rcu_read_lock_sched_held() returns false even if
+ * the CPU did an rcu_read_lock_sched(). The reason for this is that RCU
+ * ignores CPUs that are in such a section, considering these as in
+ * extended quiescent state, so such a CPU is effectively never in an RCU
+ * read-side critical section regardless of what RCU primitives it invokes.
+ * This state of affairs is required --- we need to keep an RCU-free window
+ * in idle where the CPU may possibly enter into low power mode. This way
+ * we can signal an extended quiescent state to other CPUs that started a
+ * grace period. Otherwise we would delay any grace period as long as we
+ * run in the idle task.
+ *
+ * Similarly, we avoid claiming an RCU-sched read-side critical section
+ * if the current CPU is offline.
+ */
+int rcu_read_lock_sched_held(void)
+{
+ int lockdep_opinion = 0; /* stays 0 unless lockdep is consulted below */
+
+ if (!debug_lockdep_rcu_enabled())
+ return 1; /* checking disabled: claim "held" to prevent false positives */
+ if (!rcu_is_watching())
+ return 0; /* RCU is not watching this CPU, so never report "held" */
+ if (!rcu_lockdep_current_cpu_online())
+ return 0; /* current CPU is offline, so never report "held" */
+ if (debug_locks) /* ask lockdep only while debug_locks is set */
+ lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
+ return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); /* any one of the three suffices */
+}
+EXPORT_SYMBOL(rcu_read_lock_sched_held);
+#endif
+
#ifndef CONFIG_TINY_RCU
static atomic_t rcu_expedited_nesting =
@@ -269,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head)
rcu = container_of(head, struct rcu_synchronize, head);
complete(&rcu->completion);
}
+EXPORT_SYMBOL_GPL(wakeme_after_rcu);
-void wait_rcu_gp(call_rcu_func_t crf)
+void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+ struct rcu_synchronize *rs_array)
{
- struct rcu_synchronize rcu;
+ int i;
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- crf(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
+ /* Initialize and register callbacks for each flavor specified. */
+ for (i = 0; i < n; i++) {
+ if (checktiny &&
+ (crcu_array[i] == call_rcu ||
+ crcu_array[i] == call_rcu_bh)) {
+ might_sleep();
+ continue;
+ }
+ init_rcu_head_on_stack(&rs_array[i].head);
+ init_completion(&rs_array[i].completion);
+ (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+ }
+
+ /* Wait for all callbacks to be invoked. */
+ for (i = 0; i < n; i++) {
+ if (checktiny &&
+ (crcu_array[i] == call_rcu ||
+ crcu_array[i] == call_rcu_bh))
+ continue;
+ wait_for_completion(&rs_array[i].completion);
+ destroy_rcu_head_on_stack(&rs_array[i].head);
+ }
}
-EXPORT_SYMBOL_GPL(wait_rcu_gp);
+EXPORT_SYMBOL_GPL(__wait_rcu_gp);
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
@@ -468,7 +534,7 @@ static void rcu_spawn_tasks_kthread(void);
* Post an RCU-tasks callback. First call must be from process context
* after the scheduler if fully operational.
*/
-void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
+void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
unsigned long flags;
bool needwake;
@@ -523,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
void synchronize_rcu_tasks(void)
{
/* Complain if the scheduler has not started. */
- rcu_lockdep_assert(!rcu_scheduler_active,
- "synchronize_rcu_tasks called too soon");
+ RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+ "synchronize_rcu_tasks called too soon");
/* Wait for the grace period. */
wait_rcu_gp(call_rcu_tasks);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..bd30a973fe94 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
kernel_restart(buffer);
break;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
case LINUX_REBOOT_CMD_KEXEC:
ret = kernel_kexec();
break;
diff --git a/kernel/resource.c b/kernel/resource.c
index fed052a1bc9f..f150dbbe6f62 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
-/*
- * Search for a resouce entry that fully contains the specified region.
- * If found, return 1 if it is RAM, 0 if not.
- * If not found, or region is not fully contained, return -1
+/**
+ * region_intersects() - determine intersection of region with known resources
+ * @start: region start address
+ * @size: size of region
+ * @name: name of resource (in iomem_resource)
*
- * Used by the ioremap functions to ensure the user is not remapping RAM and is
- * a vast speed up over walking through the resource table page by page.
+ * Check if the specified region partially overlaps or fully eclipses a
+ * resource identified by @name. Return REGION_DISJOINT if the region
+ * does not overlap @name, return REGION_MIXED if the region overlaps
+ * @name and another resource, and return REGION_INTERSECTS if the
+ * region overlaps @name and no other defined resource. Note that
+ * REGION_INTERSECTS is also returned in the case when the specified
+ * region overlaps RAM and undefined memory holes.
+ *
+ * region_intersects() is used by memory remapping functions to ensure
+ * the user is not remapping RAM and is a vast speed up over walking
+ * through the resource table page by page.
*/
-int region_is_ram(resource_size_t start, unsigned long size)
+int region_intersects(resource_size_t start, size_t size, const char *name)
{
- struct resource *p;
- resource_size_t end = start + size - 1;
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- const char *name = "System RAM";
- int ret = -1;
+ resource_size_t end = start + size - 1;
+ int type = 0; int other = 0;
+ struct resource *p;
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
- if (p->end < start)
- continue;
-
- if (p->start <= start && end <= p->end) {
- /* resource fully contains region */
- if ((p->flags != flags) || strcmp(p->name, name))
- ret = 0;
- else
- ret = 1;
- break;
- }
- if (end < p->start)
- break; /* not found */
+ bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
+
+ if (start >= p->start && start <= p->end)
+ is_type ? type++ : other++;
+ if (end >= p->start && end <= p->end)
+ is_type ? type++ : other++;
+ if (p->start >= start && p->end <= end)
+ is_type ? type++ : other++;
}
read_unlock(&resource_lock);
- return ret;
+
+ if (other == 0)
+ return type ? REGION_INTERSECTS : REGION_DISJOINT;
+
+ if (type)
+ return REGION_MIXED;
+
+ return REGION_DISJOINT;
}
void __weak arch_remove_reservations(struct resource *avail)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..4d568ac9319e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
static void sched_feat_disable(int i)
{
- if (static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_dec(&sched_feat_keys[i]);
+ static_key_disable(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
- if (!static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_inc(&sched_feat_keys[i]);
+ static_key_enable(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
@@ -623,18 +621,21 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (!idle_cpu(cpu))
+ if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) /* busy housekeeper: keep the timer on this CPU */
return cpu;
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i)) {
+ if (!idle_cpu(i) && is_housekeeping_cpu(i)) { /* test candidate i, not the current cpu */
cpu = i;
goto unlock;
}
}
}
+
+ if (!is_housekeeping_cpu(cpu)) /* no busy housekeeper found and this CPU is not one */
+ cpu = housekeeping_any_cpu();
unlock:
rcu_read_unlock();
return cpu;
@@ -816,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
/*
* SCHED_IDLE tasks get minimal weight:
*/
- if (p->policy == SCHED_IDLE) {
+ if (idle_policy(p->policy)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
return;
@@ -826,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
load->inv_weight = prio_to_wmult[prio];
}
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_queued(rq, p);
+ if (!(flags & ENQUEUE_RESTORE))
+ sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_dequeued(rq, p);
+ if (!(flags & DEQUEUE_SAVE))
+ sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -1151,15 +1154,45 @@ static int migration_cpu_stop(void *data)
return 0;
}
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
cpumask_copy(&p->cpus_allowed, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct rq *rq = task_rq(p);
+ bool queued, running;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+
+ if (queued) {
+ /*
+ * Because __kthread_bind() calls this on blocked tasks without
+ * holding rq->lock.
+ */
+ lockdep_assert_held(&rq->lock);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ }
+ if (running)
+ put_prev_task(rq, p);
+
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (queued)
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
+}
+
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
@@ -1169,7 +1202,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
{
unsigned long flags;
struct rq *rq;
@@ -1178,6 +1212,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
rq = task_rq_lock(p, &flags);
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
@@ -1214,6 +1257,11 @@ out:
return ret;
}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, false);
+}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1246,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
- p->sched_class->migrate_task_rq(p, new_cpu);
+ p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
perf_event_task_migrate(p);
}
@@ -1287,12 +1335,16 @@ static int migrate_swap_stop(void *data)
struct rq *src_rq, *dst_rq;
int ret = -EAGAIN;
+ if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+ return -EAGAIN;
+
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
double_raw_lock(&arg->src_task->pi_lock,
&arg->dst_task->pi_lock);
double_rq_lock(src_rq, dst_rq);
+
if (task_cpu(arg->dst_task) != arg->dst_cpu)
goto unlock;
@@ -1528,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
goto out;
}
+ /* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- /* No more Mr. Nice Guy. */
- cpuset_cpus_allowed_fallback(p);
- state = possible;
- break;
-
+ if (IS_ENABLED(CONFIG_CPUSETS)) {
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
+ }
+ /* fall-through */
case possible:
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
@@ -1595,6 +1649,15 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg;
*avg += diff >> 3;
}
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
+
#endif /* CONFIG_SMP */
static void
@@ -1637,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#endif /* CONFIG_SCHEDSTATS */
}
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -1654,9 +1717,9 @@ static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
check_preempt_curr(rq, p, wake_flags);
- trace_sched_wakeup(p, true);
-
p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
@@ -1874,6 +1937,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (!(p->state & state))
goto out;
+ trace_sched_waking(p);
+
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
@@ -1949,6 +2014,8 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!(p->state & TASK_NORMAL))
goto out;
+ trace_sched_waking(p);
+
if (!task_on_rq_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
@@ -2016,9 +2083,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
-#ifdef CONFIG_SMP
- p->se.avg.decay_count = 0;
-#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -2058,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif /* CONFIG_NUMA_BALANCING */
}
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
#ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
+
void set_numabalancing_state(bool enabled)
{
if (enabled)
- sched_feat_set("NUMA");
+ static_branch_enable(&sched_numa_balancing);
else
- sched_feat_set("NO_NUMA");
-}
-#else
-__read_mostly bool numabalancing_enabled;
-
-void set_numabalancing_state(bool enabled)
-{
- numabalancing_enabled = enabled;
+ static_branch_disable(&sched_numa_balancing);
}
-#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2082,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
{
struct ctl_table t;
int err;
- int state = numabalancing_enabled;
+ int state = static_branch_likely(&sched_numa_balancing);
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2200,8 +2258,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
#ifdef CONFIG_SMP
inline struct dl_bw *dl_bw_of(int i)
{
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
return &cpu_rq(i)->rd->dl_bw;
}
@@ -2210,8 +2268,8 @@ static inline int dl_bw_cpus(int i)
struct root_domain *rd = cpu_rq(i)->rd;
int cpus = 0;
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
@@ -2293,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ /* Initialize new task's runnable average */
+ init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -2302,16 +2362,21 @@ void wake_up_new_task(struct task_struct *p)
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
- /* Initialize new task's runnable average */
- init_task_runnable_average(p);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
- trace_sched_wakeup_new(p, true);
+ trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ if (p->sched_class->task_woken) {
+ /*
+ * Nothing relies on rq->lock after this, so it's fine to
+ * drop it.
+ */
+ lockdep_unpin_lock(&rq->lock);
p->sched_class->task_woken(rq, p);
+ lockdep_pin_lock(&rq->lock);
+ }
#endif
task_rq_unlock(rq, p, &flags);
}
@@ -2420,7 +2485,6 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
- trace_sched_switch(prev, next);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
@@ -2454,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
struct mm_struct *mm = rq->prev_mm;
long prev_state;
+ /*
+ * The previous task will have left us with a preempt_count of 2
+ * because it left us after:
+ *
+ * schedule()
+ * preempt_disable(); // 1
+ * __schedule()
+ * raw_spin_lock_irq(&rq->lock) // 2
+ *
+ * Also, see FORK_PREEMPT_COUNT.
+ */
+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+ "corrupted preempt_count: %s/%d/0x%x\n",
+ current->comm, current->pid, preempt_count()))
+ preempt_count_set(FORK_PREEMPT_COUNT);
+
rq->prev_mm = NULL;
/*
@@ -2461,15 +2541,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* If a task dies, then it sets TASK_DEAD in tsk->state and calls
* schedule one last time. The schedule call will never return, and
* the scheduled task must drop that reference.
- * The test for TASK_DEAD must occur while the runqueue locks are
- * still held, otherwise prev could be scheduled on another cpu, die
- * there before we look at prev->state, and then the reference would
- * be dropped twice.
- * Manfred Spraul <manfred@colorfullife.com>
+ *
+ * We must observe prev->state before clearing prev->on_cpu (in
+ * finish_lock_switch), otherwise a concurrent wakeup can get prev
+ * running on another CPU and we could race with its RUNNING -> DEAD
+ * transition, resulting in a double drop.
*/
prev_state = prev->state;
vtime_task_switch(prev);
- finish_arch_switch(prev);
perf_event_task_sched_in(prev, current);
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -2489,7 +2568,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
put_task_struct(prev);
}
- tick_nohz_task_switch(current);
+ tick_nohz_task_switch();
return rq;
}
@@ -2539,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
{
struct rq *rq;
- /* finish_task_switch() drops rq->lock and enables preemtion */
- preempt_disable();
+ /*
+ * New tasks start with FORK_PREEMPT_COUNT, see there and
+ * finish_task_switch() for details.
+ *
+ * finish_task_switch() will drop rq->lock() and lower preempt_count
+ * and the preempt_enable() will end up enabling preemption (on
+ * PREEMPT_COUNT kernels).
+ */
+
rq = finish_task_switch(prev);
balance_callback(rq);
preempt_enable();
@@ -2614,13 +2700,20 @@ unsigned long nr_running(void)
/*
* Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race. The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
*/
bool single_task_running(void)
{
- if (cpu_rq(smp_processor_id())->nr_running == 1)
- return true;
- else
- return false;
+ return raw_rq()->nr_running == 1;
}
EXPORT_SYMBOL(single_task_running);
@@ -2891,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
static inline void schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
- BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+ BUG_ON(task_stack_end_corrupted(prev));
#endif
- /*
- * Test if we are atomic. Since do_exit() needs to call into
- * schedule() atomically, we ignore that path. Otherwise whine
- * if we are scheduling when we should not.
- */
- if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+ if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
+ preempt_count_set(PREEMPT_DISABLED);
+ }
rcu_sleep_check();
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -2985,7 +3076,7 @@ again:
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
@@ -2997,6 +3088,17 @@ static void __sched __schedule(void)
rcu_note_context_switch();
prev = rq->curr;
+ /*
+ * do_exit() calls schedule() with preemption disabled as an exception;
+ * however we must fix that up, otherwise the next task will see an
+ * inconsistent (higher) preempt count.
+ *
+ * It also avoids the below schedule_debug() test from complaining
+ * about this.
+ */
+ if (unlikely(prev->state == TASK_DEAD))
+ preempt_enable_no_resched_notrace();
+
schedule_debug(prev);
if (sched_feat(HRTICK))
@@ -3014,7 +3116,7 @@ static void __sched __schedule(void)
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
switch_count = &prev->nivcsw;
- if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+ if (!preempt && prev->state) {
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
@@ -3050,6 +3152,7 @@ static void __sched __schedule(void)
rq->curr = next;
++*switch_count;
+ trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
} else {
@@ -3079,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
preempt_disable();
- __schedule();
+ __schedule(false);
sched_preempt_enable_no_resched();
} while (need_resched());
}
@@ -3119,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
- preempt_active_enter();
- __schedule();
- preempt_active_exit();
+ preempt_disable_notrace();
+ __schedule(true);
+ preempt_enable_no_resched_notrace();
/*
* Check again in case we missed a preemption opportunity
@@ -3172,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
return;
do {
- /*
- * Use raw __prempt_count() ops that don't call function.
- * We can't call functions before disabling preemption which
- * disarm preemption tracing recursions.
- */
- __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
- barrier();
+ preempt_disable_notrace();
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule();
+ __schedule(true);
exception_exit(prev_ctx);
- barrier();
- __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+ preempt_enable_no_resched_notrace();
} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3212,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
prev_state = exception_enter();
do {
- preempt_active_enter();
+ preempt_disable();
local_irq_enable();
- __schedule();
+ __schedule(true);
local_irq_disable();
- preempt_active_exit();
+ sched_preempt_enable_no_resched();
} while (need_resched());
exception_exit(prev_state);
@@ -3244,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, queued, running, enqueue_flag = 0;
+ int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
struct rq *rq;
const struct sched_class *prev_class;
@@ -3276,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
@@ -3294,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- enqueue_flag = ENQUEUE_REPLENISH;
+ enqueue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
@@ -3302,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
- enqueue_flag = ENQUEUE_HEAD;
+ enqueue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
@@ -3354,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
}
queued = task_on_rq_queued(p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -3363,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
delta = p->prio - old_prio;
if (queued) {
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -3684,10 +3780,7 @@ recheck:
} else {
reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
- if (policy != SCHED_DEADLINE &&
- policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- policy != SCHED_IDLE)
+ if (!valid_policy(policy))
return -EINVAL;
}
@@ -3743,7 +3836,7 @@ recheck:
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+ if (idle_policy(p->policy) && !idle_policy(policy)) {
if (!can_nice(p, task_nice(p)))
return -EPERM;
}
@@ -3868,7 +3961,7 @@ change:
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
@@ -3878,11 +3971,15 @@ change:
if (running)
p->sched_class->set_curr_task(rq);
if (queued) {
+ int enqueue_flags = ENQUEUE_RESTORE;
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
- enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ if (oldprio <= p->prio)
+ enqueue_flags |= ENQUEUE_HEAD;
+
+ enqueue_task(rq, p, enqueue_flags);
}
check_class_changed(rq, p, prev_class, oldprio);
@@ -3960,6 +4057,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy,
{
return _sched_setscheduler(p, policy, param, false);
}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
@@ -4340,7 +4438,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = set_cpus_allowed_ptr(p, new_mask);
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
@@ -4492,7 +4590,7 @@ SYSCALL_DEFINE0(sched_yield)
int __sched _cond_resched(void)
{
- if (should_resched()) {
+ if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
@@ -4510,7 +4608,7 @@ EXPORT_SYMBOL(_cond_resched);
*/
int __cond_resched_lock(spinlock_t *lock)
{
- int resched = should_resched();
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held(lock);
@@ -4532,7 +4630,7 @@ int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (should_resched()) {
+ if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
local_bh_enable();
preempt_schedule_common();
local_bh_disable();
@@ -4865,13 +4963,22 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(&rq->lock);
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- do_set_cpus_allowed(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMP
+ /*
+ * It's possible that init_idle() gets called multiple times on a task,
+ * in that case do_set_cpus_allowed() will not do the right thing.
+ *
+ * And since this is boot we can forgo the serialization.
+ */
+ set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4888,10 +4995,11 @@ void init_idle(struct task_struct *idle, int cpu)
rq->curr = rq->idle = idle;
idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
@@ -4902,7 +5010,7 @@ void init_idle(struct task_struct *idle, int cpu)
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}
@@ -5021,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
@@ -5030,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
task_rq_unlock(rq, p, &flags);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -5124,24 +5232,47 @@ static void migrate_tasks(struct rq *dead_rq)
break;
/*
- * Ensure rq->lock covers the entire task selection
- * until the migration.
+ * pick_next_task assumes pinned rq->lock.
*/
lockdep_pin_lock(&rq->lock);
next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
+ /*
+ * Rules for changing task_struct::cpus_allowed are holding
+ * both pi_lock and rq->lock, such that holding either
+ * stabilizes the mask.
+ *
+ * Dropping rq->lock is not quite as disastrous as it usually is
+ * because !cpu_active at this point, which means load-balance
+ * will not interfere. Also, stop-machine.
+ */
+ lockdep_unpin_lock(&rq->lock);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&next->pi_lock);
+ raw_spin_lock(&rq->lock);
+
+ /*
+ * Since we're inside stop-machine, _nothing_ should have
+ * changed the task, WARN if weird stuff happened, because in
+ * that case the above rq->lock drop is a fail too.
+ */
+ if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+ raw_spin_unlock(&next->pi_lock);
+ continue;
+ }
+
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
- lockdep_unpin_lock(&rq->lock);
rq = __migrate_task(rq, next, dest_cpu);
if (rq != dead_rq) {
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
}
+ raw_spin_unlock(&next->pi_lock);
}
rq->stop = stop;
@@ -5311,8 +5442,7 @@ static void register_sched_domain_sysctl(void)
/* may be called multiple times per register */
static void unregister_sched_domain_sysctl(void)
{
- if (sd_sysctl_header)
- unregister_sysctl_table(sd_sysctl_header);
+ unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
if (sd_ctl_dir[0].child)
sd_free_ctl_entry(&sd_ctl_dir[0].child);
@@ -5429,13 +5559,27 @@ static void set_cpu_rq_start_time(void)
static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+ int cpu = (long)hcpu;
+
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
set_cpu_rq_start_time();
return NOTIFY_OK;
+
+ case CPU_ONLINE:
+ /*
+ * At this point a starting CPU has marked itself as online via
+ * set_cpu_online(). But it might not yet have marked itself
+ * as active, which is essential from here on.
+ */
+ set_cpu_active(cpu, true);
+ stop_machine_unpark(cpu);
+ return NOTIFY_OK;
+
case CPU_DOWN_FAILED:
- set_cpu_active((long)hcpu, true);
+ set_cpu_active(cpu, true);
return NOTIFY_OK;
+
default:
return NOTIFY_DONE;
}
@@ -6367,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
{ NULL, },
};
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
+static struct sched_domain_topology_level *sched_domain_topology =
+ default_topology;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
@@ -6445,8 +6590,10 @@ static void init_numa_topology_type(void)
n = sched_max_numa_distance;
- if (n <= 1)
+ if (sched_domains_numa_levels <= 1) {
sched_numa_topology_type = NUMA_DIRECT;
+ return;
+ }
for_each_online_node(a) {
for_each_online_node(b) {
@@ -7134,9 +7281,6 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
- /* nohz_full won't take effect without isolating the cpus. */
- tick_nohz_full_add_cpus_to(cpu_isolated_map);
-
sched_init_numa();
/*
@@ -7369,7 +7513,7 @@ void __init sched_init(void)
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
- int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+ int nested = preempt_count() + rcu_preempt_depth();
return (nested == preempt_offset);
}
@@ -7616,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, 0);
+ dequeue_task(rq, tsk, DEQUEUE_SAVE);
if (unlikely(running))
put_prev_task(rq, tsk);
@@ -7632,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk, queued);
+ tsk->sched_class->task_move_group(tsk);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
@@ -7640,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, tsk, 0);
+ enqueue_task(rq, tsk, ENQUEUE_RESTORE);
task_rq_unlock(rq, tsk, &flags);
}
@@ -8068,7 +8212,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
{
sched_move_task(task);
}
@@ -8100,21 +8244,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
sched_move_task(task);
}
-static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task)
-{
- /*
- * cgroup_exit() is called in the copy_process() failure path.
- * Ignore this case since the task hasn't ran yet, this avoids
- * trying to poke a half freed task state from generic code.
- */
- if (!(task->flags & PF_EXITING))
- return;
-
- sched_move_task(task);
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
@@ -8446,7 +8575,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
- .exit = cpu_cgroup_exit,
.legacy_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index c6acb07466bb..5a75b08cfd85 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,11 +31,6 @@ static inline int right_child(int i)
return (i << 1) + 2;
}
-static inline int dl_time_before(u64 a, u64 b)
-{
- return (s64)(a - b) < 0;
-}
-
static void cpudl_exchange(struct cpudl *cp, int a, int b)
{
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 1a0a6ef2fbe1..fcbdf83fed7e 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -2,6 +2,7 @@
#define _LINUX_CPUDL_H
#include <linux/sched.h>
+#include <linux/sched/deadline.h>
#define IDX_INVALID -1
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..26a54461bf59 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
*ut = p->utime;
*st = p->stime;
}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
@@ -555,48 +556,43 @@ drop_precision:
}
/*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
*
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
- cputime_t old;
-
- while (new > (old = READ_ONCE(*counter)))
- cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depends on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ * stime + utime == rtime
+ * stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
*/
static void cputime_adjust(struct task_cputime *curr,
- struct cputime *prev,
+ struct prev_cputime *prev,
cputime_t *ut, cputime_t *st)
{
cputime_t rtime, stime, utime;
+ unsigned long flags;
- /*
- * Tick based cputime accounting depend on random scheduling
- * timeslices of a task to be interrupted or not by the timer.
- * Depending on these circumstances, the number of these interrupts
- * may be over or under-optimistic, matching the real user and system
- * cputime with a variable precision.
- *
- * Fix this by scaling these tick based values against the total
- * runtime accounted by the CFS scheduler.
- */
+ /* Serialize concurrent callers such that we can honour our guarantees */
+ raw_spin_lock_irqsave(&prev->lock, flags);
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
/*
- * Update userspace visible utime/stime values only if actual execution
- * time is bigger than already exported. Note that can happen, that we
- * provided bigger values due to scaling inaccuracy on big numbers.
+ * This is possible under two circumstances:
+ * - rtime isn't monotonic after all (a bug);
+ * - we got reordered by the lock.
+ *
+ * In both cases this acts as a filter such that the rest of the code
+ * can assume it is monotonic regardless of anything else.
*/
if (prev->stime + prev->utime >= rtime)
goto out;
@@ -606,22 +602,46 @@ static void cputime_adjust(struct task_cputime *curr,
if (utime == 0) {
stime = rtime;
- } else if (stime == 0) {
- utime = rtime;
- } else {
- cputime_t total = stime + utime;
+ goto update;
+ }
- stime = scale_stime((__force u64)stime,
- (__force u64)rtime, (__force u64)total);
- utime = rtime - stime;
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
}
- cputime_advance(&prev->stime, stime);
- cputime_advance(&prev->utime, utime);
+ stime = scale_stime((__force u64)stime, (__force u64)rtime,
+ (__force u64)(stime + utime));
+
+ /*
+ * Make sure stime doesn't go backwards; this preserves monotonicity
+ * for utime because rtime is monotonic.
+ *
+ * utime_i+1 = rtime_i+1 - stime_i
+ * = rtime_i+1 - (rtime_i - utime_i)
+ * = (rtime_i+1 - rtime_i) + utime_i
+ * >= utime_i
+ */
+ if (stime < prev->stime)
+ stime = prev->stime;
+ utime = rtime - stime;
+
+ /*
+ * Make sure utime doesn't go backwards; this still preserves
+ * monotonicity for stime, analogous argument to above.
+ */
+ if (utime < prev->utime) {
+ utime = prev->utime;
+ stime = rtime - utime;
+ }
+update:
+ prev->stime = stime;
+ prev->utime = utime;
out:
*ut = prev->utime;
*st = prev->stime;
+ raw_spin_unlock_irqrestore(&prev->lock, flags);
}
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
@@ -633,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
task_cputime(p, &cputime.utime, &cputime.stime);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0a17af35670a..8b0a15e285f9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -668,8 +668,15 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* Queueing this task back might have overloaded rq, check if we need
* to kick someone away.
*/
- if (has_pushable_dl_tasks(rq))
+ if (has_pushable_dl_tasks(rq)) {
+ /*
+ * Nothing relies on rq->lock after this, so it's safe to drop
+ * rq->lock.
+ */
+ lockdep_unpin_lock(&rq->lock);
push_dl_task(rq);
+ lockdep_pin_lock(&rq->lock);
+ }
#endif
unlock:
@@ -953,7 +960,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
/*
* Use the scheduling parameters of the top pi-waiter
- * task if we have one and its (relative) deadline is
+ * task if we have one and its (absolute) deadline is
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
@@ -1066,8 +1073,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
int target = find_later_rq(p);
if (target != -1 &&
- dl_time_before(p->dl.deadline,
- cpu_rq(target)->dl.earliest_dl.curr))
+ (dl_time_before(p->dl.deadline,
+ cpu_rq(target)->dl.earliest_dl.curr) ||
+ (cpu_rq(target)->dl.dl_nr_running == 0)))
cpu = target;
}
rcu_read_unlock();
@@ -1417,7 +1425,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
later_rq = cpu_rq(cpu);
- if (!dl_time_before(task->dl.deadline,
+ if (later_rq->dl.dl_nr_running &&
+ !dl_time_before(task->dl.deadline,
later_rq->dl.earliest_dl.curr)) {
/*
* Target rq has tasks of equal or earlier deadline,
@@ -1563,7 +1572,7 @@ out:
static void push_dl_tasks(struct rq *rq)
{
- /* Terminates as it moves a -deadline task */
+ /* push_dl_task() will return true if it moved a -deadline task */
while (push_dl_task(rq))
;
}
@@ -1657,7 +1666,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- has_pushable_dl_tasks(rq) &&
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
@@ -1669,9 +1677,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
{
- struct rq *rq;
struct root_domain *src_rd;
- int weight;
+ struct rq *rq;
BUG_ON(!dl_task(p));
@@ -1697,37 +1704,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
- /*
- * Update only if the task is actually running (i.e.,
- * it is on the rq AND it is not throttled).
- */
- if (!on_dl_rq(&p->dl))
- return;
-
- weight = cpumask_weight(new_mask);
-
- /*
- * Only update if the process changes its state from whether it
- * can migrate or not.
- */
- if ((p->nr_cpus_allowed > 1) == (weight > 1))
- return;
-
- /*
- * The process used to be able to migrate OR it can now migrate
- */
- if (weight <= 1) {
- if (!task_current(rq, p))
- dequeue_pushable_dl_task(rq, p);
- BUG_ON(!rq->dl.dl_nr_migratory);
- rq->dl.dl_nr_migratory--;
- } else {
- if (!task_current(rq, p))
- enqueue_pushable_dl_task(rq, p);
- rq->dl.dl_nr_migratory++;
- }
-
- update_dl_migration(&rq->dl);
+ set_cpus_allowed_common(p, new_mask);
}
/* Assumes rq->lock is held */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4222ec50ab88..641511771ae6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
- if (!se) {
- struct sched_avg *avg = &cpu_rq(cpu)->avg;
- P(avg->runnable_avg_sum);
- P(avg->avg_period);
+ if (!se)
return;
- }
-
PN(se->exec_start);
PN(se->vruntime);
@@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
- P(se->avg.runnable_avg_sum);
- P(se->avg.running_avg_sum);
- P(se->avg.avg_period);
- P(se->avg.load_avg_contrib);
- P(se->avg.utilization_avg_contrib);
- P(se->avg.decay_count);
+ P(se->avg.load_avg);
+ P(se->avg.util_avg);
#endif
#undef PN
#undef P
@@ -214,21 +205,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
+ SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
+ cfs_rq->avg.load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
- cfs_rq->blocked_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
- cfs_rq->utilization_load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ cfs_rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
+ atomic_long_read(&cfs_rq->removed_load_avg));
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
+ atomic_long_read(&cfs_rq->removed_util_avg));
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
- cfs_rq->tg_load_contrib);
- SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
- cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
+ cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic_long_read(&cfs_rq->tg->load_avg));
- SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
- atomic_read(&cfs_rq->tg->runnable_avg));
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -636,12 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.load.weight);
#ifdef CONFIG_SMP
- P(se.avg.runnable_avg_sum);
- P(se.avg.running_avg_sum);
- P(se.avg.avg_period);
- P(se.avg.load_avg_contrib);
- P(se.avg.utilization_avg_contrib);
- P(se.avg.decay_count);
+ P(se.avg.load_sum);
+ P(se.avg.util_sum);
+ P(se.avg.load_avg);
+ P(se.avg.util_avg);
+ P(se.avg.last_update_time);
#endif
P(policy);
P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d113c3ba8bc4..824aa9f501a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update);
-
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
- /* We should have no load, but we need to update last_decay. */
- update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
*/
static u64 __sched_period(unsigned long nr_running)
{
- u64 period = sysctl_sched_latency;
- unsigned long nr_latency = sched_nr_latency;
-
- if (unlikely(nr_running > nr_latency)) {
- period = sysctl_sched_min_granularity;
- period *= nr_running;
- }
-
- return period;
+ if (unlikely(nr_running > sched_nr_latency))
+ return nr_running * sysctl_sched_min_granularity;
+ else
+ return sysctl_sched_latency;
}
/*
@@ -669,22 +659,38 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
-static inline void __update_task_entity_contrib(struct sched_entity *se);
-static inline void __update_task_entity_utilization(struct sched_entity *se);
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
+ * dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
-/* Give new task start runnable values to heavy its load in infant time */
-void init_task_runnable_average(struct task_struct *p)
+/* Give a new sched_entity initial runnable values that weight its load heavily in its infancy */
+void init_entity_runnable_average(struct sched_entity *se)
{
- u32 slice;
+ struct sched_avg *sa = &se->avg;
- slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
- p->se.avg.avg_period = slice;
- __update_task_entity_contrib(&p->se);
- __update_task_entity_utilization(&p->se);
+ sa->last_update_time = 0;
+ /*
+ * sched_avg's period_contrib should be strictly less than 1024, so
+ * we give it 1023 to make sure it is almost a period (1024us), and
+ * will definitely be updated (after enqueue).
+ */
+ sa->period_contrib = 1023;
+ sa->load_avg = scale_load_down(se->load.weight);
+ sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+ sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
-void init_task_runnable_average(struct task_struct *p)
+void init_entity_runnable_average(struct sched_entity *se)
{
}
#endif
@@ -1415,8 +1421,9 @@ static bool numa_has_capacity(struct task_numa_env *env)
* --------------------- vs ---------------------
* src->compute_capacity dst->compute_capacity
*/
- if (src->load * dst->compute_capacity >
- dst->load * src->compute_capacity)
+ if (src->load * dst->compute_capacity * env->imbalance_pct >
+
+ dst->load * src->compute_capacity * 100)
return true;
return false;
@@ -1702,8 +1709,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.avg_period;
+ delta = p->se.avg.load_sum / p->se.load.weight;
+ *period = LOAD_AVG_MAX;
}
p->last_sum_exec_runtime = runtime;
@@ -2063,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
int local = !!(flags & TNF_FAULT_LOCAL);
int priv;
- if (!numabalancing_enabled)
+ if (!static_branch_likely(&sched_numa_balancing))
return;
/* for example, ksmd faulting in a user's mm */
@@ -2151,7 +2158,7 @@ void task_numa_work(struct callback_head *work)
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
- long pages;
+ long pages, virtpages;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -2197,9 +2204,11 @@ void task_numa_work(struct callback_head *work)
start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+ virtpages = pages * 8; /* Scan up to this much virtual space */
if (!pages)
return;
+
down_read(&mm->mmap_sem);
vma = find_vma(mm, start);
if (!vma) {
@@ -2234,18 +2243,22 @@ void task_numa_work(struct callback_head *work)
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
- nr_pte_updates += change_prot_numa(vma, start, end);
+ nr_pte_updates = change_prot_numa(vma, start, end);
/*
- * Scan sysctl_numa_balancing_scan_size but ensure that
- * at least one PTE is updated so that unused virtual
- * address space is quickly skipped.
+ * Try to scan sysctl_numa_balancing_scan_size worth of
+ * hpages that have at least one present PTE that
+ * is not already pte-numa. If the VMA contains
+ * areas that are unused or already full of prot_numa
+ * PTEs, scan up to virtpages, to skip through those
+ * areas faster.
*/
if (nr_pte_updates)
pages -= (end - start) >> PAGE_SHIFT;
+ virtpages -= (end - start) >> PAGE_SHIFT;
start = end;
- if (pages <= 0)
+ if (pages <= 0 || virtpages <= 0)
goto out;
cond_resched();
@@ -2351,12 +2364,12 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
long tg_weight;
/*
- * Use this CPU's actual weight instead of the last load_contribution
- * to gain a more accurate current total weight. See
- * update_cfs_rq_load_contribution().
+ * Use this CPU's real-time load instead of the last load contribution
+ * as the updating of the contribution is delayed, and we will use the
+ * real-time load to calc the share. See update_tg_load_avg().
*/
tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_contrib;
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
tg_weight += cfs_rq->load.weight;
return tg_weight;
@@ -2429,14 +2442,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
-
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
@@ -2485,9 +2490,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
local_n %= LOAD_AVG_PERIOD;
}
- val *= runnable_avg_yN_inv[local_n];
- /* We don't use SRR here since we always want to round down. */
- return val >> 32;
+ val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+ return val;
}
/*
@@ -2518,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
}
+#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
+#error "load tracking assumes 2^10 as unit"
+#endif
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
@@ -2546,23 +2556,22 @@ static u32 __compute_runnable_contrib(u64 n)
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
- struct sched_avg *sa,
- int runnable,
- int running)
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, periods;
- u32 runnable_contrib;
- int delta_w, decayed = 0;
- unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ u64 delta, scaled_delta, periods;
+ u32 contrib;
+ unsigned int delta_w, scaled_delta_w, decayed = 0;
+ unsigned long scale_freq, scale_cpu;
- delta = now - sa->last_runnable_update;
+ delta = now - sa->last_update_time;
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
*/
if ((s64)delta < 0) {
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
return 0;
}
@@ -2573,26 +2582,35 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
delta >>= 10;
if (!delta)
return 0;
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
/* delta_w is the amount already accumulated against our next period */
- delta_w = sa->avg_period % 1024;
+ delta_w = sa->period_contrib;
if (delta + delta_w >= 1024) {
- /* period roll-over */
decayed = 1;
+ /* period_contrib starts over; how much is left for the next period we don't know yet */
+ sa->period_contrib = 0;
+
/*
* Now that we know we're crossing a period boundary, figure
* out how much from delta we need to complete the current
* period and accrue it.
*/
delta_w = 1024 - delta_w;
- if (runnable)
- sa->runnable_avg_sum += delta_w;
+ scaled_delta_w = cap_scale(delta_w, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * scaled_delta_w;
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum +=
+ weight * scaled_delta_w;
+ }
+ }
if (running)
- sa->running_avg_sum += delta_w * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += delta_w;
+ sa->util_sum += scaled_delta_w * scale_cpu;
delta -= delta_w;
@@ -2600,341 +2618,221 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
periods = delta / 1024;
delta %= 1024;
- sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
- periods + 1);
- sa->running_avg_sum = decay_load(sa->running_avg_sum,
- periods + 1);
- sa->avg_period = decay_load(sa->avg_period,
- periods + 1);
+ sa->load_sum = decay_load(sa->load_sum, periods + 1);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods + 1);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
- runnable_contrib = __compute_runnable_contrib(periods);
- if (runnable)
- sa->runnable_avg_sum += runnable_contrib;
+ contrib = __compute_runnable_contrib(periods);
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
if (running)
- sa->running_avg_sum += runnable_contrib * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += runnable_contrib;
+ sa->util_sum += contrib * scale_cpu;
}
/* Remainder of delta accrued against u_0` */
- if (runnable)
- sa->runnable_avg_sum += delta;
+ scaled_delta = cap_scale(delta, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * scaled_delta;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * scaled_delta;
+ }
if (running)
- sa->running_avg_sum += delta * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += delta;
-
- return decayed;
-}
-
-/* Synchronize an entity's decay with its parenting cfs_rq.*/
-static inline u64 __synchronize_entity_decay(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 decays = atomic64_read(&cfs_rq->decay_counter);
+ sa->util_sum += scaled_delta * scale_cpu;
- decays -= se->avg.decay_count;
- se->avg.decay_count = 0;
- if (!decays)
- return 0;
+ sa->period_contrib += delta;
- se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
- se->avg.utilization_avg_contrib =
- decay_load(se->avg.utilization_avg_contrib, decays);
+ if (decayed) {
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ }
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ }
- return decays;
+ return decayed;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long tg_contrib;
-
- tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
- tg_contrib -= cfs_rq->tg_load_contrib;
-
- if (!tg_contrib)
- return;
-
- if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
- atomic_long_add(tg_contrib, &tg->load_avg);
- cfs_rq->tg_load_contrib += tg_contrib;
- }
-}
-
/*
- * Aggregate cfs_rq runnable averages into an equivalent task_group
- * representation for computing load contributions.
+ * Updating tg's load_avg is necessary before update_cfs_shares() (which is done)
+ * and effective_load() (which is not done because it is too costly).
*/
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
- struct task_group *tg = cfs_rq->tg;
- long contrib;
-
- /* The fraction of a cpu used by this cfs_rq */
- contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->avg_period + 1);
- contrib -= cfs_rq->tg_runnable_contrib;
+ long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
- if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
- atomic_add(contrib, &tg->runnable_avg);
- cfs_rq->tg_runnable_contrib += contrib;
+ if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
-static inline void __update_group_entity_contrib(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg = cfs_rq->tg;
- int runnable_avg;
-
- u64 contrib;
-
- contrib = cfs_rq->tg_load_contrib * tg->shares;
- se->avg.load_avg_contrib = div_u64(contrib,
- atomic_long_read(&tg->load_avg) + 1);
-
- /*
- * For group entities we need to compute a correction term in the case
- * that they are consuming <1 cpu so that we would contribute the same
- * load as a task of equal weight.
- *
- * Explicitly co-ordinating this measurement would be expensive, but
- * fortunately the sum of each cpus contribution forms a usable
- * lower-bound on the true value.
- *
- * Consider the aggregate of 2 contributions. Either they are disjoint
- * (and the sum represents true value) or they are disjoint and we are
- * understating by the aggregate of their overlap.
- *
- * Extending this to N cpus, for a given overlap, the maximum amount we
- * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
- * cpus that overlap for this interval and w_i is the interval width.
- *
- * On a small machine; the first term is well-bounded which bounds the
- * total error since w_i is a subset of the period. Whereas on a
- * larger machine, while this first term can be larger, if w_i is the
- * of consequential size guaranteed to see n_i*w_i quickly converge to
- * our upper bound of 1-cpu.
- */
- runnable_avg = atomic_read(&tg->runnable_avg);
- if (runnable_avg < NICE_0_LOAD) {
- se->avg.load_avg_contrib *= runnable_avg;
- se->avg.load_avg_contrib >>= NICE_0_SHIFT;
- }
-}
-
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
-{
- __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
- runnable, runnable);
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
-}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update) {}
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq) {}
-static inline void __update_group_entity_contrib(struct sched_entity *se) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_task_entity_contrib(struct sched_entity *se)
-{
- u32 contrib;
-
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
- contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
- contrib /= (se->avg.avg_period + 1);
- se->avg.load_avg_contrib = scale_load(contrib);
-}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-/* Compute the current contribution to load_avg by se, return any delta */
-static long __update_entity_load_avg_contrib(struct sched_entity *se)
+/* Group cfs_rq's load_avg is used for task_h_load() and update_cfs_shares() */
+static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- long old_contrib = se->avg.load_avg_contrib;
+ struct sched_avg *sa = &cfs_rq->avg;
+ int decayed, removed = 0;
- if (entity_is_task(se)) {
- __update_task_entity_contrib(se);
- } else {
- __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
- __update_group_entity_contrib(se);
+ if (atomic_long_read(&cfs_rq->removed_load_avg)) {
+ long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ sa->load_avg = max_t(long, sa->load_avg - r, 0);
+ sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+ removed = 1;
}
- return se->avg.load_avg_contrib - old_contrib;
-}
+ if (atomic_long_read(&cfs_rq->removed_util_avg)) {
+ long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+ sa->util_avg = max_t(long, sa->util_avg - r, 0);
+ sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ }
+ decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
-static inline void __update_task_entity_utilization(struct sched_entity *se)
-{
- u32 contrib;
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->load_last_update_time_copy = sa->last_update_time;
+#endif
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
- contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
- contrib /= (se->avg.avg_period + 1);
- se->avg.utilization_avg_contrib = scale_load(contrib);
+ return decayed || removed;
}
-static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
- long old_contrib = se->avg.utilization_avg_contrib;
-
- if (entity_is_task(se))
- __update_task_entity_utilization(se);
- else
- se->avg.utilization_avg_contrib =
- group_cfs_rq(se)->utilization_load_avg;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int cpu = cpu_of(rq_of(cfs_rq));
- return se->avg.utilization_avg_contrib - old_contrib;
-}
+ /*
+ * Track task load average for carrying it to new CPU after migration, and
+ * track group sched_entity load average for task_h_load calc in migration
+ */
+ __update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
-static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
- long load_contrib)
-{
- if (likely(load_contrib < cfs_rq->blocked_load_avg))
- cfs_rq->blocked_load_avg -= load_contrib;
- else
- cfs_rq->blocked_load_avg = 0;
+ if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ update_tg_load_avg(cfs_rq, 0);
}
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
-/* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta, utilization_delta;
- int cpu = cpu_of(rq_of(cfs_rq));
- u64 now;
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ goto skip_aging;
/*
- * For a group entity we need to use their owned cfs_rq_clock_task() in
- * case they are the parent of a throttled hierarchy.
+ * If we got migrated (either between CPUs or between cgroups) we'll
+ * have aged the average right before clearing @last_update_time.
*/
- if (entity_is_task(se))
- now = cfs_rq_clock_task(cfs_rq);
- else
- now = cfs_rq_clock_task(group_cfs_rq(se));
+ if (se->avg.last_update_time) {
+ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, 0, 0, NULL);
- if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
- cfs_rq->curr == se))
- return;
+ /*
+ * XXX: we could have just aged the entire load away if we've been
+ * absent from the fair class for too long.
+ */
+ }
- contrib_delta = __update_entity_load_avg_contrib(se);
- utilization_delta = __update_entity_utilization_avg_contrib(se);
+skip_aging:
+ se->avg.last_update_time = cfs_rq->avg.last_update_time;
+ cfs_rq->avg.load_avg += se->avg.load_avg;
+ cfs_rq->avg.load_sum += se->avg.load_sum;
+ cfs_rq->avg.util_avg += se->avg.util_avg;
+ cfs_rq->avg.util_sum += se->avg.util_sum;
+}
- if (!update_cfs_rq)
- return;
+static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
- if (se->on_rq) {
- cfs_rq->runnable_load_avg += contrib_delta;
- cfs_rq->utilization_load_avg += utilization_delta;
- } else {
- subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
- }
+ cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+ cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+ cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+ cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
}
-/*
- * Decay the load contributed by all blocked children and account this so that
- * their contribution may appropriately discounted when they wake up.
- */
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+/* Add the load generated by se into cfs_rq's load average */
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
- u64 decays;
-
- decays = now - cfs_rq->last_decay;
- if (!decays && !force_update)
- return;
+ struct sched_avg *sa = &se->avg;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int migrated, decayed;
- if (atomic_long_read(&cfs_rq->removed_load)) {
- unsigned long removed_load;
- removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
- subtract_blocked_load_contrib(cfs_rq, removed_load);
+ migrated = !sa->last_update_time;
+ if (!migrated) {
+ __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
}
- if (decays) {
- cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
- decays);
- atomic64_add(decays, &cfs_rq->decay_counter);
- cfs_rq->last_decay = now;
- }
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
+
+ cfs_rq->runnable_load_avg += sa->load_avg;
+ cfs_rq->runnable_load_sum += sa->load_sum;
- __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+ if (migrated)
+ attach_entity_load_avg(cfs_rq, se);
+
+ if (decayed || migrated)
+ update_tg_load_avg(cfs_rq, 0);
}
-/* Add the load generated by se into cfs_rq's child load-average */
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup)
+/* Remove the runnable load generated by se from cfs_rq's runnable load average */
+static inline void
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- /*
- * We track migrations using entity decay_count <= 0, on a wake-up
- * migration we use a negative decay count to track the remote decays
- * accumulated while sleeping.
- *
- * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
- * are seen by enqueue_entity_load_avg() as a migration with an already
- * constructed load_avg_contrib.
- */
- if (unlikely(se->avg.decay_count <= 0)) {
- se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
- if (se->avg.decay_count) {
- /*
- * In a wake-up migration we have to approximate the
- * time sleeping. This is because we can't synchronize
- * clock_task between the two cpus, and it is not
- * guaranteed to be read-safe. Instead, we can
- * approximate this using our carried decays, which are
- * explicitly atomically readable.
- */
- se->avg.last_runnable_update -= (-se->avg.decay_count)
- << 20;
- update_entity_load_avg(se, 0);
- /* Indicate that we're now synchronized and on-rq */
- se->avg.decay_count = 0;
- }
- wakeup = 0;
- } else {
- __synchronize_entity_decay(se);
- }
-
- /* migrated tasks did not contribute to our blocked load */
- if (wakeup) {
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- update_entity_load_avg(se, 0);
- }
+ update_load_avg(se, 1);
- cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
- cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+ cfs_rq->runnable_load_avg =
+ max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
+ cfs_rq->runnable_load_sum =
+ max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
/*
- * Remove se's load from this cfs_rq child load-average, if the entity is
- * transitioning to a blocked state we track its projected decay using
- * blocked_load_avg.
+ * Task first catches up with cfs_rq, and then subtracts
+ * itself from the cfs_rq (task must be off the queue now).
*/
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep)
+void remove_entity_load_avg(struct sched_entity *se)
{
- update_entity_load_avg(se, 1);
- /* we force update consideration on load-balancer moves */
- update_cfs_rq_blocked_load(cfs_rq, !sleep);
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 last_update_time_copy;
+
+ do {
+ last_update_time_copy = cfs_rq->load_last_update_time_copy;
+ smp_rmb();
+ last_update_time = cfs_rq->avg.last_update_time;
+ } while (last_update_time != last_update_time_copy);
+#else
+ last_update_time = cfs_rq->avg.last_update_time;
+#endif
- cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
- cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
- if (sleep) {
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+ atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
+ atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
}
/*
@@ -2944,7 +2842,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
*/
void idle_enter_fair(struct rq *this_rq)
{
- update_rq_runnable_avg(this_rq, 1);
}
/*
@@ -2954,24 +2851,33 @@ void idle_enter_fair(struct rq *this_rq)
*/
void idle_exit_fair(struct rq *this_rq)
{
- update_rq_runnable_avg(this_rq, 0);
+}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->runnable_load_avg;
+}
+
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.load_avg;
}
static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
-static inline void update_entity_load_avg(struct sched_entity *se,
- int update_cfs_rq) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int wakeup) {}
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
- struct sched_entity *se,
- int sleep) {}
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update) {}
+static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void remove_entity_load_avg(struct sched_entity *se) {}
+
+static inline void
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline int idle_balance(struct rq *rq)
{
@@ -3103,7 +3009,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
+ enqueue_entity_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
update_cfs_shares(cfs_rq);
@@ -3178,7 +3084,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
- dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
+ dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
@@ -3268,7 +3174,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
- update_entity_load_avg(se, 1);
+ update_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -3368,7 +3274,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
- update_entity_load_avg(prev, 1);
+ update_load_avg(prev, 0);
}
cfs_rq->curr = NULL;
}
@@ -3384,8 +3290,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
/*
* Ensure that runnable average is periodically updated.
*/
- update_entity_load_avg(curr, 1);
- update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_load_avg(curr, 1);
update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
@@ -4258,14 +4163,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
+ update_load_avg(se, 1);
update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
}
- if (!se) {
- update_rq_runnable_avg(rq, rq->nr_running);
+ if (!se)
add_nr_running(rq, 1);
- }
+
hrtick_update(rq);
}
@@ -4319,14 +4223,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
+ update_load_avg(se, 1);
update_cfs_shares(cfs_rq);
- update_entity_load_avg(se, 1);
}
- if (!se) {
+ if (!se)
sub_nr_running(rq, 1);
- update_rq_runnable_avg(rq, 1);
- }
+
hrtick_update(rq);
}
@@ -4439,6 +4342,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
sched_avg_update(this_rq);
}
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+ return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+}
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
@@ -4460,7 +4369,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
static void update_idle_cpu_load(struct rq *this_rq)
{
unsigned long curr_jiffies = READ_ONCE(jiffies);
- unsigned long load = this_rq->cfs.runnable_load_avg;
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
unsigned long pending_updates;
/*
@@ -4506,7 +4415,7 @@ void update_cpu_load_nohz(void)
*/
void update_cpu_load_active(struct rq *this_rq)
{
- unsigned long load = this_rq->cfs.runnable_load_avg;
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
/*
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
*/
@@ -4514,12 +4423,6 @@ void update_cpu_load_active(struct rq *this_rq)
__update_cpu_load(this_rq, load, 1);
}
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
- return cpu_rq(cpu)->cfs.runnable_load_avg;
-}
-
/*
* Return a low guess at the load of a migration-source cpu weighted
* according to the scheduling class and "nice" value.
@@ -4567,7 +4470,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = rq->cfs.runnable_load_avg;
+ unsigned long load_avg = weighted_cpuload(cpu);
if (nr_running)
return load_avg / nr_running;
@@ -4686,7 +4589,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
/*
* w = rw_i + @wl
*/
- w = se->my_q->load.weight + wl;
+ w = cfs_rq_load_avg(se->my_q) + wl;
/*
* wl = S * s'_i; see (2)
@@ -4707,7 +4610,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
/*
* wl = dw_i = S * (s'_i - s_i); see (3)
*/
- wl -= se->load.weight;
+ wl -= se->avg.load_avg;
/*
* Recursively apply this logic to all parent groups to compute
@@ -4730,26 +4633,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
#endif
+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees. In order
+ * to determine whether we should let the load spread vs consolidating to
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other. With
+ * both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size. Waker/wakee
+ * being client/server, worker/dispatcher, interrupt source or whatever is
+ * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ */
static int wake_wide(struct task_struct *p)
{
+ unsigned int master = current->wakee_flips;
+ unsigned int slave = p->wakee_flips;
int factor = this_cpu_read(sd_llc_size);
- /*
- * Yeah, it's the switching-frequency, could means many wakee or
- * rapidly switch, use factor here will just help to automatically
- * adjust the loose-degree, so bigger node will lead to more pull.
- */
- if (p->wakee_flips > factor) {
- /*
- * wakee is somewhat hot, it needs certain amount of cpu
- * resource, so if waker is far more hot, prefer to leave
- * it alone.
- */
- if (current->wakee_flips > (factor * p->wakee_flips))
- return 1;
- }
-
- return 0;
+ if (master < slave)
+ swap(master, slave);
+ if (slave < factor || master < slave * factor)
+ return 0;
+ return 1;
}
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
@@ -4761,13 +4667,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
unsigned long weight;
int balanced;
- /*
- * If we wake multiple tasks be careful to not bounce
- * ourselves around too much.
- */
- if (wake_wide(p))
- return 0;
-
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
@@ -4781,14 +4680,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
*/
if (sync) {
tg = task_group(current);
- weight = current->se.load.weight;
+ weight = current->se.avg.load_avg;
this_load += effective_load(tg, this_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
tg = task_group(p);
- weight = p->se.load.weight;
+ weight = p->se.avg.load_avg;
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -4976,32 +4875,39 @@ next:
done:
return target;
}
+
/*
- * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
* tasks. The unit of the return value must be the one of capacity so we can
- * compare the usage with the capacity of the CPU that is available for CFS
- * task (ie cpu_capacity).
- * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
- * CPU. It represents the amount of utilization of a CPU in the range
- * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
- * capacity of the CPU because it's about the running time on this CPU.
- * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
- * because of unfortunate rounding in avg_period and running_load_avg or just
- * after migrating tasks until the average stabilizes with the new running
- * time. So we need to check that the usage stays into the range
- * [0..cpu_capacity_orig] and cap if necessary.
- * Without capping the usage, a group could be seen as overloaded (CPU0 usage
- * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it is useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
*/
-static int get_cpu_usage(int cpu)
+static int cpu_util(int cpu)
{
- unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
+ unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
unsigned long capacity = capacity_orig_of(cpu);
- if (usage >= SCHED_LOAD_SCALE)
- return capacity;
-
- return (usage * capacity) >> SCHED_LOAD_SHIFT;
+ return (util >= capacity) ? capacity : util;
}
/*
@@ -5021,17 +4927,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
- int new_cpu = cpu;
+ int new_cpu = prev_cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE)
- want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
- continue;
+ break;
/*
* If both cpu and prev_cpu are part of this domain,
@@ -5045,17 +4951,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (tmp->flags & sd_flag)
sd = tmp;
+ else if (!want_affine)
+ break;
}
- if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
- prev_cpu = cpu;
-
- if (sd_flag & SD_BALANCE_WAKE) {
- new_cpu = select_idle_sibling(p, prev_cpu);
- goto unlock;
+ if (affine_sd) {
+ sd = NULL; /* Prefer wake_affine over balance flags */
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+ new_cpu = cpu;
}
- while (sd) {
+ if (!sd) {
+ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ new_cpu = select_idle_sibling(p, new_cpu);
+
+ } else while (sd) {
struct sched_group *group;
int weight;
@@ -5089,7 +4999,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
/* while loop will break here if sd == NULL */
}
-unlock:
rcu_read_unlock();
return new_cpu;
@@ -5101,26 +5010,27 @@ unlock:
* previous cpu. However, the caller only guarantees p->pi_lock is held; no
* other assumptions, including the state of rq->lock, should be made.
*/
-static void
-migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+static void migrate_task_rq_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
-
/*
- * Load tracking: accumulate removed load so that it can be processed
- * when we next update owning cfs_rq under rq->lock. Tasks contribute
- * to blocked load iff they have a positive decay-count. It can never
- * be negative here since on-rq tasks have decay-count == 0.
+ * We are supposed to update the task to "current" time, then it's up to date
+ * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
+ * what current time is, so simply throw away the out-of-date time. This
+ * will result in the wakee task being less decayed, but giving the wakee more
+ * load sounds not bad.
*/
- if (se->avg.decay_count) {
- se->avg.decay_count = -__synchronize_entity_decay(se);
- atomic_long_add(se->avg.load_avg_contrib,
- &cfs_rq->removed_load);
- }
+ remove_entity_load_avg(&p->se);
+
+ /* Tell new CPU we are migrated */
+ p->se.avg.last_update_time = 0;
/* We have migrated, no longer consider this task hot */
- se->exec_start = 0;
+ p->se.exec_start = 0;
+}
+
+static void task_dead_fair(struct task_struct *p)
+{
+ remove_entity_load_avg(&p->se);
}
#endif /* CONFIG_SMP */
@@ -5670,72 +5580,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
#ifdef CONFIG_NUMA_BALANCING
/*
- * Returns true if the destination node is the preferred node.
- * Needs to match fbq_classify_rq(): if there is a runnable task
- * that is not on its preferred node, we should identify it.
+ * Returns 1, if task migration degrades locality
+ * Returns 0, if task migration improves locality i.e migration preferred.
+ * Returns -1, if task migration is not affected by locality.
*/
-static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
unsigned long src_faults, dst_faults;
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
- !(env->sd->flags & SD_NUMA)) {
- return false;
- }
-
- src_nid = cpu_to_node(env->src_cpu);
- dst_nid = cpu_to_node(env->dst_cpu);
-
- if (src_nid == dst_nid)
- return false;
-
- /* Encourage migration to the preferred node. */
- if (dst_nid == p->numa_preferred_nid)
- return true;
-
- /* Migrating away from the preferred node is bad. */
- if (src_nid == p->numa_preferred_nid)
- return false;
-
- if (numa_group) {
- src_faults = group_faults(p, src_nid);
- dst_faults = group_faults(p, dst_nid);
- } else {
- src_faults = task_faults(p, src_nid);
- dst_faults = task_faults(p, dst_nid);
- }
-
- return dst_faults > src_faults;
-}
-
-
-static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
-{
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
- unsigned long src_faults, dst_faults;
- int src_nid, dst_nid;
-
- if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
- return false;
+ if (!static_branch_likely(&sched_numa_balancing))
+ return -1;
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return false;
+ return -1;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
if (src_nid == dst_nid)
- return false;
+ return -1;
- /* Migrating away from the preferred node is bad. */
- if (src_nid == p->numa_preferred_nid)
- return true;
+ /* Migrating away from the preferred node is bad only while nr_running exceeds nr_preferred_running. */
+ if (src_nid == p->numa_preferred_nid) {
+ if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
+ return 1;
+ else
+ return -1;
+ }
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
- return false;
+ return 0;
if (numa_group) {
src_faults = group_faults(p, src_nid);
@@ -5749,16 +5626,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
}
#else
-static inline bool migrate_improves_locality(struct task_struct *p,
+static inline int migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
- return false;
-}
-
-static inline bool migrate_degrades_locality(struct task_struct *p,
- struct lb_env *env)
-{
- return false;
+ return -1;
}
#endif
@@ -5768,7 +5639,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
- int tsk_cache_hot = 0;
+ int tsk_cache_hot;
lockdep_assert_held(&env->src_rq->lock);
@@ -5826,13 +5697,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, env);
- if (!tsk_cache_hot)
- tsk_cache_hot = migrate_degrades_locality(p, env);
+ tsk_cache_hot = migrate_degrades_locality(p, env);
+ if (tsk_cache_hot == -1)
+ tsk_cache_hot = task_hot(p, env);
- if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
+ if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot) {
+ if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
@@ -5906,6 +5777,13 @@ static int detach_tasks(struct lb_env *env)
return 0;
while (!list_empty(tasks)) {
+ /*
+ * We don't want to steal all, otherwise we may be treated likewise,
+ * which could at worst lead to a livelock crash.
+ */
+ if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+ break;
+
p = list_first_entry(tasks, struct task_struct, se.group_node);
env->loop++;
@@ -6015,39 +5893,6 @@ static void attach_tasks(struct lb_env *env)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
- * update tg->load_weight by folding this cpu's load_avg
- */
-static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
-{
- struct sched_entity *se = tg->se[cpu];
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
-
- /* throttled entities do not contribute to load */
- if (throttled_hierarchy(cfs_rq))
- return;
-
- update_cfs_rq_blocked_load(cfs_rq, 1);
-
- if (se) {
- update_entity_load_avg(se, 1);
- /*
- * We pivot on our runnable average having decayed to zero for
- * list removal. This generally implies that all our children
- * have also been removed (modulo rounding error or bandwidth
- * control); however, such cases are rare and we can fix these
- * at enqueue.
- *
- * TODO: fix up out-of-order children on enqueue.
- */
- if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
- list_del_leaf_cfs_rq(cfs_rq);
- } else {
- struct rq *rq = rq_of(cfs_rq);
- update_rq_runnable_avg(rq, rq->nr_running);
- }
-}
-
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6056,19 +5901,19 @@ static void update_blocked_averages(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
+
/*
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
for_each_leaf_cfs_rq(rq, cfs_rq) {
- /*
- * Note: We may want to consider periodically releasing
- * rq->lock about these updates so that creating many task
- * groups does not result in continually extending hold time.
- */
- __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
- }
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ continue;
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ update_tg_load_avg(cfs_rq, 0);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6096,14 +5941,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
}
if (!se) {
- cfs_rq->h_load = cfs_rq->runnable_load_avg;
+ cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
cfs_rq->last_h_load_update = now;
}
while ((se = cfs_rq->h_load_next) != NULL) {
load = cfs_rq->h_load;
- load = div64_ul(load * se->avg.load_avg_contrib,
- cfs_rq->runnable_load_avg + 1);
+ load = div64_ul(load * se->avg.load_avg,
+ cfs_rq_load_avg(cfs_rq) + 1);
cfs_rq = group_cfs_rq(se);
cfs_rq->h_load = load;
cfs_rq->last_h_load_update = now;
@@ -6115,17 +5960,25 @@ static unsigned long task_h_load(struct task_struct *p)
struct cfs_rq *cfs_rq = task_cfs_rq(p);
update_cfs_rq_h_load(cfs_rq);
- return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
- cfs_rq->runnable_load_avg + 1);
+ return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+ cfs_rq_load_avg(cfs_rq) + 1);
}
#else
static inline void update_blocked_averages(int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ struct cfs_rq *cfs_rq = &rq->cfs;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
static unsigned long task_h_load(struct task_struct *p)
{
- return p->se.avg.load_avg_contrib;
+ return p->se.avg.load_avg;
}
#endif
@@ -6146,7 +5999,7 @@ struct sg_lb_stats {
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
- unsigned long group_usage; /* Total usage of the group */
+ unsigned long group_util; /* Total utilization of the group */
unsigned int sum_nr_running; /* Nr tasks running in the group */
unsigned int idle_cpus;
unsigned int group_weight;
@@ -6222,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
- if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
- return sd->smt_gain / sd->span_weight;
-
- return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
- return default_scale_cpu_capacity(sd, cpu);
-}
-
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -6264,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu)
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
- unsigned long capacity = SCHED_CAPACITY_SCALE;
+ unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
- if (sched_feat(ARCH_CAPACITY))
- capacity *= arch_scale_cpu_capacity(sd, cpu);
- else
- capacity *= default_scale_cpu_capacity(sd, cpu);
-
- capacity >>= SCHED_CAPACITY_SHIFT;
-
cpu_rq(cpu)->cpu_capacity_orig = capacity;
capacity *= scale_rt_capacity(cpu);
@@ -6399,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group)
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
* We consider that a group has spare capacity if the * number of task is
- * smaller than the number of CPUs or if the usage is lower than the available
- * capacity for CFS tasks.
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
* For the latter, we use a threshold to stabilize the state, to take into
* account the variance of the tasks' load and to return true if the available
* capacity in meaningful for the load balancer.
@@ -6414,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
return true;
if ((sgs->group_capacity * 100) >
- (sgs->group_usage * env->sd->imbalance_pct))
+ (sgs->group_util * env->sd->imbalance_pct))
return true;
return false;
@@ -6435,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
return false;
if ((sgs->group_capacity * 100) <
- (sgs->group_usage * env->sd->imbalance_pct))
+ (sgs->group_util * env->sd->imbalance_pct))
return true;
return false;
}
-static enum group_type group_classify(struct lb_env *env,
- struct sched_group *group,
- struct sg_lb_stats *sgs)
+static inline enum
+group_type group_classify(struct sched_group *group,
+ struct sg_lb_stats *sgs)
{
if (sgs->group_no_capacity)
return group_overloaded;
@@ -6483,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
load = source_load(i, load_idx);
sgs->group_load += load;
- sgs->group_usage += get_cpu_usage(i);
+ sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
if (rq->nr_running > 1)
@@ -6508,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
sgs->group_no_capacity = group_is_overloaded(env, sgs);
- sgs->group_type = group_classify(env, group, sgs);
+ sgs->group_type = group_classify(group, sgs);
}
/**
@@ -6642,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
group_has_capacity(env, &sds->local_stat) &&
(sgs->sum_nr_running > 1)) {
sgs->group_no_capacity = 1;
- sgs->group_type = group_overloaded;
+ sgs->group_type = group_classify(sg, sgs);
}
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -7822,8 +7655,22 @@ out:
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
- if (likely(update_next_balance))
+ if (likely(update_next_balance)) {
rq->next_balance = next_balance;
+
+#ifdef CONFIG_NO_HZ_COMMON
+ /*
+ * If this CPU has been elected to perform the nohz idle
+ * balance, other idle CPUs have already rebalanced with
+ * nohz_idle_balance() and nohz.next_balance has been
+ * updated accordingly. This CPU is now running the idle load
+ * balance for itself and we need to update the
+ * nohz.next_balance accordingly.
+ */
+ if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
+ nohz.next_balance = rq->next_balance;
+#endif
+ }
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -7836,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
int this_cpu = this_rq->cpu;
struct rq *rq;
int balance_cpu;
+ /* Earliest time when we have to do rebalance again */
+ unsigned long next_balance = jiffies + 60*HZ;
+ int update_next_balance = 0;
if (idle != CPU_IDLE ||
!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
@@ -7867,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
rebalance_domains(rq, CPU_IDLE);
}
- if (time_after(this_rq->next_balance, rq->next_balance))
- this_rq->next_balance = rq->next_balance;
+ if (time_after(next_balance, rq->next_balance)) {
+ next_balance = rq->next_balance;
+ update_next_balance = 1;
+ }
}
- nohz.next_balance = this_rq->next_balance;
+
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to null domain for ex, it will not be
+ * updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
@@ -8023,10 +7882,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
- if (numabalancing_enabled)
+ if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
-
- update_rq_runnable_avg(rq, 1);
}
/*
@@ -8101,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
check_preempt_curr(rq, p, 0);
}
-static void switched_from_fair(struct rq *rq, struct task_struct *p)
+static inline bool vruntime_normalized(struct task_struct *p)
{
struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Ensure the task's vruntime is normalized, so that when it's
- * switched back to the fair class the enqueue_entity(.flags=0) will
- * do the right thing.
+ * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
+ * the dequeue_entity(.flags=0) will already have normalized the
+ * vruntime.
+ */
+ if (p->on_rq)
+ return true;
+
+ /*
+ * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * But there are some cases where it has already been normalized:
*
- * If it's queued, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it's !queued, then only when
- * the task is sleeping will it still have non-normalized vruntime.
+ * - A forked child which is waiting for being woken up by
+ * wake_up_new_task().
+ * - A task which has been woken up by try_to_wake_up() and
+ * waiting for actually being woken up by sched_ttwu_pending().
*/
- if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
+ if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+ return true;
+
+ return false;
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!vruntime_normalized(p)) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -8124,44 +7999,50 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
se->vruntime -= cfs_rq->min_vruntime;
}
-#ifdef CONFIG_SMP
- /*
- * Remove our load from contribution when we leave sched_fair
- * and ensure we don't carry in an old decay_count if we
- * switch back.
- */
- if (se->avg.decay_count) {
- __synchronize_entity_decay(se);
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
- }
-#endif
+ /* Catch up with the cfs_rq and remove our load when we leave */
+ detach_entity_load_avg(cfs_rq, se);
}
-/*
- * We switched to the sched_fair class.
- */
-static void switched_to_fair(struct rq *rq, struct task_struct *p)
+static void attach_task_cfs_rq(struct task_struct *p)
{
-#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Since the real-depth could have been changed (only FAIR
* class maintain depth value), reset depth properly.
*/
se->depth = se->parent ? se->parent->depth + 1 : 0;
#endif
- if (!task_on_rq_queued(p))
- return;
- /*
- * We were most likely switched from sched_rt, so
- * kick off the schedule if running, otherwise just see
- * if we can still preempt the current task.
- */
- if (rq->curr == p)
- resched_curr(rq);
- else
- check_preempt_curr(rq, p, 0);
+ /* Synchronize task with its cfs_rq */
+ attach_entity_load_avg(cfs_rq, se);
+
+ if (!vruntime_normalized(p))
+ se->vruntime += cfs_rq->min_vruntime;
+}
+
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+ detach_task_cfs_rq(p);
+}
+
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
+{
+ attach_task_cfs_rq(p);
+
+ if (task_on_rq_queued(p)) {
+ /*
+ * We were most likely switched from sched_rt, so
+ * kick off the schedule if running, otherwise just see
+ * if we can still preempt the current task.
+ */
+ if (rq->curr == p)
+ resched_curr(rq);
+ else
+ check_preempt_curr(rq, p, 0);
+ }
}
/* Account for a task changing its policy or group.
@@ -8190,62 +8071,22 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
#ifdef CONFIG_SMP
- atomic64_set(&cfs_rq->decay_counter, 1);
- atomic_long_set(&cfs_rq->removed_load, 0);
+ atomic_long_set(&cfs_rq->removed_load_avg, 0);
+ atomic_long_set(&cfs_rq->removed_util_avg, 0);
#endif
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int queued)
+static void task_move_group_fair(struct task_struct *p)
{
- struct sched_entity *se = &p->se;
- struct cfs_rq *cfs_rq;
-
- /*
- * If the task was not on the rq at the time of this cgroup movement
- * it must have been asleep, sleeping tasks keep their ->vruntime
- * absolute on their old rq until wakeup (needed for the fair sleeper
- * bonus in place_entity()).
- *
- * If it was on the rq, we've just 'preempted' it, which does convert
- * ->vruntime to a relative base.
- *
- * Make sure both cases convert their relative position when migrating
- * to another cgroup's rq. This does somewhat interfere with the
- * fair sleeper stuff for the first placement, but who cares.
- */
- /*
- * When !queued, vruntime of the task has usually NOT been normalized.
- * But there are some cases where it has already been normalized:
- *
- * - Moving a forked child which is waiting for being woken up by
- * wake_up_new_task().
- * - Moving a task which has been woken up by try_to_wake_up() and
- * waiting for actually being woken up by sched_ttwu_pending().
- *
- * To prevent boost or penalty in the new cfs_rq caused by delta
- * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
- */
- if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
- queued = 1;
-
- if (!queued)
- se->vruntime -= cfs_rq_of(se)->min_vruntime;
+ detach_task_cfs_rq(p);
set_task_rq(p, task_cpu(p));
- se->depth = se->parent ? se->parent->depth + 1 : 0;
- if (!queued) {
- cfs_rq = cfs_rq_of(se);
- se->vruntime += cfs_rq->min_vruntime;
+
#ifdef CONFIG_SMP
- /*
- * migrate_task_rq_fair() will have removed our previous
- * contribution, but we must synchronize for ongoing future
- * decay.
- */
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+ /* Signal that se's cfs_rq has been changed -- migrated */
+ p->se.avg.last_update_time = 0;
#endif
- }
+ attach_task_cfs_rq(p);
}
void free_fair_sched_group(struct task_group *tg)
@@ -8257,8 +8098,11 @@ void free_fair_sched_group(struct task_group *tg)
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
- if (tg->se)
+ if (tg->se) {
+ if (tg->se[i])
+ remove_entity_load_avg(tg->se[i]);
kfree(tg->se[i]);
+ }
}
kfree(tg->cfs_rq);
@@ -8295,6 +8139,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ init_entity_runnable_average(se);
}
return 1;
@@ -8444,6 +8289,8 @@ const struct sched_class fair_sched_class = {
.rq_offline = rq_offline_fair,
.task_waking = task_waking_fair,
+ .task_dead = task_dead_fair,
+ .set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_fair,
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 91e33cd485f6..69631fa46c2f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
*/
SCHED_FEAT(WAKEUP_PREEMPTION, true)
-/*
- * Use arch dependent cpu capacity functions
- */
-SCHED_FEAT(ARCH_CAPACITY, true)
-
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
@@ -72,27 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(ATTACH_AGE_LOAD, true)
-/*
- * Apply the automatic NUMA scheduling policy. Enabled automatically
- * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
- */
-#ifdef CONFIG_NUMA_BALANCING
-SCHED_FEAT(NUMA, false)
-
-/*
- * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
- * higher number of hinting faults are recorded during active load
- * balancing.
- */
-SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
-
-/*
- * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
- * lower number of hinting faults have been recorded. As this has
- * the potential to prevent a task ever migrating to a new node
- * due to CPU overload it is disabled by default.
- */
-SCHED_FEAT(NUMA_RESIST_LOWER, false)
-#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 594275ed2620..4a2ef5a02fd3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -57,9 +57,11 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
+ stop_critical_timings();
while (!tif_need_resched() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
+ start_critical_timings();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
return 1;
@@ -83,10 +85,13 @@ void __weak arch_cpu_idle(void)
*/
void default_idle_call(void)
{
- if (current_clr_polling_and_test())
+ if (current_clr_polling_and_test()) {
local_irq_enable();
- else
+ } else {
+ stop_critical_timings();
arch_cpu_idle();
+ start_critical_timings();
+ }
}
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
@@ -141,12 +146,6 @@ static void cpuidle_idle_call(void)
}
/*
- * During the idle period, stop measuring the disabled irqs
- * critical sections latencies
- */
- stop_critical_timings();
-
- /*
* Tell the RCU framework we are entering an idle section,
* so no more rcu read side critical sections and one more
* step to the grace period
@@ -198,7 +197,6 @@ exit_idle:
local_irq_enable();
rcu_idle_exit();
- start_critical_timings();
}
DEFINE_PER_CPU(bool, cpu_dead_idle);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c65dac8c97cd..c4ae0f1fdf9b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
+ .set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0d193a243e96..e3cc16312046 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
*/
-static int do_balance_runtime(struct rt_rq *rt_rq)
+static void do_balance_runtime(struct rt_rq *rt_rq)
{
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
- int i, weight, more = 0;
+ int i, weight;
u64 rt_period;
weight = cpumask_weight(rd->span);
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
diff = rt_period - rt_rq->rt_runtime;
iter->rt_runtime -= diff;
rt_rq->rt_runtime += diff;
- more = 1;
if (rt_rq->rt_runtime == rt_period) {
raw_spin_unlock(&iter->rt_runtime_lock);
break;
@@ -683,8 +682,6 @@ next:
raw_spin_unlock(&iter->rt_runtime_lock);
}
raw_spin_unlock(&rt_b->rt_runtime_lock);
-
- return more;
}
/*
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq)
}
}
-static int balance_runtime(struct rt_rq *rt_rq)
+static void balance_runtime(struct rt_rq *rt_rq)
{
- int more = 0;
-
if (!sched_feat(RT_RUNTIME_SHARE))
- return more;
+ return;
if (rt_rq->rt_time > rt_rq->rt_runtime) {
raw_spin_unlock(&rt_rq->rt_runtime_lock);
- more = do_balance_runtime(rt_rq);
+ do_balance_runtime(rt_rq);
raw_spin_lock(&rt_rq->rt_runtime_lock);
}
-
- return more;
}
#else /* !CONFIG_SMP */
-static inline int balance_runtime(struct rt_rq *rt_rq)
-{
- return 0;
-}
+static inline void balance_runtime(struct rt_rq *rt_rq) {}
#endif /* CONFIG_SMP */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
@@ -2069,7 +2059,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- has_pushable_tasks(rq) &&
p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
(rq->curr->nr_cpus_allowed < 2 ||
@@ -2077,45 +2066,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
push_rt_tasks(rq);
}
-static void set_cpus_allowed_rt(struct task_struct *p,
- const struct cpumask *new_mask)
-{
- struct rq *rq;
- int weight;
-
- BUG_ON(!rt_task(p));
-
- if (!task_on_rq_queued(p))
- return;
-
- weight = cpumask_weight(new_mask);
-
- /*
- * Only update if the process changes its state from whether it
- * can migrate or not.
- */
- if ((p->nr_cpus_allowed > 1) == (weight > 1))
- return;
-
- rq = task_rq(p);
-
- /*
- * The process used to be able to migrate OR it can now migrate
- */
- if (weight <= 1) {
- if (!task_current(rq, p))
- dequeue_pushable_task(rq, p);
- BUG_ON(!rq->rt.rt_nr_migratory);
- rq->rt.rt_nr_migratory--;
- } else {
- if (!task_current(rq, p))
- enqueue_pushable_task(rq, p);
- rq->rt.rt_nr_migratory++;
- }
-
- update_rt_migration(&rq->rt);
-}
-
/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
@@ -2324,7 +2274,7 @@ const struct sched_class rt_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
- .set_cpus_allowed = set_cpus_allowed_rt,
+ .set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 84d48790bb6d..efd3bfc7e347 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
*/
#define RUNTIME_INF ((u64)~0ULL)
+static inline int idle_policy(int policy)
+{
+ return policy == SCHED_IDLE;
+}
static inline int fair_policy(int policy)
{
return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy)
{
return policy == SCHED_DEADLINE;
}
+static inline bool valid_policy(int policy)
+{
+ return idle_policy(policy) || fair_policy(policy) ||
+ rt_policy(policy) || dl_policy(policy);
+}
static inline int task_has_rt_policy(struct task_struct *p)
{
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy);
}
-static inline bool dl_time_before(u64 a, u64 b)
-{
- return (s64)(a - b) < 0;
-}
-
/*
* Tells if entity @a should preempt entity @b.
*/
@@ -245,7 +249,6 @@ struct task_group {
#ifdef CONFIG_SMP
atomic_long_t load_avg;
- atomic_t runnable_avg;
#endif
#endif
@@ -366,27 +369,20 @@ struct cfs_rq {
#ifdef CONFIG_SMP
/*
- * CFS Load tracking
- * Under CFS, load is tracked on a per-entity basis and aggregated up.
- * This allows for the description of both thread and group usage (in
- * the FAIR_GROUP_SCHED case).
- * runnable_load_avg is the sum of the load_avg_contrib of the
- * sched_entities on the rq.
- * blocked_load_avg is similar to runnable_load_avg except that its
- * the blocked sched_entities on the rq.
- * utilization_load_avg is the sum of the average running time of the
- * sched_entities on the rq.
+ * CFS load tracking
*/
- unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg;
- atomic64_t decay_counter;
- u64 last_decay;
- atomic_long_t removed_load;
-
+ struct sched_avg avg;
+ u64 runnable_load_sum;
+ unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- /* Required to track per-cpu representation of a task_group */
- u32 tg_runnable_contrib;
- unsigned long tg_load_contrib;
+ unsigned long tg_load_avg_contrib;
+#endif
+ atomic_long_t removed_load_avg, removed_util_avg;
+#ifndef CONFIG_64BIT
+ u64 load_last_update_time_copy;
+#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* h_load = weight * f(tg)
*
@@ -595,8 +591,6 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-
- struct sched_avg avg;
#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
@@ -1013,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
-#ifdef CONFIG_NUMA_BALANCING
-#define sched_feat_numa(x) sched_feat(x)
-#ifdef CONFIG_SCHED_DEBUG
-#define numabalancing_enabled sched_feat_numa(NUMA)
-#else
-extern bool numabalancing_enabled;
-#endif /* CONFIG_SCHED_DEBUG */
-#else
-#define sched_feat_numa(x) (0)
-#define numabalancing_enabled (0)
-#endif /* CONFIG_NUMA_BALANCING */
+extern struct static_key_false sched_numa_balancing;
static inline u64 global_rt_period(void)
{
@@ -1065,9 +1049,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
-#ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
-#endif
#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch() do { } while (0)
#endif
@@ -1091,9 +1072,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
+ *
+ * Pairs with the control dependency and rmb in try_to_wake_up().
*/
- smp_wmb();
- prev->on_cpu = 0;
+ smp_store_release(&prev->on_cpu, 0);
#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
@@ -1169,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_HEAD 2
+#define ENQUEUE_WAKEUP 0x01
+#define ENQUEUE_HEAD 0x02
#ifdef CONFIG_SMP
-#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
#else
-#define ENQUEUE_WAKING 0
+#define ENQUEUE_WAKING 0x00
#endif
-#define ENQUEUE_REPLENISH 8
+#define ENQUEUE_REPLENISH 0x08
+#define ENQUEUE_RESTORE 0x10
-#define DEQUEUE_SLEEP 1
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02
#define RETRY_TASK ((void *)-1UL)
@@ -1206,7 +1190,7 @@ struct sched_class {
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
- void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+ void (*migrate_task_rq)(struct task_struct *p);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1239,7 +1223,7 @@ struct sched_class {
void (*update_curr) (struct rq *rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p, int on_rq);
+ void (*task_move_group) (struct task_struct *p);
#endif
};
@@ -1268,6 +1252,8 @@ extern void trigger_load_balance(struct rq *rq);
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+
#else
static inline void idle_enter_fair(struct rq *rq) { }
@@ -1319,7 +1305,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
-extern void init_task_runnable_average(struct task_struct *p);
+extern void init_entity_runnable_average(struct sched_entity *se);
static inline void add_nr_running(struct rq *rq, unsigned count)
{
@@ -1415,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
}
#endif
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+ if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+ return sd->smt_gain / sd->span_weight;
+
+ return SCHED_CAPACITY_SCALE;
+}
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 79ffec45a6ac..cbc67da10954 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
+ .set_cpus_allowed = set_cpus_allowed_common,
#endif
.set_curr_task = set_curr_task_stop,
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 245df6b32b81..580ac2d4024f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*/
static u32 seccomp_run_filters(struct seccomp_data *sd)
{
- struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ struct seccomp_filter *f =
+ lockless_dereference(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
return SECCOMP_RET_KILL;
- /* Make sure cross-thread synced filter points somewhere sane. */
- smp_read_barrier_depends();
-
if (!sd) {
populate_seccomp_data(&sd_local);
sd = &sd_local;
@@ -348,6 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
+ const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -371,7 +371,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(-ENOMEM);
ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
- seccomp_check_filter);
+ seccomp_check_filter, save_orig);
if (ret < 0) {
kfree(sfilter);
return ERR_PTR(ret);
@@ -470,7 +470,7 @@ void get_seccomp_filter(struct task_struct *tsk)
static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
if (filter) {
- bpf_prog_free(filter->prog);
+ bpf_prog_destroy(filter->prog);
kfree(filter);
}
}
@@ -549,7 +549,11 @@ void secure_computing_strict(int this_syscall)
{
int mode = current->seccomp.mode;
- if (mode == 0)
+ if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+ return;
+
+ if (mode == SECCOMP_MODE_DISABLED)
return;
else if (mode == SECCOMP_MODE_STRICT)
__secure_computing_strict(this_syscall);
@@ -650,6 +654,10 @@ u32 seccomp_phase1(struct seccomp_data *sd)
int this_syscall = sd ? sd->nr :
syscall_get_nr(current, task_pt_regs(current));
+ if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+ return SECCOMP_PHASE1_OK;
+
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
@@ -860,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
/* prctl interface doesn't have flags, so they are always zero. */
return do_seccomp(op, 0, uargs);
}
+
+#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
+long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
+ void __user *data)
+{
+ struct seccomp_filter *filter;
+ struct sock_fprog_kern *fprog;
+ long ret;
+ unsigned long count = 0;
+
+ if (!capable(CAP_SYS_ADMIN) ||
+ current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+ return -EACCES;
+ }
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ filter = task->seccomp.filter;
+ while (filter) {
+ filter = filter->prev;
+ count++;
+ }
+
+ if (filter_off >= count) {
+ ret = -ENOENT;
+ goto out;
+ }
+ count -= filter_off;
+
+ filter = task->seccomp.filter;
+ while (filter && count > 1) {
+ filter = filter->prev;
+ count--;
+ }
+
+ if (WARN_ON(count != 1 || !filter)) {
+ /* The filter tree shouldn't shrink while we're using it. */
+ ret = -ENOENT;
+ goto out;
+ }
+
+ fprog = filter->prog->orig_prog;
+ if (!fprog) {
+ /* This must be a new non-cBPF filter, since we save every
+ * cBPF filter's orig_prog above when
+ * CONFIG_CHECKPOINT_RESTORE is enabled.
+ */
+ ret = -EMEDIUMTYPE;
+ goto out;
+ }
+
+ ret = fprog->len;
+ if (!data)
+ goto out;
+
+ get_seccomp_filter(task);
+ spin_unlock_irq(&task->sighand->siglock);
+
+ if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
+ ret = -EFAULT;
+
+ put_seccomp_filter(task);
+ return ret;
+
+out:
+ spin_unlock_irq(&task->sighand->siglock);
+ return ret;
+}
+#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 0f6bbbe77b46..c0b01fe24bbd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tsk->ptrace;
}
-/*
- * Notify the system that a driver wants to block all signals for this
- * process, and wants to be notified if any signals at all were to be
- * sent/acted upon. If the notifier routine returns non-zero, then the
- * signal will be acted upon after all. If the notifier routine returns 0,
- * then then signal will be blocked. Only one block per process is
- * allowed. priv is a pointer to private data that the notifier routine
- * can use to determine if the signal should be blocked or not.
- */
-void
-block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&current->sighand->siglock, flags);
- current->notifier_mask = mask;
- current->notifier_data = priv;
- current->notifier = notifier;
- spin_unlock_irqrestore(&current->sighand->siglock, flags);
-}
-
-/* Notify the system that blocking has ended. */
-
-void
-unblock_all_signals(void)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&current->sighand->siglock, flags);
- current->notifier = NULL;
- current->notifier_data = NULL;
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, flags);
-}
-
static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
{
struct sigqueue *q, *first = NULL;
@@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
{
int sig = next_signal(pending, mask);
- if (sig) {
- if (current->notifier) {
- if (sigismember(current->notifier_mask, sig)) {
- if (!(current->notifier)(current->notifier_data)) {
- clear_thread_flag(TIF_SIGPENDING);
- return 0;
- }
- }
- }
-
+ if (sig)
collect_signal(sig, pending, info);
- }
-
return sig;
}
@@ -834,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
sigset_t flush;
if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
- if (signal->flags & SIGNAL_GROUP_COREDUMP)
+ if (!(signal->flags & SIGNAL_GROUP_EXIT))
return sig == SIGKILL;
/*
* The process is in the middle of dying, nothing to do.
@@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig);
EXPORT_SYMBOL(send_sig);
EXPORT_SYMBOL(send_sig_info);
EXPORT_SYMBOL(sigprocmask);
-EXPORT_SYMBOL(block_all_signals);
-EXPORT_SYMBOL(unblock_all_signals);
-
/*
* System call entry points.
diff --git a/kernel/smp.c b/kernel/smp.c
index 07854477c164..d903c02223af 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
cpumask_var_t cpus;
int cpu, ret;
- might_sleep_if(gfp_flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(gfp_flags));
if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
preempt_disable();
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..d264f59bff56 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
preempt_enable();
- if (ht->cleanup)
+ /* cleanup must mirror setup */
+ if (ht->cleanup && td->status != HP_THREAD_NONE)
ht->cleanup(td->cpu, cpu_online(td->cpu));
kfree(td);
return 0;
@@ -221,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
{
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
- if (ht->pre_unpark)
- ht->pre_unpark(cpu);
- kthread_unpark(tsk);
+ if (!ht->selfparking)
+ kthread_unpark(tsk);
}
void smpboot_unpark_threads(unsigned int cpu)
@@ -259,15 +259,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
{
unsigned int cpu;
- /* Unpark any threads that were voluntarily parked. */
- for_each_cpu_not(cpu, ht->cpumask) {
- if (cpu_online(cpu)) {
- struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
- if (tsk)
- kthread_unpark(tsk);
- }
- }
-
/* We need to destroy also the parked threads of offline cpus */
for_each_possible_cpu(cpu) {
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +272,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
}
/**
- * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * to hotplug
* @plug_thread: Hotplug thread descriptor
+ * @cpumask: The cpumask where threads run
*
* Creates and starts the threads on all online cpus.
*/
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *cpumask)
{
unsigned int cpu;
int ret = 0;
if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
return -ENOMEM;
- cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+ cpumask_copy(plug_thread->cpumask, cpumask);
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
@@ -301,9 +295,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
+ free_cpumask_var(plug_thread->cpumask);
goto out;
}
- smpboot_unpark_thread(plug_thread, cpu);
+ if (cpumask_test_cpu(cpu, cpumask))
+ smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
@@ -311,7 +307,7 @@ out:
put_online_cpus();
return ret;
}
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
/**
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fd643d8c4b42..867bc20e1ef1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,13 +35,16 @@ struct cpu_stop_done {
/* the actual stopper, one per every possible cpu, enabled on online cpus */
struct cpu_stopper {
+ struct task_struct *thread;
+
spinlock_t lock;
bool enabled; /* is this stopper enabled? */
struct list_head works; /* list of pending works */
+
+ struct cpu_stop_work stop_work; /* for stop_cpus */
};
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
-static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
/*
@@ -70,22 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
}
}
+static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
+ struct cpu_stop_work *work)
+{
+ list_add_tail(&work->list, &stopper->works);
+ wake_up_process(stopper->thread);
+}
+
/* queue @work to @stopper. if offline, @work is completed immediately */
static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
-
unsigned long flags;
spin_lock_irqsave(&stopper->lock, flags);
-
- if (stopper->enabled) {
- list_add_tail(&work->list, &stopper->works);
- wake_up_process(p);
- } else
+ if (stopper->enabled)
+ __cpu_stop_queue_work(stopper, work);
+ else
cpu_stop_signal_done(work->done, false);
-
spin_unlock_irqrestore(&stopper->lock, flags);
}
@@ -139,7 +144,7 @@ enum multi_stop_state {
};
struct multi_stop_data {
- int (*fn)(void *);
+ cpu_stop_fn_t fn;
void *data;
/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
unsigned int num_threads;
@@ -211,6 +216,31 @@ static int multi_cpu_stop(void *data)
return err;
}
+static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
+ int cpu2, struct cpu_stop_work *work2)
+{
+ struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
+ struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+ int err;
+
+ lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+ spin_lock_irq(&stopper1->lock);
+ spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+
+ err = -ENOENT;
+ if (!stopper1->enabled || !stopper2->enabled)
+ goto unlock;
+
+ err = 0;
+ __cpu_stop_queue_work(stopper1, work1);
+ __cpu_stop_queue_work(stopper2, work2);
+unlock:
+ spin_unlock(&stopper2->lock);
+ spin_unlock_irq(&stopper1->lock);
+ lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
+ return err;
+}
/**
* stop_two_cpus - stops two cpus
* @cpu1: the cpu to stop
@@ -245,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
cpu_stop_init_done(&done, 2);
set_state(&msdata, MULTI_STOP_PREPARE);
- /*
- * If we observe both CPUs active we know _cpu_down() cannot yet have
- * queued its stop_machine works and therefore ours will get executed
- * first. Or its not either one of our CPUs that's getting unplugged,
- * in which case we don't care.
- *
- * This relies on the stopper workqueues to be FIFO.
- */
- if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
+ if (cpu1 > cpu2)
+ swap(cpu1, cpu2);
+ if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
preempt_enable();
return -ENOENT;
}
- lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
- cpu_stop_queue_work(cpu1, &work1);
- cpu_stop_queue_work(cpu2, &work2);
- lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
-
preempt_enable();
wait_for_completion(&done.completion);
@@ -293,7 +312,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
-static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
static void queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
@@ -302,22 +320,19 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
struct cpu_stop_work *work;
unsigned int cpu;
- /* initialize works and done */
- for_each_cpu(cpu, cpumask) {
- work = &per_cpu(stop_cpus_work, cpu);
- work->fn = fn;
- work->arg = arg;
- work->done = done;
- }
-
/*
* Disable preemption while queueing to avoid getting
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
lg_global_lock(&stop_cpus_lock);
- for_each_cpu(cpu, cpumask)
- cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
+ for_each_cpu(cpu, cpumask) {
+ work = &per_cpu(cpu_stopper.stop_work, cpu);
+ work->fn = fn;
+ work->arg = arg;
+ work->done = done;
+ cpu_stop_queue_work(cpu, work);
+ }
lg_global_unlock(&stop_cpus_lock);
}
@@ -454,45 +469,47 @@ repeat:
}
}
+void stop_machine_park(int cpu)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ /*
+ * Lockless. cpu_stopper_thread() will take stopper->lock and flush
+ * the pending works before it parks; until then it is fine to queue
+ * the new works.
+ */
+ stopper->enabled = false;
+ kthread_park(stopper->thread);
+}
+
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
static void cpu_stop_create(unsigned int cpu)
{
- sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu));
+ sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
}
static void cpu_stop_park(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- struct cpu_stop_work *work;
- unsigned long flags;
- /* drain remaining works */
- spin_lock_irqsave(&stopper->lock, flags);
- list_for_each_entry(work, &stopper->works, list)
- cpu_stop_signal_done(work->done, false);
- stopper->enabled = false;
- spin_unlock_irqrestore(&stopper->lock, flags);
+ WARN_ON(!list_empty(&stopper->works));
}
-static void cpu_stop_unpark(unsigned int cpu)
+void stop_machine_unpark(int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
- spin_lock_irq(&stopper->lock);
stopper->enabled = true;
- spin_unlock_irq(&stopper->lock);
+ kthread_unpark(stopper->thread);
}
static struct smp_hotplug_thread cpu_stop_threads = {
- .store = &cpu_stopper_task,
+ .store = &cpu_stopper.thread,
.thread_should_run = cpu_stop_should_run,
.thread_fn = cpu_stopper_thread,
.thread_comm = "migration/%u",
.create = cpu_stop_create,
- .setup = cpu_stop_unpark,
.park = cpu_stop_park,
- .pre_unpark = cpu_stop_unpark,
.selfparking = true,
};
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void)
}
BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
+ stop_machine_unpark(raw_smp_processor_id());
stop_machine_initialized = true;
return 0;
}
@@ -515,7 +533,7 @@ early_initcall(cpu_stop_init);
#ifdef CONFIG_STOP_MACHINE
-int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
struct multi_stop_data msdata = {
.fn = fn,
@@ -548,7 +566,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
}
-int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
+int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
int ret;
@@ -582,7 +600,7 @@ EXPORT_SYMBOL_GPL(stop_machine);
* 0 if all executions of @fn returned 0, any non zero return value if any
* returned non zero.
*/
-int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
struct multi_stop_data msdata = { .fn = fn, .data = data,
diff --git a/kernel/sys.c b/kernel/sys.c
index 259fda25eb6b..6af9212ab5aa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
goto out_unlock; /* No processes for this user */
}
do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid))
+ if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
error = set_one_prio(p, niceval, error);
} while_each_thread(g, p);
if (!uid_eq(uid, cred->uid))
@@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
goto out_unlock; /* No processes for this user */
}
do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid)) {
+ if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
@@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
* overall picture.
*/
err = -EACCES;
- if (!S_ISREG(inode->i_mode) ||
- exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+ if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
goto exit;
err = inode_permission(inode, MAY_EXEC);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5868d8..0623787ec67a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
cond_syscall(sys_ssetmask);
cond_syscall(sys_vm86old);
cond_syscall(sys_vm86);
+cond_syscall(sys_modify_ldt);
cond_syscall(sys_ipc);
cond_syscall(compat_sys_ipc);
cond_syscall(compat_sys_sysctl);
@@ -193,6 +194,7 @@ cond_syscall(sys_mlock);
cond_syscall(sys_munlock);
cond_syscall(sys_mlockall);
cond_syscall(sys_munlockall);
+cond_syscall(sys_mlock2);
cond_syscall(sys_mincore);
cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
@@ -218,6 +220,7 @@ cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
/* performance counters: */
cond_syscall(sys_perf_event_open);
@@ -243,3 +246,6 @@ cond_syscall(sys_bpf);
/* execveat */
cond_syscall(sys_execveat);
+
+/* membarrier */
+cond_syscall(sys_membarrier);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 19b62b522158..dc6858d6639e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
#include <linux/kexec.h>
+#include <linux/bpf.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -621,7 +622,7 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
{
.procname = "kexec_load_disabled",
.data = &kexec_load_disabled,
@@ -887,6 +888,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+ {
+ .procname = "hardlockup_panic",
+ .data = &hardlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_SMP
{
.procname = "softlockup_all_cpu_backtrace",
@@ -897,6 +909,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+ {
+ .procname = "hardlockup_all_cpu_backtrace",
+ .data = &sysctl_hardlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif /* CONFIG_SMP */
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -1139,6 +1160,18 @@ static struct ctl_table kern_table[] = {
.proc_handler = timer_migration_handler,
},
#endif
+#ifdef CONFIG_BPF_SYSCALL
+ {
+ .procname = "unprivileged_bpf_disabled",
+ .data = &sysctl_unprivileged_bpf_disabled,
+ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one,
+ },
+#endif
{ }
};
@@ -1995,7 +2028,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
int val = *valp;
if (val < 0) {
*negp = true;
- *lvalp = (unsigned long)-val;
+ *lvalp = -(unsigned long)val;
} else {
*negp = false;
*lvalp = (unsigned long)val;
@@ -2201,7 +2234,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
int val = *valp;
if (val < 0) {
*negp = true;
- *lvalp = (unsigned long)-val;
+ *lvalp = -(unsigned long)val;
} else {
*negp = false;
*lvalp = (unsigned long)val;
@@ -2436,7 +2469,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
@@ -2459,7 +2492,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
@@ -2484,7 +2517,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
unsigned long lval;
if (val < 0) {
*negp = true;
- lval = (unsigned long)-val;
+ lval = -(unsigned long)val;
} else {
*negp = false;
lval = (unsigned long)val;
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
deleted file mode 100644
index 3e9868d47535..000000000000
--- a/kernel/system_certificates.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <linux/export.h>
-#include <linux/init.h>
-
- __INITRODATA
-
- .align 8
- .globl VMLINUX_SYMBOL(system_certificate_list)
-VMLINUX_SYMBOL(system_certificate_list):
-__cert_list_start:
- .incbin "kernel/x509_certificate_list"
-__cert_list_end:
-
- .align 8
- .globl VMLINUX_SYMBOL(system_certificate_list_size)
-VMLINUX_SYMBOL(system_certificate_list_size):
-#ifdef CONFIG_64BIT
- .quad __cert_list_end - __cert_list_start
-#else
- .long __cert_list_end - __cert_list_start
-#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
deleted file mode 100644
index 875f64e8935b..000000000000
--- a/kernel/system_keyring.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/* System trusted keyring for trusted public keys
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <keys/asymmetric-type.h>
-#include <keys/system_keyring.h>
-#include "module-internal.h"
-
-struct key *system_trusted_keyring;
-EXPORT_SYMBOL_GPL(system_trusted_keyring);
-
-extern __initconst const u8 system_certificate_list[];
-extern __initconst const unsigned long system_certificate_list_size;
-
-/*
- * Load the compiled-in keys
- */
-static __init int system_trusted_keyring_init(void)
-{
- pr_notice("Initialise system trusted keyring\n");
-
- system_trusted_keyring =
- keyring_alloc(".system_keyring",
- KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
- if (IS_ERR(system_trusted_keyring))
- panic("Can't allocate system trusted keyring\n");
-
- set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
- return 0;
-}
-
-/*
- * Must be initialised before we try and load the keys into the keyring.
- */
-device_initcall(system_trusted_keyring_init);
-
-/*
- * Load the compiled-in list of X.509 certificates.
- */
-static __init int load_system_certificate_list(void)
-{
- key_ref_t key;
- const u8 *p, *end;
- size_t plen;
-
- pr_notice("Loading compiled-in X.509 certificates\n");
-
- p = system_certificate_list;
- end = p + system_certificate_list_size;
- while (p < end) {
- /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
- * than 256 bytes in size.
- */
- if (end - p < 4)
- goto dodgy_cert;
- if (p[0] != 0x30 &&
- p[1] != 0x82)
- goto dodgy_cert;
- plen = (p[2] << 8) | p[3];
- plen += 4;
- if (plen > end - p)
- goto dodgy_cert;
-
- key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
- "asymmetric",
- NULL,
- p,
- plen,
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ),
- KEY_ALLOC_NOT_IN_QUOTA |
- KEY_ALLOC_TRUSTED);
- if (IS_ERR(key)) {
- pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
- PTR_ERR(key));
- } else {
- set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
- pr_notice("Loaded X.509 cert '%s'\n",
- key_ref_to_ptr(key)->description);
- key_ref_put(key);
- }
- p += plen;
- }
-
- return 0;
-
-dodgy_cert:
- pr_err("Problem parsing in-kernel X.509 certificate list\n");
- return 0;
-}
-late_initcall(load_system_certificate_list);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 8727032e3a6f..53fa971d000d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* This is like the signal handler which runs in kernel mode, but it doesn't
* try to wake up the @task.
*
+ * Note: there is no ordering guarantee on works queued here.
+ *
* RETURNS:
* 0 if succeeds or -ESRCH.
*/
@@ -108,16 +110,6 @@ void task_work_run(void)
raw_spin_unlock_wait(&task->pi_lock);
smp_mb();
- /* Reverse the list to run the works in fifo order */
- head = NULL;
- do {
- next = work->next;
- work->next = head;
- head = work;
- work = next;
- } while (work);
-
- work = head;
do {
next = work->next;
work->func(work);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 579ce1b929af..4008d9f95dd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -92,12 +92,10 @@ config NO_HZ_FULL
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
# We need at least one periodic CPU for timekeeping
depends on SMP
- # RCU_USER_QS dependency
depends on HAVE_CONTEXT_TRACKING
# VIRT_CPU_ACCOUNTING_GEN dependency
depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
select NO_HZ_COMMON
- select RCU_USER_QS
select RCU_NOCB_CPU
select VIRT_CPU_ACCOUNTING_GEN
select IRQ_WORK
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 50eb107f1198..a9b76a40319e 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns);
static int __clockevents_switch_state(struct clock_event_device *dev,
enum clock_event_state state)
{
- /* Transition with legacy set_mode() callback */
- if (dev->set_mode) {
- /* Legacy callback doesn't support new modes */
- if (state > CLOCK_EVT_STATE_ONESHOT)
- return -ENOSYS;
- /*
- * 'clock_event_state' and 'clock_event_mode' have 1-to-1
- * mapping until *_ONESHOT, and so a simple cast will work.
- */
- dev->set_mode((enum clock_event_mode)state, dev);
- dev->mode = (enum clock_event_mode)state;
- return 0;
- }
-
if (dev->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
@@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev)
{
int ret = 0;
- if (dev->set_mode) {
- dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
- dev->mode = CLOCK_EVT_MODE_RESUME;
- } else if (dev->tick_resume) {
+ if (dev->tick_resume)
ret = dev->tick_resume(dev);
- }
return ret;
}
@@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
}
EXPORT_SYMBOL_GPL(clockevents_unbind_device);
-/* Sanity check of state transition callbacks */
-static int clockevents_sanity_check(struct clock_event_device *dev)
-{
- /* Legacy set_mode() callback */
- if (dev->set_mode) {
- /* We shouldn't be supporting new modes now */
- WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
- dev->set_state_shutdown || dev->tick_resume ||
- dev->set_state_oneshot_stopped);
-
- BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
- return 0;
- }
-
- if (dev->features & CLOCK_EVT_FEAT_DUMMY)
- return 0;
-
- return 0;
-}
-
/**
* clockevents_register_device - register a clock event device
* @dev: device to register
@@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev)
{
unsigned long flags;
- BUG_ON(clockevents_sanity_check(dev));
-
/* Initialize state to DETACHED */
clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 841b72f720e8..0d8fe8b8f727 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data)
continue;
/* Check the deviation from the watchdog clocksource. */
- if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
+ if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
cs->name);
pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
@@ -479,7 +479,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
* return half the number of nanoseconds the hardware counter can technically
* cover. This is done so that we can potentially detect problems caused by
* delayed timers or bad hardware, which might result in time intervals that
- * are larger then what the math used can handle without overflows.
+ * are larger than what the math used can handle without overflows.
*/
u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
@@ -595,16 +595,15 @@ static void __clocksource_select(bool skipcur)
*/
static void clocksource_select(void)
{
- return __clocksource_select(false);
+ __clocksource_select(false);
}
static void clocksource_select_fallback(void)
{
- return __clocksource_select(true);
+ __clocksource_select(true);
}
#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
-
static inline void clocksource_select(void) { }
static inline void clocksource_select_fallback(void) { }
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c7ae4b641c4..435b8850dd80 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -59,7 +59,7 @@
/*
* The timer bases:
*
- * There are more clockids then hrtimer bases. Thus, we index
+ * There are more clockids than hrtimer bases. Thus, we index
* into the timer bases by the hrtimer_base_type enum. When trying
* to reach a base using a clockid, hrtimer_clockid_to_base()
* is used to convert from clockid to the proper hrtimer_base_type.
@@ -183,7 +183,7 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
if (pinned || !base->migration_enabled)
- return this_cpu_ptr(&hrtimer_bases);
+ return base;
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
}
#else
@@ -191,23 +191,32 @@ static inline
struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
int pinned)
{
- return this_cpu_ptr(&hrtimer_bases);
+ return base;
}
#endif
/*
- * Switch the timer base to the current CPU when possible.
+ * We switch the timer base to a power-optimized selected CPU target,
+ * if:
+ * - NO_HZ_COMMON is enabled
+ * - timer migration is enabled
+ * - the timer callback is not running
+ * - the timer is not the first expiring timer on the new target
+ *
+ * If one of the above requirements is not fulfilled we move the timer
+ * to the current CPU or leave it on the previously assigned CPU if
+ * the timer callback is currently running.
*/
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
int pinned)
{
- struct hrtimer_cpu_base *new_cpu_base, *this_base;
+ struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
struct hrtimer_clock_base *new_base;
int basenum = base->index;
- this_base = this_cpu_ptr(&hrtimer_bases);
- new_cpu_base = get_target_base(this_base, pinned);
+ this_cpu_base = this_cpu_ptr(&hrtimer_bases);
+ new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
new_base = &new_cpu_base->clock_base[basenum];
@@ -229,19 +238,19 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_base &&
+ if (new_cpu_base != this_cpu_base &&
hrtimer_check_target(timer, new_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
- new_cpu_base = this_base;
+ new_cpu_base = this_cpu_base;
timer->base = base;
goto again;
}
timer->base = new_base;
} else {
- if (new_cpu_base != this_base &&
+ if (new_cpu_base != this_cpu_base &&
hrtimer_check_target(timer, new_base)) {
- new_cpu_base = this_base;
+ new_cpu_base = this_cpu_base;
goto again;
}
}
@@ -679,14 +688,14 @@ static void retrigger_next_event(void *arg)
/*
* Switch to high resolution mode
*/
-static int hrtimer_switch_to_hres(void)
+static void hrtimer_switch_to_hres(void)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (tick_init_highres()) {
printk(KERN_WARNING "Could not switch to high resolution "
"mode on CPU %d\n", base->cpu);
- return 0;
+ return;
}
base->hres_active = 1;
hrtimer_resolution = HIGH_RES_NSEC;
@@ -694,7 +703,6 @@ static int hrtimer_switch_to_hres(void)
tick_setup_sched_timer();
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
- return 1;
}
static void clock_was_set_work(struct work_struct *work)
@@ -718,7 +726,7 @@ void clock_was_set_delayed(void)
static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
static inline int hrtimer_hres_active(void) { return 0; }
static inline int hrtimer_is_hres_enabled(void) { return 0; }
-static inline int hrtimer_switch_to_hres(void) { return 0; }
+static inline void hrtimer_switch_to_hres(void) { }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_reprogram(struct hrtimer *timer,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index fb4d98c7fd43..149cc8086aea 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -99,7 +99,7 @@ static time64_t ntp_next_leap_sec = TIME64_MAX;
static int pps_valid; /* signal watchdog counter */
static long pps_tf[3]; /* phase median filter */
static long pps_jitter; /* current jitter (ns) */
-static struct timespec pps_fbase; /* beginning of the last freq interval */
+static struct timespec64 pps_fbase; /* beginning of the last freq interval */
static int pps_shift; /* current interval duration (s) (shift) */
static int pps_intcnt; /* interval counter */
static s64 pps_freq; /* frequency offset (scaled ns/s) */
@@ -487,6 +487,11 @@ out:
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
+int __weak update_persistent_clock(struct timespec now)
+{
+ return -ENODEV;
+}
+
int __weak update_persistent_clock64(struct timespec64 now64)
{
struct timespec now;
@@ -504,7 +509,7 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
static void sync_cmos_clock(struct work_struct *work)
{
struct timespec64 now;
- struct timespec next;
+ struct timespec64 next;
int fail = 1;
/*
@@ -554,7 +559,7 @@ static void sync_cmos_clock(struct work_struct *work)
next.tv_nsec -= NSEC_PER_SEC;
}
queue_delayed_work(system_power_efficient_wq,
- &sync_cmos_work, timespec_to_jiffies(&next));
+ &sync_cmos_work, timespec64_to_jiffies(&next));
}
void ntp_notify_cmos_timer(void)
@@ -768,13 +773,13 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
* pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
* while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
struct pps_normtime {
- __kernel_time_t sec; /* seconds */
+ s64 sec; /* seconds */
long nsec; /* nanoseconds */
};
/* normalize the timestamp so that nsec is in the
( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
-static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
+static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts)
{
struct pps_normtime norm = {
.sec = ts.tv_sec,
@@ -856,7 +861,7 @@ static long hardpps_update_freq(struct pps_normtime freq_norm)
pps_errcnt++;
pps_dec_freq_interval();
printk_deferred(KERN_ERR
- "hardpps: PPSERROR: interval too long - %ld s\n",
+ "hardpps: PPSERROR: interval too long - %lld s\n",
freq_norm.sec);
return 0;
}
@@ -943,7 +948,7 @@ static void hardpps_update_phase(long error)
* This code is based on David Mills's reference nanokernel
* implementation. It was mostly rewritten but keeps the same idea.
*/
-void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
struct pps_normtime pts_norm, freq_norm;
@@ -964,7 +969,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
}
/* ok, now we have a base for frequency calculation */
- freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
+ freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase));
/* check that the signal is in the range
* [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 65430504ca26..af924470eac0 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -9,5 +9,5 @@ extern ktime_t ntp_get_next_leap(void);
extern int second_overflow(unsigned long secs);
extern int ntp_validate_timex(struct timex *);
extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
-extern void __hardpps(const struct timespec *, const struct timespec *);
+extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 892e3dae0aac..f5e86d282d52 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -249,7 +249,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
* but barriers are not required because update_gt_cputime()
* can handle concurrent updates.
*/
- WRITE_ONCE(cputimer->running, 1);
+ WRITE_ONCE(cputimer->running, true);
}
sample_cputime_atomic(times, &cputimer->cputime_atomic);
}
@@ -864,6 +864,13 @@ static void check_thread_timers(struct task_struct *tsk,
unsigned long long expires;
unsigned long soft;
+ /*
+ * If cputime_expires is zero, then there are no active
+ * per thread CPU timers.
+ */
+ if (task_cputime_zero(&tsk->cputime_expires))
+ return;
+
expires = check_timers_list(timers, firing, prof_ticks(tsk));
tsk_expires->prof_exp = expires_to_cputime(expires);
@@ -911,7 +918,7 @@ static inline void stop_process_timers(struct signal_struct *sig)
struct thread_group_cputimer *cputimer = &sig->cputimer;
/* Turn off cputimer->running. This is done without locking. */
- WRITE_ONCE(cputimer->running, 0);
+ WRITE_ONCE(cputimer->running, false);
}
static u32 onecputick;
@@ -962,6 +969,19 @@ static void check_process_timers(struct task_struct *tsk,
unsigned long soft;
/*
+ * If cputimer is not running, then there are no active
+ * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
+ */
+ if (!READ_ONCE(tsk->signal->cputimer.running))
+ return;
+
+ /*
+ * Signify that a thread is checking for process timers.
+ * Write access to this field is protected by the sighand lock.
+ */
+ sig->cputimer.checking_timer = true;
+
+ /*
* Collect the current process totals.
*/
thread_group_cputimer(tsk, &cputime);
@@ -1015,6 +1035,8 @@ static void check_process_timers(struct task_struct *tsk,
sig->cputime_expires.sched_exp = sched_expires;
if (task_cputime_zero(&sig->cputime_expires))
stop_process_timers(sig);
+
+ sig->cputimer.checking_timer = false;
}
/*
@@ -1117,24 +1139,33 @@ static inline int task_cputime_expired(const struct task_cputime *sample,
static inline int fastpath_timer_check(struct task_struct *tsk)
{
struct signal_struct *sig;
- cputime_t utime, stime;
-
- task_cputime(tsk, &utime, &stime);
if (!task_cputime_zero(&tsk->cputime_expires)) {
- struct task_cputime task_sample = {
- .utime = utime,
- .stime = stime,
- .sum_exec_runtime = tsk->se.sum_exec_runtime
- };
+ struct task_cputime task_sample;
+ task_cputime(tsk, &task_sample.utime, &task_sample.stime);
+ task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
return 1;
}
sig = tsk->signal;
- /* Check if cputimer is running. This is accessed without locking. */
- if (READ_ONCE(sig->cputimer.running)) {
+ /*
+ * Check if thread group timers expired when the cputimer is
+ * running and no other thread in the group is already checking
+ * for thread group cputimers. These fields are read without the
+ * sighand lock. However, this is fine because this is meant to
+ * be a fastpath heuristic to determine whether we should try to
+ * acquire the sighand lock to check/handle timers.
+ *
+ * In the worst case scenario, if 'running' or 'checking_timer' gets
+ * set but the current thread doesn't see the change yet, we'll wait
+ * until the next thread in the group gets a scheduler interrupt to
+ * handle the timer. This isn't an issue in practice because these
+ * types of delays with signals actually getting sent are expected.
+ */
+ if (READ_ONCE(sig->cputimer.running) &&
+ !READ_ONCE(sig->cputimer.checking_timer)) {
struct task_cputime group_sample;
sample_cputime_atomic(&group_sample, &sig->cputimer.cputime_atomic);
@@ -1174,12 +1205,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
* put them on the firing list.
*/
check_thread_timers(tsk, &firing);
- /*
- * If there are any active process wide timers (POSIX 1.b, itimers,
- * RLIMIT_CPU) cputimer must be running.
- */
- if (READ_ONCE(tsk->signal->cputimer.running))
- check_process_timers(tsk, &firing);
+
+ check_process_timers(tsk, &firing);
/*
* We must release these locks before taking any timer's lock.
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 3e7db49a2381..53d7184da0be 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -18,30 +18,23 @@
static struct hrtimer bctimer;
-static void bc_set_mode(enum clock_event_mode mode,
- struct clock_event_device *bc)
+static int bc_shutdown(struct clock_event_device *evt)
{
- switch (mode) {
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- /*
- * Note, we cannot cancel the timer here as we might
- * run into the following live lock scenario:
- *
- * cpu 0 cpu1
- * lock(broadcast_lock);
- * hrtimer_interrupt()
- * bc_handler()
- * tick_handle_oneshot_broadcast();
- * lock(broadcast_lock);
- * hrtimer_cancel()
- * wait_for_callback()
- */
- hrtimer_try_to_cancel(&bctimer);
- break;
- default:
- break;
- }
+ /*
+ * Note, we cannot cancel the timer here as we might
+ * run into the following live lock scenario:
+ *
+ * cpu 0 cpu1
+ * lock(broadcast_lock);
+ * hrtimer_interrupt()
+ * bc_handler()
+ * tick_handle_oneshot_broadcast();
+ * lock(broadcast_lock);
+ * hrtimer_cancel()
+ * wait_for_callback()
+ */
+ hrtimer_try_to_cancel(&bctimer);
+ return 0;
}
/*
@@ -82,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
}
static struct clock_event_device ce_broadcast_hrtimer = {
- .set_mode = bc_set_mode,
+ .set_state_shutdown = bc_shutdown,
.set_next_ktime = bc_set_next,
.features = CLOCK_EVT_FEAT_ONESHOT |
CLOCK_EVT_FEAT_KTIME |
@@ -102,13 +95,11 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
{
ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
- switch (ce_broadcast_hrtimer.mode) {
- case CLOCK_EVT_MODE_ONESHOT:
+ if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
return HRTIMER_RESTART;
- default:
- return HRTIMER_NORESTART;
- }
+
+ return HRTIMER_NORESTART;
}
void tick_setup_hrtimer_broadcast(void)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index f8bf47571dda..4fcd99e12aa0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -304,9 +304,6 @@ void tick_check_new_device(struct clock_event_device *newdev)
int cpu;
cpu = smp_processor_id();
- if (!cpumask_test_cpu(cpu, newdev->cpumask))
- goto out_bc;
-
td = &per_cpu(tick_cpu_device, cpu);
curdev = td->evtdev;
@@ -401,7 +398,6 @@ void tick_shutdown(unsigned int cpu)
* the set mode function!
*/
clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
- dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c792429e98c6..7c7ec4515983 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -197,27 +197,9 @@ static bool can_stop_full_tick(void)
return true;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
-
-/*
- * Re-evaluate the need for the tick on the current CPU
- * and restart it if necessary.
- */
-void __tick_nohz_full_check(void)
-{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
-
- if (tick_nohz_full_cpu(smp_processor_id())) {
- if (ts->tick_stopped && !is_idle_task(current)) {
- if (!can_stop_full_tick())
- tick_nohz_restart_sched_tick(ts, ktime_get());
- }
- }
-}
-
static void nohz_full_kick_work_func(struct irq_work *work)
{
- __tick_nohz_full_check();
+ /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -252,7 +234,7 @@ void tick_nohz_full_kick_cpu(int cpu)
static void nohz_full_kick_ipi(void *info)
{
- __tick_nohz_full_check();
+ /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
/*
@@ -276,7 +258,7 @@ void tick_nohz_full_kick_all(void)
* It might need the tick due to per task/process properties:
* perf events, posix cpu timers, ...
*/
-void __tick_nohz_task_switch(struct task_struct *tsk)
+void __tick_nohz_task_switch(void)
{
unsigned long flags;
@@ -308,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str)
__setup("nohz_full=", tick_nohz_full_setup);
static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+ unsigned long action,
+ void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
/*
- * If we handle the timekeeping duty for full dynticks CPUs,
- * we can't safely shutdown that CPU.
+ * The boot CPU handles housekeeping duty (unbound timers,
+ * workqueues, timekeeping, ...) on behalf of full dynticks
+ * CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
return NOTIFY_BAD;
@@ -388,6 +371,12 @@ void __init tick_nohz_init(void)
cpu_notifier(tick_nohz_cpu_down_callback, 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
+
+ /*
+ * We need at least one CPU to handle housekeeping work such
+ * as timekeeping, unbound timers, workqueues, ...
+ */
+ WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
#endif
@@ -705,21 +694,38 @@ out:
return tick;
}
-static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
+{
+ /* Update jiffies first */
+ tick_do_update_jiffies64(now);
+ update_cpu_load_nohz();
+
+ calc_load_exit_idle();
+ touch_softlockup_watchdog();
+ /*
+ * Cancel the scheduled timer and restore the tick
+ */
+ ts->tick_stopped = 0;
+ ts->idle_exittime = now;
+
+ tick_nohz_restart(ts, now);
+}
+
+static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
#ifdef CONFIG_NO_HZ_FULL
int cpu = smp_processor_id();
- if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+ if (!tick_nohz_full_cpu(cpu))
return;
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
- if (!can_stop_full_tick())
- return;
-
- tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ if (can_stop_full_tick())
+ tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+ else if (ts->tick_stopped)
+ tick_nohz_restart_sched_tick(ts, ktime_get());
#endif
}
@@ -849,7 +855,7 @@ void tick_nohz_irq_exit(void)
if (ts->inidle)
__tick_nohz_idle_enter(ts);
else
- tick_nohz_full_stop_tick(ts);
+ tick_nohz_full_update_tick(ts);
}
/**
@@ -864,23 +870,6 @@ ktime_t tick_nohz_get_sleep_length(void)
return ts->sleep_length;
}
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
-{
- /* Update jiffies first */
- tick_do_update_jiffies64(now);
- update_cpu_load_nohz();
-
- calc_load_exit_idle();
- touch_softlockup_watchdog();
- /*
- * Cancel the scheduled timer and restore the tick
- */
- ts->tick_stopped = 0;
- ts->idle_exittime = now;
-
- tick_nohz_restart(ts, now);
-}
-
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 85d5bb1d67eb..86751c68e08d 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -268,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs);
unsigned int jiffies_to_usecs(const unsigned long j)
{
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ /*
+ * HZ usually doesn't go much further than MSEC_PER_SEC.
+ * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
+ */
+ BUILD_BUG_ON(HZ > USEC_PER_SEC);
+
+#if !(USEC_PER_SEC % HZ)
return (USEC_PER_SEC / HZ) * j;
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
@@ -287,26 +291,20 @@ EXPORT_SYMBOL(jiffies_to_usecs);
* @t: Timespec
* @gran: Granularity in ns.
*
- * Truncate a timespec to a granularity. gran must be smaller than a second.
- * Always rounds down.
- *
- * This function should be only used for timestamps returned by
- * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
- * it doesn't handle the better resolution of the latter.
+ * Truncate a timespec to a granularity. Always rounds down. gran must
+ * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
*/
struct timespec timespec_trunc(struct timespec t, unsigned gran)
{
- /*
- * Division is pretty slow so avoid it for common cases.
- * Currently current_kernel_time() never returns better than
- * jiffies resolution. Exploit that.
- */
- if (gran <= jiffies_to_usecs(1) * 1000) {
+ /* Avoid division in the common cases 1 ns and 1 s. */
+ if (gran == 1) {
/* nothing */
- } else if (gran == 1000000000) {
+ } else if (gran == NSEC_PER_SEC) {
t.tv_nsec = 0;
- } else {
+ } else if (gran > 1 && gran < NSEC_PER_SEC) {
t.tv_nsec -= t.tv_nsec % gran;
+ } else {
+ WARN(1, "illegal file time granularity: %u", gran);
}
return t;
}
@@ -546,7 +544,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
* value to a scaled second value.
*/
static unsigned long
-__timespec_to_jiffies(unsigned long sec, long nsec)
+__timespec64_to_jiffies(u64 sec, long nsec)
{
nsec = nsec + TICK_NSEC - 1;
@@ -554,22 +552,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec)
sec = MAX_SEC_IN_JIFFIES;
nsec = 0;
}
- return (((u64)sec * SEC_CONVERSION) +
+ return ((sec * SEC_CONVERSION) +
(((u64)nsec * NSEC_CONVERSION) >>
(NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
-unsigned long
-timespec_to_jiffies(const struct timespec *value)
+static unsigned long
+__timespec_to_jiffies(unsigned long sec, long nsec)
{
- return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
+ return __timespec64_to_jiffies((u64)sec, nsec);
}
-EXPORT_SYMBOL(timespec_to_jiffies);
+unsigned long
+timespec64_to_jiffies(const struct timespec64 *value)
+{
+ return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
+}
+EXPORT_SYMBOL(timespec64_to_jiffies);
void
-jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
+jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
/*
* Convert jiffies to nanoseconds and separate with
@@ -580,7 +583,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
NSEC_PER_SEC, &rem);
value->tv_nsec = rem;
}
-EXPORT_SYMBOL(jiffies_to_timespec);
+EXPORT_SYMBOL(jiffies_to_timespec64);
/*
* We could use a similar algorithm to timespec_to_jiffies (with a
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index c7388dee8635..c48688904f9f 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -39,7 +39,7 @@ define fmuls(b,n,d) {
}
define timeconst(hz) {
- print "/* Automatically generated by kernel/timeconst.bc */\n"
+ print "/* Automatically generated by kernel/time/timeconst.bc */\n"
print "/* Time conversion constants for HZ == ", hz, " */\n"
print "\n"
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index bca3667a2de1..b1356b7ae570 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -849,7 +849,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
#ifdef CONFIG_NTP_PPS
/**
- * getnstime_raw_and_real - get day and raw monotonic time in timespec format
+ * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec64 format
* @ts_raw: pointer to the timespec to be set to raw monotonic time
* @ts_real: pointer to the timespec to be set to the time of day
*
@@ -857,7 +857,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
* same time atomically and stores the resulting timestamps in timespec
* format.
*/
-void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
+void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
@@ -868,7 +868,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts_raw = timespec64_to_timespec(tk->raw_time);
+ *ts_raw = tk->raw_time;
ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
@@ -877,10 +877,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
} while (read_seqcount_retry(&tk_core.seq, seq));
- timespec_add_ns(ts_raw, nsecs_raw);
- timespec_add_ns(ts_real, nsecs_real);
+ timespec64_add_ns(ts_raw, nsecs_raw);
+ timespec64_add_ns(ts_real, nsecs_real);
}
-EXPORT_SYMBOL(getnstime_raw_and_real);
+EXPORT_SYMBOL(ktime_get_raw_and_real_ts64);
#endif /* CONFIG_NTP_PPS */
@@ -911,6 +911,7 @@ int do_settimeofday64(const struct timespec64 *ts)
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts_delta, xt;
unsigned long flags;
+ int ret = 0;
if (!timespec64_valid_strict(ts))
return -EINVAL;
@@ -924,10 +925,15 @@ int do_settimeofday64(const struct timespec64 *ts)
ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
+ if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
tk_set_xtime(tk, ts);
-
+out:
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
@@ -936,7 +942,7 @@ int do_settimeofday64(const struct timespec64 *ts)
/* signal hrtimers about time change */
clock_was_set();
- return 0;
+ return ret;
}
EXPORT_SYMBOL(do_settimeofday64);
@@ -965,7 +971,8 @@ int timekeeping_inject_offset(struct timespec *ts)
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tk), ts64);
- if (!timespec64_valid_strict(&tmp)) {
+ if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 ||
+ !timespec64_valid_strict(&tmp)) {
ret = -EINVAL;
goto error;
}
@@ -1244,7 +1251,7 @@ void __init timekeeping_init(void)
set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
tk_set_wall_to_mono(tk, tmp);
- timekeeping_update(tk, TK_MIRROR);
+ timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
write_seqcount_end(&tk_core.seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1607,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
negative = (tick_error < 0);
/* Sort out the magnitude of the correction */
- tick_error = abs(tick_error);
+ tick_error = abs64(tick_error);
for (adj = 0; tick_error > interval; adj++)
tick_error >>= 1;
@@ -1667,7 +1674,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
/**
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
- * Helper function that accumulates a the nsecs greater then a second
+ * Helper function that accumulates the nsecs greater than a second
* from the xtime_nsec field to the xtime_secs field.
* It also calls into the NTP code to handle leapsecond processing.
*
@@ -1719,7 +1726,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
cycle_t interval = tk->cycle_interval << shift;
u64 raw_nsecs;
- /* If the offset is smaller then a shifted interval, do nothing */
+ /* If the offset is smaller than a shifted interval, do nothing */
if (offset < interval)
return offset;
@@ -1874,7 +1881,7 @@ struct timespec __current_kernel_time(void)
return timespec64_to_timespec(tk_xtime(tk));
}
-struct timespec current_kernel_time(void)
+struct timespec64 current_kernel_time64(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now;
@@ -1886,9 +1893,9 @@ struct timespec current_kernel_time(void)
now = tk_xtime(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
- return timespec64_to_timespec(now);
+ return now;
}
-EXPORT_SYMBOL(current_kernel_time);
+EXPORT_SYMBOL(current_kernel_time64);
struct timespec64 get_monotonic_coarse64(void)
{
@@ -2018,7 +2025,7 @@ int do_adjtimex(struct timex *txc)
/**
* hardpps() - Accessor function to NTP __hardpps function
*/
-void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
unsigned long flags;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 5e097fa9faf7..74591ba9474f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -461,10 +461,17 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
static void timer_stats_account_timer(struct timer_list *timer)
{
- if (likely(!timer->start_site))
+ void *site;
+
+ /*
+ * start_site can be concurrently reset by
+ * timer_stats_timer_clear_start_info()
+ */
+ site = READ_ONCE(timer->start_site);
+ if (likely(!site))
return;
- timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
+ timer_stats_update_stats(timer, timer->start_pid, site,
timer->function, timer->start_comm,
timer->flags);
}
@@ -807,8 +814,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
spin_unlock(&base->lock);
base = new_base;
spin_lock(&base->lock);
- timer->flags &= ~TIMER_BASEMASK;
- timer->flags |= base->cpu;
+ WRITE_ONCE(timer->flags,
+ (timer->flags & ~TIMER_BASEMASK) | base->cpu);
}
}
@@ -867,7 +874,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
if (mask == 0)
return expires;
- bit = find_last_bit(&mask, BITS_PER_LONG);
+ bit = __fls(mask);
mask = (1UL << bit) - 1;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a4536e1e3e2a..f75e35b60149 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -137,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
(unsigned long long) ktime_to_ns(base->offset));
#endif
SEQ_printf(m, "active timers:\n");
- print_active_timers(m, base, now);
+ print_active_timers(m, base, now + ktime_to_ns(base->offset));
}
static void print_cpu(struct seq_file *m, int cpu, u64 now)
@@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
(unsigned long long) dev->min_delta_ns);
SEQ_printf(m, " mult: %u\n", dev->mult);
SEQ_printf(m, " shift: %u\n", dev->shift);
- SEQ_printf(m, " mode: %d\n", dev->mode);
+ SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev));
SEQ_printf(m, " next_event: %Ld nsecs\n",
(unsigned long long) ktime_to_ns(dev->next_event));
@@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->set_next_event);
SEQ_printf(m, "\n");
- if (dev->set_mode) {
- SEQ_printf(m, " set_mode: ");
- print_name_offset(m, dev->set_mode);
+ if (dev->set_state_shutdown) {
+ SEQ_printf(m, " shutdown: ");
+ print_name_offset(m, dev->set_state_shutdown);
SEQ_printf(m, "\n");
- } else {
- if (dev->set_state_shutdown) {
- SEQ_printf(m, " shutdown: ");
- print_name_offset(m, dev->set_state_shutdown);
- SEQ_printf(m, "\n");
- }
+ }
- if (dev->set_state_periodic) {
- SEQ_printf(m, " periodic: ");
- print_name_offset(m, dev->set_state_periodic);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_periodic) {
+ SEQ_printf(m, " periodic: ");
+ print_name_offset(m, dev->set_state_periodic);
+ SEQ_printf(m, "\n");
+ }
- if (dev->set_state_oneshot) {
- SEQ_printf(m, " oneshot: ");
- print_name_offset(m, dev->set_state_oneshot);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_oneshot) {
+ SEQ_printf(m, " oneshot: ");
+ print_name_offset(m, dev->set_state_oneshot);
+ SEQ_printf(m, "\n");
+ }
- if (dev->set_state_oneshot_stopped) {
- SEQ_printf(m, " oneshot stopped: ");
- print_name_offset(m, dev->set_state_oneshot_stopped);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_oneshot_stopped) {
+ SEQ_printf(m, " oneshot stopped: ");
+ print_name_offset(m, dev->set_state_oneshot_stopped);
+ SEQ_printf(m, "\n");
+ }
- if (dev->tick_resume) {
- SEQ_printf(m, " resume: ");
- print_name_offset(m, dev->tick_resume);
- SEQ_printf(m, "\n");
- }
+ if (dev->tick_resume) {
+ SEQ_printf(m, " resume: ");
+ print_name_offset(m, dev->tick_resume);
+ SEQ_printf(m, "\n");
}
SEQ_printf(m, " event_handler: ");
diff --git a/kernel/torture.c b/kernel/torture.c
index 3e4840633d3e..44aa462d033f 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -523,6 +523,7 @@ static int stutter;
*/
void stutter_wait(const char *title)
{
+ cond_resched_rcu_qs();
while (READ_ONCE(stutter_pause_test) ||
(torture_runnable && !READ_ONCE(*torture_runnable))) {
if (stutter_pause_test)
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3b9a48ae153a..8d6363f42169 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -434,7 +434,7 @@ config UPROBE_EVENT
config BPF_EVENTS
depends on BPF_SYSCALL
- depends on KPROBE_EVENT
+ depends on KPROBE_EVENT || UPROBE_EVENT
bool
default y
help
@@ -635,6 +635,13 @@ config TRACE_ENUM_MAP_FILE
If unsure, say N
+config TRACING_EVENTS_GPIO
+ bool "Trace gpio events"
+ depends on GPIOLIB
+ default y
+ help
+ Enable tracing events for gpio subsystem
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3e6b39b6cf9..a990824c8604 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -103,7 +103,7 @@ record_it:
memcpy((void *) t + sizeof(*t), data, len);
if (blk_tracer)
- trace_buffer_unlock_commit(buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
}
}
@@ -278,7 +278,7 @@ record_it:
memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
if (blk_tracer) {
- trace_buffer_unlock_commit(buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
return;
}
}
@@ -437,7 +437,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct block_device *bdev,
struct blk_user_trace_setup *buts)
{
- struct blk_trace *old_bt, *bt = NULL;
+ struct blk_trace *bt = NULL;
struct dentry *dir = NULL;
int ret;
@@ -519,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->trace_state = Blktrace_setup;
ret = -EBUSY;
- old_bt = xchg(&q->blk_trace, bt);
- if (old_bt) {
- (void) xchg(&q->blk_trace, old_bt);
+ if (cmpxchg(&q->blk_trace, NULL, bt))
goto err;
- }
if (atomic_inc_return(&blk_probes_ref) == 1)
blk_register_tracepoints();
@@ -778,9 +775,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
if (likely(!bt))
return;
- if (!error && !bio_flagged(bio, BIO_UPTODATE))
- error = EIO;
-
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, what, error, 0, NULL);
}
@@ -887,8 +881,7 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
- !bio_flagged(bio, BIO_UPTODATE),
- sizeof(rpdu), &rpdu);
+ bio->bi_error, sizeof(rpdu), &rpdu);
}
}
@@ -920,8 +913,8 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio->bi_rw, BLK_TA_REMAP,
- !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+ bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
+ sizeof(r), &r);
}
/**
@@ -1347,6 +1340,7 @@ static const struct {
static enum print_line_t print_one_line(struct trace_iterator *iter,
bool classic)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
const struct blk_io_trace *t;
u16 what;
@@ -1355,7 +1349,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
t = te_blk_io_trace(iter->ent);
what = t->action & ((1 << BLK_TC_SHIFT) - 1);
- long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+ long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
log_action = classic ? &blk_log_action_classic : &blk_log_action;
if (t->action == BLK_TN_MESSAGE) {
@@ -1417,9 +1411,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
/* don't output context-info for blk_classic output */
if (bit == TRACE_BLK_OPT_CLASSIC) {
if (set)
- trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+ tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
else
- trace_flags |= TRACE_ITER_CONTEXT_INFO;
+ tr->trace_flags |= TRACE_ITER_CONTEXT_INFO;
}
return 0;
}
@@ -1485,7 +1479,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
static int blk_trace_setup_queue(struct request_queue *q,
struct block_device *bdev)
{
- struct blk_trace *old_bt, *bt = NULL;
+ struct blk_trace *bt = NULL;
int ret = -ENOMEM;
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
@@ -1501,12 +1495,9 @@ static int blk_trace_setup_queue(struct request_queue *q,
blk_trace_setup_lba(bt, bdev);
- old_bt = xchg(&q->blk_trace, bt);
- if (old_bt != NULL) {
- (void)xchg(&q->blk_trace, old_bt);
- ret = -EBUSY;
+ ret = -EBUSY;
+ if (cmpxchg(&q->blk_trace, NULL, bt))
goto free_bt;
- }
if (atomic_inc_return(&blk_probes_ref) == 1)
blk_register_tracepoints();
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041adee90..4228fd3682c3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
/*
* limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
*/
static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
{
char *fmt = (char *) (long) r1;
+ bool str_seen = false;
int mod[3] = {};
int fmt_cnt = 0;
+ u64 unsafe_addr;
+ char buf[64];
int i;
/*
@@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
if (fmt[i] == 'l') {
mod[fmt_cnt]++;
i++;
- } else if (fmt[i] == 'p') {
+ } else if (fmt[i] == 'p' || fmt[i] == 's') {
mod[fmt_cnt]++;
i++;
if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
return -EINVAL;
fmt_cnt++;
+ if (fmt[i - 1] == 's') {
+ if (str_seen)
+ /* allow only one '%s' per fmt string */
+ return -EINVAL;
+ str_seen = true;
+
+ switch (fmt_cnt) {
+ case 1:
+ unsafe_addr = r3;
+ r3 = (long) buf;
+ break;
+ case 2:
+ unsafe_addr = r4;
+ r4 = (long) buf;
+ break;
+ case 3:
+ unsafe_addr = r5;
+ r5 = (long) buf;
+ break;
+ }
+ buf[0] = 0;
+ strncpy_from_unsafe(buf,
+ (void *) (long) unsafe_addr,
+ sizeof(buf));
+ }
continue;
}
@@ -158,6 +186,84 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
return &bpf_trace_printk_proto;
}
+static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_event *event;
+
+ if (unlikely(index >= array->map.max_entries))
+ return -E2BIG;
+
+ event = (struct perf_event *)array->ptrs[index];
+ if (!event)
+ return -ENOENT;
+
+ /* make sure event is local and doesn't have pmu::count */
+ if (event->oncpu != smp_processor_id() ||
+ event->pmu->count)
+ return -EINVAL;
+
+ /*
+ * The return value does not tell us whether the read was
+ * successful. That has to be judged elsewhere, for example
+ * in the eBPF program itself.
+ */
+ return perf_event_read_local(event);
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_proto = {
+ .func = bpf_perf_event_read,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+ struct pt_regs *regs = (struct pt_regs *) (long) r1;
+ struct bpf_map *map = (struct bpf_map *) (long) r2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *data = (void *) (long) r4;
+ struct perf_sample_data sample_data;
+ struct perf_event *event;
+ struct perf_raw_record raw = {
+ .size = size,
+ .data = data,
+ };
+
+ if (unlikely(index >= array->map.max_entries))
+ return -E2BIG;
+
+ event = (struct perf_event *)array->ptrs[index];
+ if (unlikely(!event))
+ return -ENOENT;
+
+ if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
+ event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
+ return -EINVAL;
+
+ if (unlikely(event->oncpu != smp_processor_id()))
+ return -EOPNOTSUPP;
+
+ perf_sample_data_init(&sample_data, 0, 0);
+ sample_data.raw = &raw;
+ perf_event_output(event, &sample_data, regs);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto = {
+ .func = bpf_perf_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_STACK,
+ .arg5_type = ARG_CONST_STACK_SIZE,
+};
+
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -183,6 +289,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return bpf_get_trace_printk_proto();
case BPF_FUNC_get_smp_processor_id:
return &bpf_get_smp_processor_id_proto;
+ case BPF_FUNC_perf_event_read:
+ return &bpf_perf_event_read_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_perf_event_output_proto;
default:
return NULL;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eb11011b5292..3f743b147247 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -243,6 +243,11 @@ static void ftrace_sync_ipi(void *data)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static void update_function_graph_func(void);
+
+/* Both enabled by default (can be cleared by function_graph tracer flags) */
+static bool fgraph_sleep_time = true;
+static bool fgraph_graph_time = true;
+
#else
static inline void update_function_graph_func(void) { }
#endif
@@ -630,13 +635,18 @@ static int function_stat_show(struct seq_file *m, void *v)
goto out;
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ avg = rec->time;
+ do_div(avg, rec->counter);
+ if (tracing_thresh && (avg < tracing_thresh))
+ goto out;
+#endif
+
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
seq_puts(m, " ");
- avg = rec->time;
- do_div(avg, rec->counter);
/* Sample standard deviation (s^2) */
if (rec->counter <= 1)
@@ -912,7 +922,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
calltime = trace->rettime - trace->calltime;
- if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+ if (!fgraph_graph_time) {
int index;
index = trace->depth;
@@ -3415,27 +3425,35 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
inode, file);
}
-static int ftrace_match(char *str, char *regex, int len, int type)
+/* Type for quick search ftrace basic regexes (globs) from filter_parse_regex */
+struct ftrace_glob {
+ char *search;
+ unsigned len;
+ int type;
+};
+
+static int ftrace_match(char *str, struct ftrace_glob *g)
{
int matched = 0;
int slen;
- switch (type) {
+ switch (g->type) {
case MATCH_FULL:
- if (strcmp(str, regex) == 0)
+ if (strcmp(str, g->search) == 0)
matched = 1;
break;
case MATCH_FRONT_ONLY:
- if (strncmp(str, regex, len) == 0)
+ if (strncmp(str, g->search, g->len) == 0)
matched = 1;
break;
case MATCH_MIDDLE_ONLY:
- if (strstr(str, regex))
+ if (strstr(str, g->search))
matched = 1;
break;
case MATCH_END_ONLY:
slen = strlen(str);
- if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
+ if (slen >= g->len &&
+ memcmp(str + slen - g->len, g->search, g->len) == 0)
matched = 1;
break;
}
@@ -3444,13 +3462,13 @@ static int ftrace_match(char *str, char *regex, int len, int type)
}
static int
-enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
+enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
{
struct ftrace_func_entry *entry;
int ret = 0;
entry = ftrace_lookup_ip(hash, rec->ip);
- if (not) {
+ if (clear_filter) {
/* Do nothing if it doesn't exist */
if (!entry)
return 0;
@@ -3467,42 +3485,68 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
}
static int
-ftrace_match_record(struct dyn_ftrace *rec, char *mod,
- char *regex, int len, int type)
+ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
+ struct ftrace_glob *mod_g, int exclude_mod)
{
char str[KSYM_SYMBOL_LEN];
char *modname;
kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
- if (mod) {
- /* module lookup requires matching the module */
- if (!modname || strcmp(modname, mod))
+ if (mod_g) {
+ int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0;
+
+ /* blank module name to match all modules */
+ if (!mod_g->len) {
+ /* blank module globbing: modname xor exclude_mod */
+ if ((!exclude_mod) != (!modname))
+ goto func_match;
+ return 0;
+ }
+
+ /* not matching the module */
+ if (!modname || !mod_matches) {
+ if (exclude_mod)
+ goto func_match;
+ else
+ return 0;
+ }
+
+ if (mod_matches && exclude_mod)
return 0;
+func_match:
/* blank search means to match all funcs in the mod */
- if (!len)
+ if (!func_g->len)
return 1;
}
- return ftrace_match(str, regex, len, type);
+ return ftrace_match(str, func_g);
}
static int
-match_records(struct ftrace_hash *hash, char *buff,
- int len, char *mod, int not)
+match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
{
- unsigned search_len = 0;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
- int type = MATCH_FULL;
- char *search = buff;
+ struct ftrace_glob func_g = { .type = MATCH_FULL };
+ struct ftrace_glob mod_g = { .type = MATCH_FULL };
+ struct ftrace_glob *mod_match = (mod) ? &mod_g : NULL;
+ int exclude_mod = 0;
int found = 0;
int ret;
+ int clear_filter;
- if (len) {
- type = filter_parse_regex(buff, len, &search, &not);
- search_len = strlen(search);
+ if (func) {
+ func_g.type = filter_parse_regex(func, len, &func_g.search,
+ &clear_filter);
+ func_g.len = strlen(func_g.search);
+ }
+
+ if (mod) {
+ mod_g.type = filter_parse_regex(mod, strlen(mod),
+ &mod_g.search, &exclude_mod);
+ mod_g.len = strlen(mod_g.search);
}
mutex_lock(&ftrace_lock);
@@ -3511,8 +3555,8 @@ match_records(struct ftrace_hash *hash, char *buff,
goto out_unlock;
do_for_each_ftrace_rec(pg, rec) {
- if (ftrace_match_record(rec, mod, search, search_len, type)) {
- ret = enter_record(hash, rec, not);
+ if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
+ ret = enter_record(hash, rec, clear_filter);
if (ret < 0) {
found = ret;
goto out_unlock;
@@ -3529,26 +3573,9 @@ match_records(struct ftrace_hash *hash, char *buff,
static int
ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
{
- return match_records(hash, buff, len, NULL, 0);
+ return match_records(hash, buff, len, NULL);
}
-static int
-ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
-{
- int not = 0;
-
- /* blank or '*' mean the same */
- if (strcmp(buff, "*") == 0)
- buff[0] = 0;
-
- /* handle the case of 'dont filter this module' */
- if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
- buff[0] = 0;
- not = 1;
- }
-
- return match_records(hash, buff, strlen(buff), mod, not);
-}
/*
* We register the module command as a template to show others how
@@ -3557,10 +3584,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
static int
ftrace_mod_callback(struct ftrace_hash *hash,
- char *func, char *cmd, char *param, int enable)
+ char *func, char *cmd, char *module, int enable)
{
- char *mod;
- int ret = -EINVAL;
+ int ret;
/*
* cmd == 'mod' because we only registered this func
@@ -3569,21 +3595,11 @@ ftrace_mod_callback(struct ftrace_hash *hash,
* you can tell which command was used by the cmd
* parameter.
*/
-
- /* we must have a module name */
- if (!param)
- return ret;
-
- mod = strsep(&param, ":");
- if (!strlen(mod))
- return ret;
-
- ret = ftrace_match_module_records(hash, func, mod);
+ ret = match_records(hash, func, strlen(func), module);
if (!ret)
- ret = -EINVAL;
+ return -EINVAL;
if (ret < 0)
return ret;
-
return 0;
}
@@ -3694,19 +3710,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
{
struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_probe *entry;
+ struct ftrace_glob func_g;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct ftrace_hash *old_hash = *orig_hash;
struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
- int type, len, not;
+ int not;
unsigned long key;
int count = 0;
- char *search;
int ret;
- type = filter_parse_regex(glob, strlen(glob), &search, &not);
- len = strlen(search);
+ func_g.type = filter_parse_regex(glob, strlen(glob),
+ &func_g.search, &not);
+ func_g.len = strlen(func_g.search);
/* we do not support '!' for function probes */
if (WARN_ON(not))
@@ -3733,7 +3750,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
do_for_each_ftrace_rec(pg, rec) {
- if (!ftrace_match_record(rec, NULL, search, len, type))
+ if (!ftrace_match_record(rec, &func_g, NULL, 0))
continue;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -3806,24 +3823,24 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
+ struct ftrace_glob func_g;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct ftrace_hash *old_hash = *orig_hash;
struct list_head free_list;
struct ftrace_hash *hash;
struct hlist_node *tmp;
char str[KSYM_SYMBOL_LEN];
- int type = MATCH_FULL;
- int i, len = 0;
- char *search;
- int ret;
+ int i, ret;
if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
- glob = NULL;
+ func_g.search = NULL;
else if (glob) {
int not;
- type = filter_parse_regex(glob, strlen(glob), &search, &not);
- len = strlen(search);
+ func_g.type = filter_parse_regex(glob, strlen(glob),
+ &func_g.search, &not);
+ func_g.len = strlen(func_g.search);
+ func_g.search = glob;
/* we do not support '!' for function probes */
if (WARN_ON(not))
@@ -3852,10 +3869,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
continue;
/* do this last, since it is the most expensive */
- if (glob) {
+ if (func_g.search) {
kallsyms_lookup(entry->ip, NULL, NULL,
NULL, str);
- if (!ftrace_match(str, glob, len, type))
+ if (!ftrace_match(str, &func_g))
continue;
}
@@ -3884,7 +3901,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
ftrace_free_entry(entry);
}
mutex_unlock(&ftrace_lock);
-
+
out_unlock:
mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
free_ftrace_hash(hash);
@@ -4600,21 +4617,21 @@ ftrace_graph_release(struct inode *inode, struct file *file)
static int
ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
{
+ struct ftrace_glob func_g;
struct dyn_ftrace *rec;
struct ftrace_page *pg;
- int search_len;
int fail = 1;
- int type, not;
- char *search;
+ int not;
bool exists;
int i;
/* decode regex */
- type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
+ func_g.type = filter_parse_regex(buffer, strlen(buffer),
+ &func_g.search, &not);
if (!not && *idx >= size)
return -EBUSY;
- search_len = strlen(search);
+ func_g.len = strlen(func_g.search);
mutex_lock(&ftrace_lock);
@@ -4625,7 +4642,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
do_for_each_ftrace_rec(pg, rec) {
- if (ftrace_match_record(rec, NULL, search, search_len, type)) {
+ if (ftrace_match_record(rec, &func_g, NULL, 0)) {
/* if it is in the array */
exists = false;
for (i = 0; i < *idx; i++) {
@@ -4778,17 +4795,6 @@ static int ftrace_cmp_ips(const void *a, const void *b)
return 0;
}
-static void ftrace_swap_ips(void *a, void *b, int size)
-{
- unsigned long *ipa = a;
- unsigned long *ipb = b;
- unsigned long t;
-
- t = *ipa;
- *ipa = *ipb;
- *ipb = t;
-}
-
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
@@ -4808,7 +4814,7 @@ static int ftrace_process_locs(struct module *mod,
return 0;
sort(start, count, sizeof(*start),
- ftrace_cmp_ips, ftrace_swap_ips);
+ ftrace_cmp_ips, NULL);
start_pg = ftrace_allocate_pages(count);
if (!start_pg)
@@ -5634,6 +5640,16 @@ static struct ftrace_ops graph_ops = {
ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
+void ftrace_graph_sleep_time_control(bool enable)
+{
+ fgraph_sleep_time = enable;
+}
+
+void ftrace_graph_graph_time_control(bool enable)
+{
+ fgraph_graph_time = enable;
+}
+
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
return 0;
@@ -5692,7 +5708,7 @@ free:
}
static void
-ftrace_graph_probe_sched_switch(void *ignore,
+ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
struct task_struct *prev, struct task_struct *next)
{
unsigned long long timestamp;
@@ -5702,7 +5718,7 @@ ftrace_graph_probe_sched_switch(void *ignore,
* Does the user want to count the time a function was asleep.
* If so, do not update the time stamps.
*/
- if (trace_flags & TRACE_ITER_SLEEP_TIME)
+ if (fgraph_sleep_time)
return;
timestamp = trace_clock_local();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6260717c18e3..75f1d05ea82d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -400,6 +400,17 @@ struct rb_irq_work {
};
/*
+ * Structure to hold event state and handle nested events.
+ */
+struct rb_event_info {
+ u64 ts;
+ u64 delta;
+ unsigned long length;
+ struct buffer_page *tail_page;
+ int add_timestamp;
+};
+
+/*
* Used for which event context the event is in.
* NMI = 0
* IRQ = 1
@@ -818,7 +829,7 @@ rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* writer is ever on it, the previous pointer never points
* back to the reader page.
*/
-static int rb_is_reader_page(struct buffer_page *page)
+static bool rb_is_reader_page(struct buffer_page *page)
{
struct list_head *list = page->list.prev;
@@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event)
return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
}
-static inline int
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
-static void
-rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
-{
- unsigned long max_count;
-
- /*
- * We only race with interrupts and NMIs on this CPU.
- * If we own the commit event, then we can commit
- * all others that interrupted us, since the interruptions
- * are in stack format (they finish before they come
- * back to us). This allows us to do a simple loop to
- * assign the commit to the tail.
- */
- again:
- max_count = cpu_buffer->nr_pages * 100;
-
- while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
- if (RB_WARN_ON(cpu_buffer, !(--max_count)))
- return;
- if (RB_WARN_ON(cpu_buffer,
- rb_is_reader_page(cpu_buffer->tail_page)))
- return;
- local_set(&cpu_buffer->commit_page->page->commit,
- rb_page_write(cpu_buffer->commit_page));
- rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- /* add barrier to keep gcc from optimizing too much */
- barrier();
- }
- while (rb_commit_index(cpu_buffer) !=
- rb_page_write(cpu_buffer->commit_page)) {
-
- local_set(&cpu_buffer->commit_page->page->commit,
- rb_page_write(cpu_buffer->commit_page));
- RB_WARN_ON(cpu_buffer,
- local_read(&cpu_buffer->commit_page->page->commit) &
- ~RB_WRITE_MASK);
- barrier();
- }
-
- /* again, keep gcc from optimizing */
- barrier();
-
- /*
- * If an interrupt came in just after the first while loop
- * and pushed the tail page forward, we will be left with
- * a dangling commit that will never go forward.
- */
- if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
- goto again;
-}
-
static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
@@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->head = 0;
}
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
-{
- event->type_len = RINGBUF_TYPE_TIME_EXTEND;
-
- /* Not the first event on the page? */
- if (rb_event_index(event)) {
- event->time_delta = delta & TS_MASK;
- event->array[0] = delta >> TS_SHIFT;
- } else {
- /* nope, just zero it */
- event->time_delta = 0;
- event->array[0] = 0;
- }
-
- return skip_time_extend(event);
-}
-
-/**
- * rb_update_event - update event type and data
- * @event: the event to update
- * @type: the type of event
- * @length: the size of the event field in the ring buffer
- *
- * Update the type and data fields of the event. The length
- * is the actual size that is written to the ring buffer,
- * and with this, we can determine what to place into the
- * data field.
- */
-static void
-rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event, unsigned length,
- int add_timestamp, u64 delta)
-{
- /* Only a commit updates the timestamp */
- if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
- delta = 0;
-
- /*
- * If we need to add a timestamp, then we
- * add it to the start of the resevered space.
- */
- if (unlikely(add_timestamp)) {
- event = rb_add_time_stamp(event, delta);
- length -= RB_LEN_TIME_EXTEND;
- delta = 0;
- }
-
- event->time_delta = delta;
- length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
- event->type_len = 0;
- event->array[0] = length;
- } else
- event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-}
-
/*
* rb_handle_head_page - writer hit the head page
*
@@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
return 0;
}
-static unsigned rb_calculate_event_length(unsigned length)
-{
- struct ring_buffer_event event; /* Used only for sizeof array */
-
- /* zero length can cause confusions */
- if (!length)
- length++;
-
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
- length += sizeof(event.array[0]);
-
- length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ARCH_ALIGNMENT);
-
- return length;
-}
-
static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page *tail_page,
- unsigned long tail, unsigned long length)
+ unsigned long tail, struct rb_event_info *info)
{
+ struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
+ unsigned long length = info->length;
/*
* Only the event that crossed the page boundary
@@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
*/
static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long length, unsigned long tail,
- struct buffer_page *tail_page, u64 ts)
+ unsigned long tail, struct rb_event_info *info)
{
+ struct buffer_page *tail_page = info->tail_page;
struct buffer_page *commit_page = cpu_buffer->commit_page;
struct ring_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
+ u64 ts;
next_page = tail_page;
@@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
out_again:
- rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ rb_reset_tail(cpu_buffer, tail, info);
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
out_reset:
/* reset write */
- rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ rb_reset_tail(cpu_buffer, tail, info);
return NULL;
}
-static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long length, u64 ts,
- u64 delta, int add_timestamp)
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
{
- struct buffer_page *tail_page;
- struct ring_buffer_event *event;
- unsigned long tail, write;
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
- /*
- * If the time delta since the last event is too big to
- * hold in the time field of the event, then we append a
- * TIME EXTEND event ahead of the data event.
- */
- if (unlikely(add_timestamp))
- length += RB_LEN_TIME_EXTEND;
+ /* Not the first event on the page? */
+ if (rb_event_index(event)) {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ } else {
+ /* nope, just zero it */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
- tail_page = cpu_buffer->tail_page;
- write = local_add_return(length, &tail_page->write);
+ return skip_time_extend(event);
+}
- /* set write to only the index of the write */
- write &= RB_WRITE_MASK;
- tail = write - length;
+static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event);
+
+/**
+ * rb_update_event - update event type and data
+ * @cpu_buffer: the per-CPU buffer the event belongs to
+ * @event: the event to update
+ * @info: reservation state holding the length, delta and add_timestamp flag
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static void
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event,
+ struct rb_event_info *info)
+{
+ unsigned length = info->length;
+ u64 delta = info->delta;
+
+ /* Only a commit updates the timestamp */
+ if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+ delta = 0;
/*
- * If this is the first commit on the page, then it has the same
- * timestamp as the page itself.
+ * If we need to add a timestamp, then we
+ * add it to the start of the reserved space.
*/
- if (!tail)
+ if (unlikely(info->add_timestamp)) {
+ event = rb_add_time_stamp(event, delta);
+ length -= RB_LEN_TIME_EXTEND;
delta = 0;
+ }
- /* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE))
- return rb_move_tail(cpu_buffer, length, tail,
- tail_page, ts);
+ event->time_delta = delta;
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+ event->type_len = 0;
+ event->array[0] = length;
+ } else
+ event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+}
- /* We reserved something on the buffer */
+static unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
- event = __rb_page_index(tail_page, tail);
- kmemcheck_annotate_bitfield(event, bitfield);
- rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
+ /* zero length can cause confusions */
+ if (!length)
+ length++;
- local_inc(&tail_page->entries);
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
/*
- * If this is the first commit on the page, then update
- * its timestamp.
+ * In case the time delta is larger than the 27 bits for it
+ * in the header, we need to add a timestamp. If another
+ * event comes in when trying to discard this one to increase
+ * the length, then the timestamp will be added in the allocated
+ * space of this event. If length is bigger than the size needed
+ * for the TIME_EXTEND, then padding has to be used. The events
+ * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
+ * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
+ * As length is a multiple of 4, we only need to worry if it
+ * is 12 (RB_LEN_TIME_EXTEND + 4).
*/
- if (!tail)
- tail_page->page->time_stamp = ts;
+ if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
+ length += RB_ALIGNMENT;
- /* account for these added bytes */
- local_add(length, &cpu_buffer->entries_bytes);
+ return length;
+}
- return event;
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+ return true;
}
+#endif
static inline int
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
local_inc(&cpu_buffer->commits);
}
+static void
+rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long max_count;
+
+ /*
+ * We only race with interrupts and NMIs on this CPU.
+ * If we own the commit event, then we can commit
+ * all others that interrupted us, since the interruptions
+ * are in stack format (they finish before they come
+ * back to us). This allows us to do a simple loop to
+ * assign the commit to the tail.
+ */
+ again:
+ max_count = cpu_buffer->nr_pages * 100;
+
+ while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+ return;
+ if (RB_WARN_ON(cpu_buffer,
+ rb_is_reader_page(cpu_buffer->tail_page)))
+ return;
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ /* add barrier to keep gcc from optimizing too much */
+ barrier();
+ }
+ while (rb_commit_index(cpu_buffer) !=
+ rb_page_write(cpu_buffer->commit_page)) {
+
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ RB_WARN_ON(cpu_buffer,
+ local_read(&cpu_buffer->commit_page->page->commit) &
+ ~RB_WRITE_MASK);
+ barrier();
+ }
+
+ /* again, keep gcc from optimizing */
+ barrier();
+
+ /*
+ * If an interrupt came in just after the first while loop
+ * and pushed the tail page forward, we will be left with
+ * a dangling commit that will never go forward.
+ */
+ if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ goto again;
+}
+
static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
@@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
}
}
-static struct ring_buffer_event *
-rb_reserve_next_event(struct ring_buffer *buffer,
- struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long length)
+static inline void rb_event_discard(struct ring_buffer_event *event)
{
- struct ring_buffer_event *event;
- u64 ts, delta;
- int nr_loops = 0;
- int add_timestamp;
- u64 diff;
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
- rb_start_commit(cpu_buffer);
+ /* array[0] holds the actual length for the discarded event */
+ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+ event->type_len = RINGBUF_TYPE_PADDING;
+ /* time delta must be non zero */
+ if (!event->time_delta)
+ event->time_delta = 1;
+}
-#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- /*
- * Due to the ability to swap a cpu buffer from a buffer
- * it is possible it was swapped before we committed.
- * (committing stops a swap). We check for it here and
- * if it happened, we have to fail the write.
- */
- barrier();
- if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
- local_dec(&cpu_buffer->committing);
- local_dec(&cpu_buffer->commits);
- return NULL;
- }
-#endif
+static inline bool
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
- length = rb_calculate_event_length(length);
- again:
- add_timestamp = 0;
- delta = 0;
+ index = rb_event_index(event);
+ addr &= PAGE_MASK;
+
+ return cpu_buffer->commit_page->page == (void *)addr &&
+ rb_commit_index(cpu_buffer) == index;
+}
+
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
/*
- * We allow for interrupts to reenter here and do a trace.
- * If one does, it will cause this original code to loop
- * back here. Even with heavy interrupts happening, this
- * should only happen a few times in a row. If this happens
- * 1000 times in a row, there must be either an interrupt
- * storm or we have something buggy.
- * Bail!
+ * The event first in the commit queue updates the
+ * time stamp.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
- goto out_fail;
+ if (rb_event_is_commit(cpu_buffer, event)) {
+ /*
+ * A commit event that is first on a page
+ * updates the write timestamp with the page stamp
+ */
+ if (!rb_event_index(event))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->write_stamp += delta;
+ } else
+ cpu_buffer->write_stamp += event->time_delta;
+ }
+}
- ts = rb_time_stamp(cpu_buffer->buffer);
- diff = ts - cpu_buffer->write_stamp;
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ local_inc(&cpu_buffer->entries);
+ rb_update_write_stamp(cpu_buffer, event);
+ rb_end_commit(cpu_buffer);
+}
- /* make sure this diff is calculated here */
- barrier();
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+ bool pagebusy;
- /* Did the write stamp get updated already? */
- if (likely(ts >= cpu_buffer->write_stamp)) {
- delta = diff;
- if (unlikely(test_time_stamp(delta))) {
- int local_clock_stable = 1;
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- local_clock_stable = sched_clock_stable();
-#endif
- WARN_ONCE(delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
- (unsigned long long)delta,
- (unsigned long long)ts,
- (unsigned long long)cpu_buffer->write_stamp,
- local_clock_stable ? "" :
- "If you just came from a suspend/resume,\n"
- "please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n");
- add_timestamp = 1;
- }
+ if (buffer->irq_work.waiters_pending) {
+ buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies its own memory barriers */
+ irq_work_queue(&buffer->irq_work.work);
}
- event = __rb_reserve_next(cpu_buffer, length, ts,
- delta, add_timestamp);
- if (unlikely(PTR_ERR(event) == -EAGAIN))
- goto again;
-
- if (!event)
- goto out_fail;
+ if (cpu_buffer->irq_work.waiters_pending) {
+ cpu_buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies its own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
- return event;
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- out_fail:
- rb_end_commit(cpu_buffer);
- return NULL;
+ if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+ cpu_buffer->irq_work.wakeup_full = true;
+ cpu_buffer->irq_work.full_waiters_pending = false;
+ /* irq_work_queue() supplies its own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
}
/*
@@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
}
/**
+ * ring_buffer_unlock_commit - commit a reserved event
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ rb_commit(cpu_buffer, event);
+
+ rb_wakeups(buffer, cpu_buffer);
+
+ trace_recursive_unlock(cpu_buffer);
+
+ preempt_enable_notrace();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+
+static noinline void
+rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ WARN_ONCE(info->delta > (1ULL << 59),
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+ (unsigned long long)info->delta,
+ (unsigned long long)info->ts,
+ (unsigned long long)cpu_buffer->write_stamp,
+ sched_clock_stable() ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n");
+ info->add_timestamp = 1;
+}
+
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ struct ring_buffer_event *event;
+ struct buffer_page *tail_page;
+ unsigned long tail, write;
+
+ /*
+ * If the time delta since the last event is too big to
+ * hold in the time field of the event, then we append a
+ * TIME EXTEND event ahead of the data event.
+ */
+ if (unlikely(info->add_timestamp))
+ info->length += RB_LEN_TIME_EXTEND;
+
+ tail_page = info->tail_page = cpu_buffer->tail_page;
+ write = local_add_return(info->length, &tail_page->write);
+
+ /* set write to only the index of the write */
+ write &= RB_WRITE_MASK;
+ tail = write - info->length;
+
+ /*
+ * If this is the first commit on the page, then it has the same
+ * timestamp as the page itself.
+ */
+ if (!tail)
+ info->delta = 0;
+
+ /* See if we shot past the end of this buffer page */
+ if (unlikely(write > BUF_PAGE_SIZE))
+ return rb_move_tail(cpu_buffer, tail, info);
+
+ /* We reserved something on the buffer */
+
+ event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
+ rb_update_event(cpu_buffer, event, info);
+
+ local_inc(&tail_page->entries);
+
+ /*
+ * If this is the first commit on the page, then update
+ * its timestamp.
+ */
+ if (!tail)
+ tail_page->page->time_stamp = info->ts;
+
+ /* account for these added bytes */
+ local_add(info->length, &cpu_buffer->entries_bytes);
+
+ return event;
+}
+
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer *buffer,
+ struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length)
+{
+ struct ring_buffer_event *event;
+ struct rb_event_info info;
+ int nr_loops = 0;
+ u64 diff;
+
+ rb_start_commit(cpu_buffer);
+
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ /*
+ * Due to the ability to swap a cpu buffer from a buffer
+ * it is possible it was swapped before we committed.
+ * (committing stops a swap). We check for it here and
+ * if it happened, we have to fail the write.
+ */
+ barrier();
+ if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+ local_dec(&cpu_buffer->committing);
+ local_dec(&cpu_buffer->commits);
+ return NULL;
+ }
+#endif
+
+ info.length = rb_calculate_event_length(length);
+ again:
+ info.add_timestamp = 0;
+ info.delta = 0;
+
+ /*
+ * We allow for interrupts to reenter here and do a trace.
+ * If one does, it will cause this original code to loop
+ * back here. Even with heavy interrupts happening, this
+ * should only happen a few times in a row. If this happens
+ * 1000 times in a row, there must be either an interrupt
+ * storm or we have something buggy.
+ * Bail!
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
+ goto out_fail;
+
+ info.ts = rb_time_stamp(cpu_buffer->buffer);
+ diff = info.ts - cpu_buffer->write_stamp;
+
+ /* make sure this diff is calculated here */
+ barrier();
+
+ /* Did the write stamp get updated already? */
+ if (likely(info.ts >= cpu_buffer->write_stamp)) {
+ info.delta = diff;
+ if (unlikely(test_time_stamp(info.delta)))
+ rb_handle_timestamp(cpu_buffer, &info);
+ }
+
+ event = __rb_reserve_next(cpu_buffer, &info);
+
+ if (unlikely(PTR_ERR(event) == -EAGAIN))
+ goto again;
+
+ if (!event)
+ goto out_fail;
+
+ return event;
+
+ out_fail:
+ rb_end_commit(cpu_buffer);
+ return NULL;
+}
+
+/**
* ring_buffer_lock_reserve - reserve a part of the buffer
* @buffer: the ring buffer to reserve from
* @length: the length of the data to reserve (excluding event header)
@@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
}
EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
-static void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- u64 delta;
-
- /*
- * The event first in the commit queue updates the
- * time stamp.
- */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * A commit event that is first on a page
- * updates the write timestamp with the page stamp
- */
- if (!rb_event_index(event))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
- delta = event->array[0];
- delta <<= TS_SHIFT;
- delta += event->time_delta;
- cpu_buffer->write_stamp += delta;
- } else
- cpu_buffer->write_stamp += event->time_delta;
- }
-}
-
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- local_inc(&cpu_buffer->entries);
- rb_update_write_stamp(cpu_buffer, event);
- rb_end_commit(cpu_buffer);
-}
-
-static __always_inline void
-rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
-{
- bool pagebusy;
-
- if (buffer->irq_work.waiters_pending) {
- buffer->irq_work.waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&buffer->irq_work.work);
- }
-
- if (cpu_buffer->irq_work.waiters_pending) {
- cpu_buffer->irq_work.waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
- }
-
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
-
- if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
- cpu_buffer->irq_work.wakeup_full = true;
- cpu_buffer->irq_work.full_waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
- }
-}
-
-/**
- * ring_buffer_unlock_commit - commit a reserved
- * @buffer: The buffer to commit to
- * @event: The event pointer to commit.
- *
- * This commits the data to the ring buffer, and releases any locks held.
- *
- * Must be paired with ring_buffer_lock_reserve.
- */
-int ring_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- int cpu = raw_smp_processor_id();
-
- cpu_buffer = buffer->buffers[cpu];
-
- rb_commit(cpu_buffer, event);
-
- rb_wakeups(buffer, cpu_buffer);
-
- trace_recursive_unlock(cpu_buffer);
-
- preempt_enable_notrace();
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
-
-static inline void rb_event_discard(struct ring_buffer_event *event)
-{
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
- event = skip_time_extend(event);
-
- /* array[0] holds the actual length for the discarded event */
- event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
- event->type_len = RINGBUF_TYPE_PADDING;
- /* time delta must be non zero */
- if (!event->time_delta)
- event->time_delta = 1;
-}
-
/*
* Decrement the entries to the page that an event is on.
* The event does not even need to exist, only the pointer
@@ -2999,7 +3039,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
-static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = cpu_buffer->reader_page;
struct buffer_page *head = rb_set_head_page(cpu_buffer);
@@ -3007,7 +3047,7 @@ static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
/* In case of error, head will be NULL */
if (unlikely(!head))
- return 1;
+ return true;
return reader->read == rb_page_commit(reader) &&
(commit == reader ||
@@ -4227,7 +4267,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
* rind_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
-int ring_buffer_empty(struct ring_buffer *buffer)
+bool ring_buffer_empty(struct ring_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
@@ -4245,10 +4285,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
local_irq_restore(flags);
if (!ret)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -4257,7 +4297,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
* @buffer: The ring buffer
* @cpu: The CPU buffer to test
*/
-int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+bool ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
@@ -4265,7 +4305,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
int ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return 1;
+ return true;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a1503a027ee2..6df9a83e20d7 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -24,8 +24,8 @@ struct rb_page {
static int wakeup_interval = 100;
static int reader_finish;
-static struct completion read_start;
-static struct completion read_done;
+static DECLARE_COMPLETION(read_start);
+static DECLARE_COMPLETION(read_done);
static struct ring_buffer *buffer;
static struct task_struct *producer;
@@ -60,12 +60,12 @@ MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
static int read_events;
-static int kill_test;
+static int test_error;
-#define KILL_TEST() \
+#define TEST_ERROR() \
do { \
- if (!kill_test) { \
- kill_test = 1; \
+ if (!test_error) { \
+ test_error = 1; \
WARN_ON(1); \
} \
} while (0)
@@ -75,6 +75,11 @@ enum event_status {
EVENT_DROPPED,
};
+static bool break_test(void)
+{
+ return test_error || kthread_should_stop();
+}
+
static enum event_status read_event(int cpu)
{
struct ring_buffer_event *event;
@@ -87,7 +92,7 @@ static enum event_status read_event(int cpu)
entry = ring_buffer_event_data(event);
if (*entry != cpu) {
- KILL_TEST();
+ TEST_ERROR();
return EVENT_DROPPED;
}
@@ -115,10 +120,10 @@ static enum event_status read_page(int cpu)
rpage = bpage;
/* The commit may have missed event flags set, clear them */
commit = local_read(&rpage->commit) & 0xfffff;
- for (i = 0; i < commit && !kill_test; i += inc) {
+ for (i = 0; i < commit && !test_error ; i += inc) {
if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
@@ -128,7 +133,7 @@ static enum event_status read_page(int cpu)
case RINGBUF_TYPE_PADDING:
/* failed writes may be discarded events */
if (!event->time_delta)
- KILL_TEST();
+ TEST_ERROR();
inc = event->array[0] + 4;
break;
case RINGBUF_TYPE_TIME_EXTEND:
@@ -137,12 +142,12 @@ static enum event_status read_page(int cpu)
case 0:
entry = ring_buffer_event_data(event);
if (*entry != cpu) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
read++;
if (!event->array[0]) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
inc = event->array[0] + 4;
@@ -150,17 +155,17 @@ static enum event_status read_page(int cpu)
default:
entry = ring_buffer_event_data(event);
if (*entry != cpu) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
read++;
inc = ((event->type_len + 1) * 4);
}
- if (kill_test)
+ if (test_error)
break;
if (inc <= 0) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
}
@@ -178,10 +183,14 @@ static void ring_buffer_consumer(void)
read_events ^= 1;
read = 0;
- while (!reader_finish && !kill_test) {
- int found;
+ /*
+ * Continue running until the producer specifically asks to stop
+ * and is ready for the completion.
+ */
+ while (!READ_ONCE(reader_finish)) {
+ int found = 1;
- do {
+ while (found && !test_error) {
int cpu;
found = 0;
@@ -193,19 +202,25 @@ static void ring_buffer_consumer(void)
else
stat = read_page(cpu);
- if (kill_test)
+ if (test_error)
break;
+
if (stat == EVENT_FOUND)
found = 1;
+
}
- } while (found && !kill_test);
+ }
+ /* Wait till the producer wakes us up when there is more data
+ * available or when the producer wants us to finish reading.
+ */
set_current_state(TASK_INTERRUPTIBLE);
if (reader_finish)
break;
schedule();
}
+ __set_current_state(TASK_RUNNING);
reader_finish = 0;
complete(&read_done);
}
@@ -263,10 +278,7 @@ static void ring_buffer_producer(void)
if (cnt % wakeup_interval)
cond_resched();
#endif
- if (kthread_should_stop())
- kill_test = 1;
-
- } while (ktime_before(end_time, timeout) && !kill_test);
+ } while (ktime_before(end_time, timeout) && !break_test());
trace_printk("End ring buffer hammer\n");
if (consumer) {
@@ -276,8 +288,6 @@ static void ring_buffer_producer(void)
/* the completions must be visible before the finish var */
smp_wmb();
reader_finish = 1;
- /* finish var visible before waking up the consumer */
- smp_wmb();
wake_up_process(consumer);
wait_for_completion(&read_done);
}
@@ -287,7 +297,7 @@ static void ring_buffer_producer(void)
entries = ring_buffer_entries(buffer);
overruns = ring_buffer_overruns(buffer);
- if (kill_test && !kthread_should_stop())
+ if (test_error)
trace_printk("ERROR!\n");
if (!disable_reader) {
@@ -368,15 +378,14 @@ static void wait_to_die(void)
static int ring_buffer_consumer_thread(void *arg)
{
- while (!kthread_should_stop() && !kill_test) {
+ while (!break_test()) {
complete(&read_start);
ring_buffer_consumer();
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop() || kill_test)
+ if (break_test())
break;
-
schedule();
}
__set_current_state(TASK_RUNNING);
@@ -389,27 +398,27 @@ static int ring_buffer_consumer_thread(void *arg)
static int ring_buffer_producer_thread(void *arg)
{
- init_completion(&read_start);
-
- while (!kthread_should_stop() && !kill_test) {
+ while (!break_test()) {
ring_buffer_reset(buffer);
if (consumer) {
- smp_wmb();
wake_up_process(consumer);
wait_for_completion(&read_start);
}
ring_buffer_producer();
- if (kill_test)
+ if (break_test())
goto out_kill;
trace_printk("Sleeping for 10 secs\n");
set_current_state(TASK_INTERRUPTIBLE);
+ if (break_test())
+ goto out_kill;
schedule_timeout(HZ * SLEEP_TIME);
}
out_kill:
+ __set_current_state(TASK_RUNNING);
if (!kthread_should_stop())
wait_to_die();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index abcbf7ff8743..2198a630ef58 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -214,12 +214,10 @@ __setup("alloc_snapshot", boot_alloc_snapshot);
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
-static char *trace_boot_options __initdata;
static int __init set_trace_boot_options(char *str)
{
strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
- trace_boot_options = trace_boot_options_buf;
return 0;
}
__setup("trace_options=", set_trace_boot_options);
@@ -250,6 +248,19 @@ unsigned long long ns2usecs(cycle_t nsec)
return nsec;
}
+/* trace_flags holds trace_options default values */
+#define TRACE_DEFAULT_FLAGS \
+ (FUNCTION_DEFAULT_FLAGS | \
+ TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \
+ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
+ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS)
+
+/* trace_options that are only supported by global_trace */
+#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
+ TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)
+
+
/*
* The global_trace is the descriptor that holds the tracing
* buffers for the live tracing. For each CPU, it contains
@@ -262,7 +273,9 @@ unsigned long long ns2usecs(cycle_t nsec)
* pages for the buffer for that CPU. Each CPU has the same number
* of pages allocated for its buffer.
*/
-static struct trace_array global_trace;
+static struct trace_array global_trace = {
+ .trace_flags = TRACE_DEFAULT_FLAGS,
+};
LIST_HEAD(ftrace_trace_arrays);
@@ -468,11 +481,29 @@ static inline void trace_access_lock_init(void)
#endif
-/* trace_flags holds trace_options default values */
-unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
- TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
- TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
- TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
+#ifdef CONFIG_STACKTRACE
+static void __ftrace_trace_stack(struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs);
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs);
+
+#else
+static inline void __ftrace_trace_stack(struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs)
+{
+}
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs)
+{
+}
+
+#endif
static void tracer_tracing_on(struct trace_array *tr)
{
@@ -518,7 +549,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
int alloc;
int pc;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
pc = preempt_count();
@@ -548,7 +579,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, irq_flags, 4, pc);
+ ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
return size;
}
@@ -568,7 +599,7 @@ int __trace_bputs(unsigned long ip, const char *str)
int size = sizeof(struct bputs_entry);
int pc;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
pc = preempt_count();
@@ -588,7 +619,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, irq_flags, 4, pc);
+ ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
return 1;
}
@@ -834,34 +865,18 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
return nsecs / 1000;
}
+/*
+ * TRACE_FLAGS is defined as a tuple matching bit masks with strings.
+ * It uses C(a, b) where 'a' is the enum name and 'b' is the string that
+ * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
+ * of strings in the order that the enums were defined.
+ */
+#undef C
+#define C(a, b) b
+
/* These must match the bit positions in trace_iterator_flags */
static const char *trace_options[] = {
- "print-parent",
- "sym-offset",
- "sym-addr",
- "verbose",
- "raw",
- "hex",
- "bin",
- "block",
- "stacktrace",
- "trace_printk",
- "ftrace_preempt",
- "branch",
- "annotate",
- "userstacktrace",
- "sym-userobj",
- "printk-msg-only",
- "context-info",
- "latency-format",
- "sleep-time",
- "graph-time",
- "record-cmd",
- "overwrite",
- "disable_on_free",
- "irq-info",
- "markers",
- "function-trace",
+ TRACE_FLAGS
NULL
};
@@ -1204,13 +1219,17 @@ static inline int run_tracer_selftest(struct tracer *type)
}
#endif /* CONFIG_FTRACE_STARTUP_TEST */
+static void add_tracer_options(struct trace_array *tr, struct tracer *t);
+
+static void __init apply_trace_boot_options(void);
+
/**
* register_tracer - register a tracer with the ftrace system.
* @type - the plugin for the tracer
*
* Register a new plugin tracer.
*/
-int register_tracer(struct tracer *type)
+int __init register_tracer(struct tracer *type)
{
struct tracer *t;
int ret = 0;
@@ -1253,6 +1272,7 @@ int register_tracer(struct tracer *type)
type->next = trace_types;
trace_types = type;
+ add_tracer_options(&global_trace, type);
out:
tracing_selftest_running = false;
@@ -1268,6 +1288,9 @@ int register_tracer(struct tracer *type)
/* Do we want this tracer to start on bootup? */
tracing_set_tracer(&global_trace, type->name);
default_bootup_tracer = NULL;
+
+ apply_trace_boot_options();
+
/* disable other selftests, since this will break it. */
tracing_selftest_disabled = true;
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1671,23 +1694,16 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve
ring_buffer_unlock_commit(buffer, event);
}
-static inline void
-__trace_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc)
+void trace_buffer_unlock_commit(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, flags, 6, pc);
+ ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
ftrace_trace_userstack(buffer, flags, pc);
}
-
-void trace_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc)
-{
- __trace_buffer_unlock_commit(buffer, event, flags, pc);
-}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
static struct ring_buffer *temp_buffer;
@@ -1729,22 +1745,15 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
}
EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
-void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc)
-{
- __trace_buffer_unlock_commit(buffer, event, flags, pc);
-}
-EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
-
-void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
+void trace_buffer_unlock_commit_regs(struct trace_array *tr,
+ struct ring_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc,
struct pt_regs *regs)
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
+ /*
+ * Keep the skip depth of 0 that the regs variant has always used;
+ * the depth of 6 is only meant for the callers that pass no regs.
+ */
+ ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
@@ -1873,24 +1882,17 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
}
-void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs)
{
- if (!(trace_flags & TRACE_ITER_STACKTRACE))
+ if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
return;
__ftrace_trace_stack(buffer, flags, skip, pc, regs);
}
-void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
- int skip, int pc)
-{
- if (!(trace_flags & TRACE_ITER_STACKTRACE))
- return;
-
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
-}
-
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc)
{
@@ -1929,7 +1931,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
struct userstack_entry *entry;
struct stack_trace trace;
- if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+ if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
/*
@@ -2173,7 +2175,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, flags, 6, pc);
+ ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
}
out:
@@ -2225,7 +2227,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, flags, 6, pc);
+ ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
}
out:
preempt_enable_notrace();
@@ -2246,7 +2248,7 @@ int trace_array_printk(struct trace_array *tr,
int ret;
va_list ap;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
@@ -2261,7 +2263,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer,
int ret;
va_list ap;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
@@ -2602,7 +2604,7 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
- unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK);
struct trace_buffer *buf = iter->trace_buffer;
struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
struct tracer *type = iter->trace;
@@ -2664,20 +2666,22 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
static void test_cpu_buff_start(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
+ struct trace_array *tr = iter->tr;
- if (!(trace_flags & TRACE_ITER_ANNOTATE))
+ if (!(tr->trace_flags & TRACE_ITER_ANNOTATE))
return;
if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
return;
- if (cpumask_test_cpu(iter->cpu, iter->started))
+ if (iter->started && cpumask_test_cpu(iter->cpu, iter->started))
return;
if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
return;
- cpumask_set_cpu(iter->cpu, iter->started);
+ if (iter->started)
+ cpumask_set_cpu(iter->cpu, iter->started);
/* Don't print started cpu buffer for the first entry of the trace */
if (iter->idx > 1)
@@ -2687,8 +2691,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
- unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK);
struct trace_entry *entry;
struct trace_event *event;
@@ -2698,7 +2703,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
if (iter->iter_flags & TRACE_FILE_LAT_FMT)
trace_print_lat_context(iter);
else
@@ -2718,13 +2723,14 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
struct trace_event *event;
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO)
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO)
trace_seq_printf(s, "%d %d %llu ",
entry->pid, iter->cpu, iter->ts);
@@ -2742,6 +2748,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
unsigned char newline = '\n';
struct trace_entry *entry;
@@ -2749,7 +2756,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
SEQ_PUT_HEX_FIELD(s, entry->pid);
SEQ_PUT_HEX_FIELD(s, iter->cpu);
SEQ_PUT_HEX_FIELD(s, iter->ts);
@@ -2771,13 +2778,14 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
struct trace_event *event;
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
SEQ_PUT_FIELD(s, entry->pid);
SEQ_PUT_FIELD(s, iter->cpu);
SEQ_PUT_FIELD(s, iter->ts);
@@ -2826,6 +2834,8 @@ int trace_empty(struct trace_iterator *iter)
/* Called with trace_event_read_lock() held. */
enum print_line_t print_trace_line(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
+ unsigned long trace_flags = tr->trace_flags;
enum print_line_t ret;
if (iter->lost_events) {
@@ -2871,6 +2881,7 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
void trace_latency_header(struct seq_file *m)
{
struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
/* print nothing if the buffers are empty */
if (trace_empty(iter))
@@ -2879,13 +2890,15 @@ void trace_latency_header(struct seq_file *m)
if (iter->iter_flags & TRACE_FILE_LAT_FMT)
print_trace_header(m, iter);
- if (!(trace_flags & TRACE_ITER_VERBOSE))
+ if (!(tr->trace_flags & TRACE_ITER_VERBOSE))
print_lat_help_header(m);
}
void trace_default_header(struct seq_file *m)
{
struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+ unsigned long trace_flags = tr->trace_flags;
if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
@@ -3035,7 +3048,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (!iter)
return ERR_PTR(-ENOMEM);
- iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
+ iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter),
GFP_KERNEL);
if (!iter->buffer_iter)
goto release;
@@ -3230,7 +3243,7 @@ static int tracing_open(struct inode *inode, struct file *file)
iter = __tracing_open(inode, file, false);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
- else if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
iter->iter_flags |= TRACE_FILE_LAT_FMT;
}
@@ -3477,7 +3490,7 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
trace_opts = tr->current_trace->flags->opts;
for (i = 0; trace_options[i]; i++) {
- if (trace_flags & (1 << i))
+ if (tr->trace_flags & (1 << i))
seq_printf(m, "%s\n", trace_options[i]);
else
seq_printf(m, "no%s\n", trace_options[i]);
@@ -3542,7 +3555,7 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
/* do nothing if flag is already set */
- if (!!(trace_flags & mask) == !!enabled)
+ if (!!(tr->trace_flags & mask) == !!enabled)
return 0;
/* Give the tracer a chance to approve the change */
@@ -3551,9 +3564,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
return -EINVAL;
if (enabled)
- trace_flags |= mask;
+ tr->trace_flags |= mask;
else
- trace_flags &= ~mask;
+ tr->trace_flags &= ~mask;
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
@@ -3565,8 +3578,10 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
#endif
}
- if (mask == TRACE_ITER_PRINTK)
+ if (mask == TRACE_ITER_PRINTK) {
trace_printk_start_stop_comm(enabled);
+ trace_printk_control(enabled);
+ }
return 0;
}
@@ -3577,6 +3592,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
int neg = 0;
int ret = -ENODEV;
int i;
+ size_t orig_len = strlen(option);
cmp = strstrip(option);
@@ -3600,9 +3616,36 @@ static int trace_set_options(struct trace_array *tr, char *option)
mutex_unlock(&trace_types_lock);
+ /*
+ * If the first trailing whitespace is replaced with '\0' by strstrip,
+ * turn it back into a space.
+ */
+ if (orig_len > strlen(option))
+ option[strlen(option)] = ' ';
+
return ret;
}
+static void __init apply_trace_boot_options(void)
+{
+ char *buf = trace_boot_options_buf;
+ char *option;
+
+ while (true) {
+ option = strsep(&buf, ",");
+
+ if (!option)
+ break;
+
+ if (*option)
+ trace_set_options(&global_trace, option);
+
+ /* Put back the comma to allow this to be called again */
+ if (buf)
+ *(buf - 1) = ',';
+ }
+}
+
static ssize_t
tracing_trace_options_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -4297,11 +4340,8 @@ int tracing_update_buffers(void)
struct trace_option_dentry;
-static struct trace_option_dentry *
-create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
-
static void
-destroy_trace_option_files(struct trace_option_dentry *topts);
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
/*
* Used to clear out the tracer before deletion of an instance.
@@ -4320,20 +4360,13 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace = &nop_trace;
}
-static void update_tracer_options(struct trace_array *tr, struct tracer *t)
+static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
- static struct trace_option_dentry *topts;
-
/* Only enable if the directory has been created already. */
if (!tr->dir)
return;
- /* Currently, only the top instance has options */
- if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL))
- return;
-
- destroy_trace_option_files(topts);
- topts = create_trace_option_files(tr, t);
+ create_trace_option_files(tr, t);
}
static int tracing_set_tracer(struct trace_array *tr, const char *buf)
@@ -4402,7 +4435,6 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
free_snapshot(tr);
}
#endif
- update_tracer_options(tr, t);
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
@@ -4569,7 +4601,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
/* trace pipe does not show start of buffer */
cpumask_setall(iter->started);
- if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
iter->iter_flags |= TRACE_FILE_LAT_FMT;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -4626,11 +4658,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
static unsigned int
trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
{
+ struct trace_array *tr = iter->tr;
+
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return POLLIN | POLLRDNORM;
- if (trace_flags & TRACE_ITER_BLOCK)
+ if (tr->trace_flags & TRACE_ITER_BLOCK)
/*
* Always select as readable when in blocking mode
*/
@@ -5047,7 +5081,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
struct trace_array *tr = inode->i_private;
/* disable tracing ? */
- if (trace_flags & TRACE_ITER_STOP_ON_FREE)
+ if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE)
tracer_tracing_off(tr);
/* resize the ring buffer to 0 */
tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
@@ -5080,7 +5114,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (tracing_disabled)
return -EINVAL;
- if (!(trace_flags & TRACE_ITER_MARKERS))
+ if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
if (cnt > TRACE_BUF_SIZE)
@@ -6132,13 +6166,6 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
#include "trace_selftest.c"
#endif
-struct trace_option_dentry {
- struct tracer_opt *opt;
- struct tracer_flags *flags;
- struct trace_array *tr;
- struct dentry *entry;
-};
-
static ssize_t
trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -6191,14 +6218,51 @@ static const struct file_operations trace_options_fops = {
.llseek = generic_file_llseek,
};
+/*
+ * In order to pass in both the trace_array descriptor as well as the index
+ * to the flag that the trace option file represents, the trace_array
+ * has a character array of trace_flags_index[], which holds the index
+ * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc.
+ * The address of this character array is passed to the flag option file
+ * read/write callbacks.
+ *
+ * In order to extract both the index and the trace_array descriptor,
+ * get_tr_index() uses the following algorithm.
+ *
+ * idx = *ptr;
+ *
+ * As the pointer itself contains the address of the index (remember
+ * index[1] == 1).
+ *
+ * Then, to get the trace_array descriptor, subtract that index
+ * from the ptr, which gets us to the start of the index array itself.
+ *
+ * ptr - idx == &index[0]
+ *
+ * Then a simple container_of() from that pointer gets us to the
+ * trace_array descriptor.
+ */
+static void get_tr_index(void *data, struct trace_array **ptr,
+ unsigned int *pindex)
+{
+ *pindex = *(unsigned char *)data;
+
+ *ptr = container_of(data - *pindex, struct trace_array,
+ trace_flags_index);
+}
+
static ssize_t
trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- long index = (long)filp->private_data;
+ void *tr_index = filp->private_data;
+ struct trace_array *tr;
+ unsigned int index;
char *buf;
- if (trace_flags & (1 << index))
+ get_tr_index(tr_index, &tr, &index);
+
+ if (tr->trace_flags & (1 << index))
buf = "1\n";
else
buf = "0\n";
@@ -6210,11 +6274,14 @@ static ssize_t
trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct trace_array *tr = &global_trace;
- long index = (long)filp->private_data;
+ void *tr_index = filp->private_data;
+ struct trace_array *tr;
+ unsigned int index;
unsigned long val;
int ret;
+ get_tr_index(tr_index, &tr, &index);
+
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret)
return ret;
@@ -6298,21 +6365,39 @@ create_trace_option_file(struct trace_array *tr,
}
-static struct trace_option_dentry *
+static void
create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
{
struct trace_option_dentry *topts;
+ struct trace_options *tr_topts;
struct tracer_flags *flags;
struct tracer_opt *opts;
int cnt;
+ int i;
if (!tracer)
- return NULL;
+ return;
flags = tracer->flags;
if (!flags || !flags->opts)
- return NULL;
+ return;
+
+ /*
+ * If this is an instance, only create flags for tracers
+ * the instance may have.
+ */
+ if (!trace_ok_for_array(tracer, tr))
+ return;
+
+ for (i = 0; i < tr->nr_topts; i++) {
+ /*
+ * Check if these flags have already been added.
+ * Some tracers share flags.
+ */
+ if (tr->topts[i].tracer->flags == tracer->flags)
+ return;
+ }
opts = flags->opts;
@@ -6321,27 +6406,27 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
if (!topts)
- return NULL;
-
- for (cnt = 0; opts[cnt].name; cnt++)
- create_trace_option_file(tr, &topts[cnt], flags,
- &opts[cnt]);
-
- return topts;
-}
-
-static void
-destroy_trace_option_files(struct trace_option_dentry *topts)
-{
- int cnt;
+ return;
- if (!topts)
+ tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
+ GFP_KERNEL);
+ if (!tr_topts) {
+ kfree(topts);
return;
+ }
- for (cnt = 0; topts[cnt].opt; cnt++)
- tracefs_remove(topts[cnt].entry);
+ tr->topts = tr_topts;
+ tr->topts[tr->nr_topts].tracer = tracer;
+ tr->topts[tr->nr_topts].topts = topts;
+ tr->nr_topts++;
- kfree(topts);
+ for (cnt = 0; opts[cnt].name; cnt++) {
+ create_trace_option_file(tr, &topts[cnt], flags,
+ &opts[cnt]);
+ WARN_ONCE(topts[cnt].entry == NULL,
+ "Failed to create trace option: %s",
+ opts[cnt].name);
+ }
}
static struct dentry *
@@ -6354,21 +6439,26 @@ create_trace_option_core_file(struct trace_array *tr,
if (!t_options)
return NULL;
- return trace_create_file(option, 0644, t_options, (void *)index,
- &trace_options_core_fops);
+ return trace_create_file(option, 0644, t_options,
+ (void *)&tr->trace_flags_index[index],
+ &trace_options_core_fops);
}
-static __init void create_trace_options_dir(struct trace_array *tr)
+static void create_trace_options_dir(struct trace_array *tr)
{
struct dentry *t_options;
+ bool top_level = tr == &global_trace;
int i;
t_options = trace_options_init_dentry(tr);
if (!t_options)
return;
- for (i = 0; trace_options[i]; i++)
- create_trace_option_core_file(tr, trace_options[i], i);
+ for (i = 0; trace_options[i]; i++) {
+ if (top_level ||
+ !((1 << i) & TOP_LEVEL_TRACE_FLAGS))
+ create_trace_option_core_file(tr, trace_options[i], i);
+ }
}
static ssize_t
@@ -6435,7 +6525,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
{
enum ring_buffer_flags rb_flags;
- rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+ rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
buf->tr = tr;
@@ -6505,6 +6595,30 @@ static void free_trace_buffers(struct trace_array *tr)
#endif
}
+static void init_trace_flags_index(struct trace_array *tr)
+{
+ int i;
+
+ /* Used by the trace options files */
+ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++)
+ tr->trace_flags_index[i] = i;
+}
+
+static void __update_tracer_options(struct trace_array *tr)
+{
+ struct tracer *t;
+
+ for (t = trace_types; t; t = t->next)
+ add_tracer_options(tr, t);
+}
+
+static void update_tracer_options(struct trace_array *tr)
+{
+ mutex_lock(&trace_types_lock);
+ __update_tracer_options(tr);
+ mutex_unlock(&trace_types_lock);
+}
+
static int instance_mkdir(const char *name)
{
struct trace_array *tr;
@@ -6530,6 +6644,8 @@ static int instance_mkdir(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ tr->trace_flags = global_trace.trace_flags;
+
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
raw_spin_lock_init(&tr->start_lock);
@@ -6555,6 +6671,8 @@ static int instance_mkdir(const char *name)
}
init_tracer_tracefs(tr, tr->dir);
+ init_trace_flags_index(tr);
+ __update_tracer_options(tr);
list_add(&tr->list, &ftrace_trace_arrays);
@@ -6580,6 +6698,7 @@ static int instance_rmdir(const char *name)
struct trace_array *tr;
int found = 0;
int ret;
+ int i;
mutex_lock(&trace_types_lock);
@@ -6602,9 +6721,14 @@ static int instance_rmdir(const char *name)
tracing_set_nop(tr);
event_trace_del_tracer(tr);
ftrace_destroy_function_files(tr);
- debugfs_remove_recursive(tr->dir);
+ tracefs_remove_recursive(tr->dir);
free_trace_buffers(tr);
+ for (i = 0; i < tr->nr_topts; i++) {
+ kfree(tr->topts[i].topts);
+ }
+ kfree(tr->topts);
+
kfree(tr->name);
kfree(tr);
@@ -6666,6 +6790,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("tracing_on", 0644, d_tracer,
tr, &rb_simple_fops);
+ create_trace_options_dir(tr);
+
#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_file("tracing_max_latency", 0644, d_tracer,
&tr->max_latency, &tracing_max_lat_fops);
@@ -6861,11 +6987,7 @@ static __init int tracer_init_tracefs(void)
create_trace_instances(d_tracer);
- create_trace_options_dir(&global_trace);
-
- /* If the tracer was started via cmdline, create options for it here */
- if (global_trace.current_trace != &nop_trace)
- update_tracer_options(&global_trace, global_trace.current_trace);
+ update_tracer_options(&global_trace);
return 0;
}
@@ -6964,6 +7086,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
static atomic_t dump_running;
+ struct trace_array *tr = &global_trace;
unsigned int old_userobj;
unsigned long flags;
int cnt = 0, cpu;
@@ -6990,13 +7113,13 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
+ atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
+ old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;
/* don't look at user memory in panic mode */
- trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
switch (oops_dump_mode) {
case DUMP_ALL:
@@ -7059,7 +7182,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
printk(KERN_TRACE "---------------------------------\n");
out_enable:
- trace_flags |= old_userobj;
+ tr->trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -7074,6 +7197,12 @@ __init static int tracer_alloc_buffers(void)
int ring_buf_size;
int ret = -ENOMEM;
+ /*
+ * Make sure we don't accidentally add more trace options
+ * than we have bits for.
+ */
+ BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE);
+
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
goto out;
@@ -7132,6 +7261,8 @@ __init static int tracer_alloc_buffers(void)
ftrace_init_global_array_ops(&global_trace);
+ init_trace_flags_index(&global_trace);
+
register_tracer(&nop_trace);
/* All seems OK, enable tracing */
@@ -7148,12 +7279,7 @@ __init static int tracer_alloc_buffers(void)
INIT_LIST_HEAD(&global_trace.events);
list_add(&global_trace.list, &ftrace_trace_arrays);
- while (trace_boot_options) {
- char *option;
-
- option = strsep(&trace_boot_options, ",");
- trace_set_options(&global_trace, option);
- }
+ apply_trace_boot_options();
register_snapshot_cmd();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 74bde81601a9..dd7620802e72 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -71,9 +71,6 @@ enum trace_type {
tstruct \
}
-#undef TP_ARGS
-#define TP_ARGS(args...) args
-
#undef FTRACE_ENTRY_DUP
#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
@@ -156,9 +153,12 @@ struct trace_array_cpu {
pid_t pid;
kuid_t uid;
char comm[TASK_COMM_LEN];
+
+ bool ignore_pid;
};
struct tracer;
+struct trace_option_dentry;
struct trace_buffer {
struct trace_array *tr;
@@ -168,6 +168,19 @@ struct trace_buffer {
int cpu;
};
+#define TRACE_FLAGS_MAX_SIZE 32
+
+struct trace_options {
+ struct tracer *tracer;
+ struct trace_option_dentry *topts;
+};
+
+struct trace_pid_list {
+ unsigned int nr_pids;
+ int order;
+ pid_t *pids;
+};
+
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
@@ -193,6 +206,7 @@ struct trace_array {
bool allocated_snapshot;
unsigned long max_latency;
#endif
+ struct trace_pid_list __rcu *filtered_pids;
/*
* max_lock is used to protect the swapping of buffers
* when taking a max snapshot. The buffers themselves are
@@ -216,13 +230,17 @@ struct trace_array {
#endif
int stop_count;
int clock_id;
+ int nr_topts;
struct tracer *current_trace;
+ unsigned int trace_flags;
+ unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
struct dentry *dir;
struct dentry *options;
struct dentry *percpu_dir;
struct dentry *event_dir;
+ struct trace_options *topts;
struct list_head systems;
struct list_head events;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
@@ -333,6 +351,13 @@ struct tracer_flags {
#define TRACER_OPT(s, b) .name = #s, .bit = b
+struct trace_option_dentry {
+ struct tracer_opt *opt;
+ struct tracer_flags *flags;
+ struct trace_array *tr;
+ struct dentry *entry;
+};
+
/**
* struct tracer - a specific tracer and its callbacks to interact with tracefs
* @name: the name chosen to select it on the available_tracers file
@@ -611,29 +636,12 @@ void update_max_tr_single(struct trace_array *tr,
#endif /* CONFIG_TRACER_MAX_TRACE */
#ifdef CONFIG_STACKTRACE
-void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
- int skip, int pc);
-
-void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
-
void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
int pc);
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
#else
-static inline void ftrace_trace_stack(struct ring_buffer *buffer,
- unsigned long flags, int skip, int pc)
-{
-}
-
-static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
- unsigned long flags, int skip,
- int pc, struct pt_regs *regs)
-{
-}
-
static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
unsigned long flags, int pc)
{
@@ -707,8 +715,6 @@ int trace_array_printk_buf(struct ring_buffer *buffer,
void trace_printk_seq(struct trace_seq *s);
enum print_line_t print_trace_line(struct trace_iterator *iter);
-extern unsigned long trace_flags;
-
extern char trace_find_mark(unsigned long long duration);
/* Standard output formatting function used for function return traces */
@@ -723,9 +729,14 @@ extern char trace_find_mark(unsigned long long duration);
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
#define TRACE_GRAPH_PRINT_IRQS 0x40
#define TRACE_GRAPH_PRINT_TAIL 0x80
+#define TRACE_GRAPH_SLEEP_TIME 0x100
+#define TRACE_GRAPH_GRAPH_TIME 0x200
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
+extern void ftrace_graph_sleep_time_control(bool enable);
+extern void ftrace_graph_graph_time_control(bool enable);
+
extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
@@ -859,7 +870,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops);
#define ftrace_destroy_filter_files(ops) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
-int ftrace_event_is_function(struct trace_event_call *call);
+bool ftrace_event_is_function(struct trace_event_call *call);
/*
* struct trace_parser - servers for reading the user input separated by spaces
@@ -897,42 +908,94 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
size_t cnt, loff_t *ppos);
/*
+ * Only create function graph options if function graph is configured.
+ */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+# define FGRAPH_FLAGS \
+ C(DISPLAY_GRAPH, "display-graph"),
+#else
+# define FGRAPH_FLAGS
+#endif
+
+#ifdef CONFIG_BRANCH_TRACER
+# define BRANCH_FLAGS \
+ C(BRANCH, "branch"),
+#else
+# define BRANCH_FLAGS
+#endif
+
+#ifdef CONFIG_FUNCTION_TRACER
+# define FUNCTION_FLAGS \
+ C(FUNCTION, "function-trace"),
+# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION
+#else
+# define FUNCTION_FLAGS
+# define FUNCTION_DEFAULT_FLAGS 0UL
+#endif
+
+#ifdef CONFIG_STACKTRACE
+# define STACK_FLAGS \
+ C(STACKTRACE, "stacktrace"),
+#else
+# define STACK_FLAGS
+#endif
+
+/*
* trace_iterator_flags is an enumeration that defines bit
* positions into trace_flags that controls the output.
*
* NOTE: These bits must match the trace_options array in
- * trace.c.
+ * trace.c (this macro guarantees it).
+ */
+#define TRACE_FLAGS \
+ C(PRINT_PARENT, "print-parent"), \
+ C(SYM_OFFSET, "sym-offset"), \
+ C(SYM_ADDR, "sym-addr"), \
+ C(VERBOSE, "verbose"), \
+ C(RAW, "raw"), \
+ C(HEX, "hex"), \
+ C(BIN, "bin"), \
+ C(BLOCK, "block"), \
+ C(PRINTK, "trace_printk"), \
+ C(ANNOTATE, "annotate"), \
+ C(USERSTACKTRACE, "userstacktrace"), \
+ C(SYM_USEROBJ, "sym-userobj"), \
+ C(PRINTK_MSGONLY, "printk-msg-only"), \
+ C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \
+ C(LATENCY_FMT, "latency-format"), \
+ C(RECORD_CMD, "record-cmd"), \
+ C(OVERWRITE, "overwrite"), \
+ C(STOP_ON_FREE, "disable_on_free"), \
+ C(IRQ_INFO, "irq-info"), \
+ C(MARKERS, "markers"), \
+ FUNCTION_FLAGS \
+ FGRAPH_FLAGS \
+ STACK_FLAGS \
+ BRANCH_FLAGS
+
+/*
+ * By defining C, we can make TRACE_FLAGS a list of bit names
+ * that will define the bits for the flag masks.
*/
-enum trace_iterator_flags {
- TRACE_ITER_PRINT_PARENT = 0x01,
- TRACE_ITER_SYM_OFFSET = 0x02,
- TRACE_ITER_SYM_ADDR = 0x04,
- TRACE_ITER_VERBOSE = 0x08,
- TRACE_ITER_RAW = 0x10,
- TRACE_ITER_HEX = 0x20,
- TRACE_ITER_BIN = 0x40,
- TRACE_ITER_BLOCK = 0x80,
- TRACE_ITER_STACKTRACE = 0x100,
- TRACE_ITER_PRINTK = 0x200,
- TRACE_ITER_PREEMPTONLY = 0x400,
- TRACE_ITER_BRANCH = 0x800,
- TRACE_ITER_ANNOTATE = 0x1000,
- TRACE_ITER_USERSTACKTRACE = 0x2000,
- TRACE_ITER_SYM_USEROBJ = 0x4000,
- TRACE_ITER_PRINTK_MSGONLY = 0x8000,
- TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
- TRACE_ITER_LATENCY_FMT = 0x20000,
- TRACE_ITER_SLEEP_TIME = 0x40000,
- TRACE_ITER_GRAPH_TIME = 0x80000,
- TRACE_ITER_RECORD_CMD = 0x100000,
- TRACE_ITER_OVERWRITE = 0x200000,
- TRACE_ITER_STOP_ON_FREE = 0x400000,
- TRACE_ITER_IRQ_INFO = 0x800000,
- TRACE_ITER_MARKERS = 0x1000000,
- TRACE_ITER_FUNCTION = 0x2000000,
+#undef C
+#define C(a, b) TRACE_ITER_##a##_BIT
+
+enum trace_iterator_bits {
+ TRACE_FLAGS
+ /* Make sure we don't go more than we have bits for */
+ TRACE_ITER_LAST_BIT
};
/*
+ * By redefining C, we can make TRACE_FLAGS a list of masks that
+ * use the bits as defined above.
+ */
+#undef C
+#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT)
+
+enum trace_iterator_flags { TRACE_FLAGS };
+
+/*
* TRACE_ITER_SYM_MASK masks the options in trace_flags that
* control the output of kernel symbols.
*/
@@ -946,7 +1009,7 @@ extern int enable_branch_tracing(struct trace_array *tr);
extern void disable_branch_tracing(void);
static inline int trace_branch_enable(struct trace_array *tr)
{
- if (trace_flags & TRACE_ITER_BRANCH)
+ if (tr->trace_flags & TRACE_ITER_BRANCH)
return enable_branch_tracing(tr);
return 0;
}
@@ -1269,6 +1332,7 @@ extern const char *__stop___trace_bprintk_fmt[];
extern const char *__start___tracepoint_str[];
extern const char *__stop___tracepoint_str[];
+void trace_printk_control(bool enabled);
void trace_printk_init_buffers(void);
void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 40a14cbcf8e0..0f109c4130d3 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -43,7 +43,7 @@ static void trace_do_benchmark(void)
unsigned int std = 0;
/* Only run if the tracepoint is actually active */
- if (!trace_benchmark_event_enabled())
+ if (!trace_benchmark_event_enabled() || !tracing_is_on())
return;
local_irq_disable();
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index e2e12ad3186f..3a2a73716a5b 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -125,25 +125,14 @@ void disable_branch_tracing(void)
mutex_unlock(&branch_tracing_mutex);
}
-static void start_branch_trace(struct trace_array *tr)
-{
- enable_branch_tracing(tr);
-}
-
-static void stop_branch_trace(struct trace_array *tr)
-{
- disable_branch_tracing();
-}
-
static int branch_trace_init(struct trace_array *tr)
{
- start_branch_trace(tr);
- return 0;
+ return enable_branch_tracing(tr);
}
static void branch_trace_reset(struct trace_array *tr)
{
- stop_branch_trace(tr);
+ disable_branch_tracing();
}
static enum print_line_t trace_branch_print(struct trace_iterator *iter,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 404a372ad85a..6bbc5f652355 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,11 +15,15 @@
#include <linux/kthread.h>
#include <linux/tracefs.h>
#include <linux/uaccess.h>
+#include <linux/bsearch.h>
#include <linux/module.h>
#include <linux/ctype.h>
+#include <linux/sort.h>
#include <linux/slab.h>
#include <linux/delay.h>
+#include <trace/events/sched.h>
+
#include <asm/setup.h>
#include "trace_output.h"
@@ -30,6 +34,7 @@
DEFINE_MUTEX(event_mutex);
LIST_HEAD(ftrace_events);
+static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
@@ -37,21 +42,19 @@ static LIST_HEAD(ftrace_common_fields);
static struct kmem_cache *field_cachep;
static struct kmem_cache *file_cachep;
-#define SYSTEM_FL_FREE_NAME (1 << 31)
-
static inline int system_refcount(struct event_subsystem *system)
{
- return system->ref_count & ~SYSTEM_FL_FREE_NAME;
+ return system->ref_count;
}
static int system_refcount_inc(struct event_subsystem *system)
{
- return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME;
+ return system->ref_count++;
}
static int system_refcount_dec(struct event_subsystem *system)
{
- return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME;
+ return --system->ref_count;
}
/* Double loops, do not use break, only goto's work */
@@ -94,6 +97,10 @@ trace_find_event_field(struct trace_event_call *call, char *name)
struct ftrace_event_field *field;
struct list_head *head;
+ field = __find_event_field(&ftrace_generic_fields, name);
+ if (field)
+ return field;
+
field = __find_event_field(&ftrace_common_fields, name);
if (field)
return field;
@@ -144,6 +151,13 @@ int trace_define_field(struct trace_event_call *call, const char *type,
}
EXPORT_SYMBOL_GPL(trace_define_field);
+#define __generic_field(type, item, filter_type) \
+ ret = __trace_define_field(&ftrace_generic_fields, #type, \
+ #item, 0, 0, is_signed_type(type), \
+ filter_type); \
+ if (ret) \
+ return ret;
+
#define __common_field(type, item) \
ret = __trace_define_field(&ftrace_common_fields, #type, \
"common_" #item, \
@@ -153,6 +167,16 @@ EXPORT_SYMBOL_GPL(trace_define_field);
if (ret) \
return ret;
+static int trace_define_generic_fields(void)
+{
+ int ret;
+
+ __generic_field(int, cpu, FILTER_OTHER);
+ __generic_field(char *, comm, FILTER_PTR_STRING);
+
+ return ret;
+}
+
static int trace_define_common_fields(void)
{
int ret;
@@ -190,12 +214,32 @@ int trace_event_raw_init(struct trace_event_call *call)
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
+bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
+{
+ struct trace_array *tr = trace_file->tr;
+ struct trace_array_cpu *data;
+ struct trace_pid_list *pid_list;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+ if (!pid_list)
+ return false;
+
+ data = this_cpu_ptr(tr->trace_buffer.data);
+
+ return data->ignore_pid;
+}
+EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid);
+
void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
struct trace_event_file *trace_file,
unsigned long len)
{
struct trace_event_call *event_call = trace_file->event_call;
+ if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) &&
+ trace_event_ignore_this_pid(trace_file))
+ return NULL;
+
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
fbuffer->trace_file = trace_file;
@@ -316,6 +360,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable)
{
struct trace_event_call *call = file->event_call;
+ struct trace_array *tr = file->tr;
int ret = 0;
int disable;
@@ -379,7 +424,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
- if (trace_flags & TRACE_ITER_RECORD_CMD) {
+ if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
tracing_start_cmdline_record();
set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
@@ -424,6 +469,142 @@ static void ftrace_clear_events(struct trace_array *tr)
mutex_unlock(&event_mutex);
}
+static int cmp_pid(const void *key, const void *elt)
+{
+ const pid_t *search_pid = key;
+ const pid_t *pid = elt;
+
+ if (*search_pid == *pid)
+ return 0;
+ if (*search_pid < *pid)
+ return -1;
+ return 1;
+}
+
+static bool
+check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
+ pid_t search_pid;
+ pid_t *pid;
+
+ /*
+ * Return false, because if filtered_pids does not exist,
+ * all pids are good to trace.
+ */
+ if (!filtered_pids)
+ return false;
+
+ search_pid = task->pid;
+
+ pid = bsearch(&search_pid, filtered_pids->pids,
+ filtered_pids->nr_pids, sizeof(pid_t),
+ cmp_pid);
+ if (!pid)
+ return true;
+
+ return false;
+}
+
+static void
+event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, prev) &&
+ check_ignore_pid(pid_list, next));
+}
+
+static void
+event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, next));
+}
+
+static void
+event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /* Nothing to do if we are already tracing */
+ if (!this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ return;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, task));
+}
+
+static void
+event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /* Nothing to do if we are not tracing */
+ if (this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ return;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ /* Set tracing if current is enabled */
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, current));
+}
+
+static void __ftrace_clear_event_pids(struct trace_array *tr)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_event_file *file;
+ int cpu;
+
+ pid_list = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ if (!pid_list)
+ return;
+
+ unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr);
+ unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr);
+
+ unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr);
+ unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr);
+
+ list_for_each_entry(file, &tr->events, list) {
+ clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+ }
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false;
+
+ rcu_assign_pointer(tr->filtered_pids, NULL);
+
+ /* Wait till all users are no longer using pid filtering */
+ synchronize_sched();
+
+ free_pages((unsigned long)pid_list->pids, pid_list->order);
+ kfree(pid_list);
+}
+
+static void ftrace_clear_event_pids(struct trace_array *tr)
+{
+ mutex_lock(&event_mutex);
+ __ftrace_clear_event_pids(tr);
+ mutex_unlock(&event_mutex);
+}
+
static void __put_system(struct event_subsystem *system)
{
struct event_filter *filter = system->filter;
@@ -438,8 +619,7 @@ static void __put_system(struct event_subsystem *system)
kfree(filter->filter_string);
kfree(filter);
}
- if (system->ref_count & SYSTEM_FL_FREE_NAME)
- kfree(system->name);
+ kfree_const(system->name);
kfree(system);
}
@@ -757,6 +937,58 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
+static void *p_start(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = m->private;
+
+ /*
+ * Grab the mutex, to keep calls to p_next() having the same
+ * tr->filtered_pids as p_start() has.
+ * If we just passed the tr->filtered_pids around, then RCU would
+ * have been enough, but doing that makes things more complex.
+ */
+ mutex_lock(&event_mutex);
+ rcu_read_lock_sched();
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ if (!pid_list || *pos >= pid_list->nr_pids)
+ return NULL;
+
+ return (void *)&pid_list->pids[*pos];
+}
+
+static void p_stop(struct seq_file *m, void *p)
+ __releases(RCU)
+{
+ rcu_read_unlock_sched();
+ mutex_unlock(&event_mutex);
+}
+
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ (*pos)++;
+
+ if (*pos >= pid_list->nr_pids)
+ return NULL;
+
+ return (void *)&pid_list->pids[*pos];
+}
+
+static int p_show(struct seq_file *m, void *v)
+{
+ pid_t *pid = v;
+
+ seq_printf(m, "%d\n", *pid);
+ return 0;
+}
+
static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -1314,8 +1546,209 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return r;
}
+static int max_pids(struct trace_pid_list *pid_list)
+{
+ return (PAGE_SIZE << pid_list->order) / sizeof(pid_t);
+}
+
+static void ignore_task_cpu(void *data)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /*
+ * This function is called by on_each_cpu() while the
+ * event_mutex is held.
+ */
+ pid_list = rcu_dereference_protected(tr->filtered_pids,
+ mutex_is_locked(&event_mutex));
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, current));
+}
+
+static ssize_t
+ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *filtered_pids = NULL;
+ struct trace_pid_list *pid_list = NULL;
+ struct trace_event_file *file;
+ struct trace_parser parser;
+ unsigned long val;
+ loff_t this_pos;
+ ssize_t read = 0;
+ ssize_t ret = 0;
+ pid_t pid;
+ int i;
+
+ if (!cnt)
+ return 0;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
+ return -ENOMEM;
+
+ mutex_lock(&event_mutex);
+ /*
+ * Load as many pids as possible into the new array before
+ * swapping tr->filtered_pids over to the new list.
+ */
+ while (cnt > 0) {
+
+ this_pos = 0;
+
+ ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
+ if (ret < 0 || !trace_parser_loaded(&parser))
+ break;
+
+ read += ret;
+ ubuf += ret;
+ cnt -= ret;
+
+ parser.buffer[parser.idx] = 0;
+
+ ret = -EINVAL;
+ if (kstrtoul(parser.buffer, 0, &val))
+ break;
+ if (val > INT_MAX)
+ break;
+
+ pid = (pid_t)val;
+
+ ret = -ENOMEM;
+ if (!pid_list) {
+ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+ if (!pid_list)
+ break;
+
+ filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ if (filtered_pids)
+ pid_list->order = filtered_pids->order;
+ else
+ pid_list->order = 0;
+
+ pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
+ pid_list->order);
+ if (!pid_list->pids)
+ break;
+
+ if (filtered_pids) {
+ pid_list->nr_pids = filtered_pids->nr_pids;
+ memcpy(pid_list->pids, filtered_pids->pids,
+ pid_list->nr_pids * sizeof(pid_t));
+ } else
+ pid_list->nr_pids = 0;
+ }
+
+ if (pid_list->nr_pids >= max_pids(pid_list)) {
+ pid_t *pid_page;
+
+ pid_page = (void *)__get_free_pages(GFP_KERNEL,
+ pid_list->order + 1);
+ if (!pid_page)
+ break;
+ memcpy(pid_page, pid_list->pids,
+ pid_list->nr_pids * sizeof(pid_t));
+ free_pages((unsigned long)pid_list->pids, pid_list->order);
+
+ pid_list->order++;
+ pid_list->pids = pid_page;
+ }
+
+ pid_list->pids[pid_list->nr_pids++] = pid;
+ trace_parser_clear(&parser);
+ ret = 0;
+ }
+ trace_parser_put(&parser);
+
+ if (ret < 0) {
+ if (pid_list)
+ free_pages((unsigned long)pid_list->pids, pid_list->order);
+ kfree(pid_list);
+ mutex_unlock(&event_mutex);
+ return ret;
+ }
+
+ if (!pid_list) {
+ mutex_unlock(&event_mutex);
+ return ret;
+ }
+
+ sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL);
+
+ /* Remove duplicates */
+ for (i = 1; i < pid_list->nr_pids; i++) {
+ int start = i;
+
+ while (i < pid_list->nr_pids &&
+ pid_list->pids[i - 1] == pid_list->pids[i])
+ i++;
+
+ if (start != i) {
+ if (i < pid_list->nr_pids) {
+ memmove(&pid_list->pids[start], &pid_list->pids[i],
+ (pid_list->nr_pids - i) * sizeof(pid_t));
+ pid_list->nr_pids -= i - start;
+ i = start;
+ } else
+ pid_list->nr_pids = start;
+ }
+ }
+
+ rcu_assign_pointer(tr->filtered_pids, pid_list);
+
+ list_for_each_entry(file, &tr->events, list) {
+ set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+ }
+
+ if (filtered_pids) {
+ synchronize_sched();
+
+ free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
+ kfree(filtered_pids);
+ } else {
+ /*
+ * Register a probe that is called before all other probes
+ * to set ignore_pid if next or prev do not match.
+ * Register a probe that is called after all other probes
+ * to only keep ignore_pid set if next pid matches.
+ */
+ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post,
+ tr, 0);
+
+ register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post,
+ tr, 0);
+ }
+
+ /*
+ * Ignoring of pids is done at task switch. But we have to
+ * check for those tasks that are currently running.
+ * Always do this in case a pid was appended or removed.
+ */
+ on_each_cpu(ignore_task_cpu, tr, 1);
+
+ mutex_unlock(&event_mutex);
+
+ ret = read;
+ *ppos += read;
+
+ return ret;
+}
+
static int ftrace_event_avail_open(struct inode *inode, struct file *file);
static int ftrace_event_set_open(struct inode *inode, struct file *file);
+static int ftrace_event_set_pid_open(struct inode *inode, struct file *file);
static int ftrace_event_release(struct inode *inode, struct file *file);
static const struct seq_operations show_event_seq_ops = {
@@ -1332,6 +1765,13 @@ static const struct seq_operations show_set_event_seq_ops = {
.stop = t_stop,
};
+static const struct seq_operations show_set_pid_seq_ops = {
+ .start = p_start,
+ .next = p_next,
+ .show = p_show,
+ .stop = p_stop,
+};
+
static const struct file_operations ftrace_avail_fops = {
.open = ftrace_event_avail_open,
.read = seq_read,
@@ -1347,6 +1787,14 @@ static const struct file_operations ftrace_set_event_fops = {
.release = ftrace_event_release,
};
+static const struct file_operations ftrace_set_event_pid_fops = {
+ .open = ftrace_event_set_pid_open,
+ .read = seq_read,
+ .write = ftrace_event_pid_write,
+ .llseek = seq_lseek,
+ .release = ftrace_event_release,
+};
+
static const struct file_operations ftrace_enable_fops = {
.open = tracing_open_generic,
.read = event_enable_read,
@@ -1457,6 +1905,26 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
return ret;
}
+static int
+ftrace_event_set_pid_open(struct inode *inode, struct file *file)
+{
+ const struct seq_operations *seq_ops = &show_set_pid_seq_ops;
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ ftrace_clear_event_pids(tr);
+
+ ret = ftrace_event_open(inode, file, seq_ops);
+ if (ret < 0)
+ trace_array_put(tr);
+ return ret;
+}
+
static struct event_subsystem *
create_new_subsystem(const char *name)
{
@@ -1470,13 +1938,9 @@ create_new_subsystem(const char *name)
system->ref_count = 1;
/* Only allocate if dynamic (kprobes and modules) */
- if (!core_kernel_data((unsigned long)name)) {
- system->ref_count |= SYSTEM_FL_FREE_NAME;
- system->name = kstrdup(name, GFP_KERNEL);
- if (!system->name)
- goto out_free;
- } else
- system->name = name;
+ system->name = kstrdup_const(name, GFP_KERNEL);
+ if (!system->name)
+ goto out_free;
system->filter = NULL;
@@ -1489,8 +1953,7 @@ create_new_subsystem(const char *name)
return system;
out_free:
- if (system->ref_count & SYSTEM_FL_FREE_NAME)
- kfree(system->name);
+ kfree_const(system->name);
kfree(system);
return NULL;
}
@@ -2456,6 +2919,9 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
return -ENOMEM;
}
+ entry = tracefs_create_file("set_event_pid", 0644, parent,
+ tr, &ftrace_set_event_pid_fops);
+
/* ring buffer internal formats */
trace_create_file("header_page", 0444, d_events,
ring_buffer_print_page_header,
@@ -2536,6 +3002,9 @@ int event_trace_del_tracer(struct trace_array *tr)
/* Disable any event triggers and associated soft-disabled events */
clear_event_triggers(tr);
+ /* Clear the pid list */
+ __ftrace_clear_event_pids(tr);
+
/* Disable any running events */
__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
@@ -2573,16 +3042,16 @@ early_enable_events(struct trace_array *tr, bool disable_first)
if (!token)
break;
- if (!*token)
- continue;
- /* Restarting syscalls requires that we stop them first */
- if (disable_first)
- ftrace_set_clr_event(tr, token, 0);
+ if (*token) {
+ /* Restarting syscalls requires that we stop them first */
+ if (disable_first)
+ ftrace_set_clr_event(tr, token, 0);
- ret = ftrace_set_clr_event(tr, token, 1);
- if (ret)
- pr_warn("Failed to enable trace event: %s\n", token);
+ ret = ftrace_set_clr_event(tr, token, 1);
+ if (ret)
+ pr_warn("Failed to enable trace event: %s\n", token);
+ }
/* Put back the comma to allow this to be called again */
if (buf)
@@ -2671,6 +3140,9 @@ static __init int event_trace_init(void)
if (!entry)
pr_warn("Could not create tracefs 'available_events' entry\n");
+ if (trace_define_generic_fields())
+ pr_warn("tracing: Failed to allocated generic fields");
+
if (trace_define_common_fields())
pr_warn("tracing: Failed to allocate common fields");
@@ -2866,7 +3338,9 @@ static __init void event_trace_self_tests(void)
static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
-static void
+static struct trace_array *event_tr;
+
+static void __init
function_test_events_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
@@ -2897,7 +3371,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
- trace_buffer_unlock_commit(buffer, event, flags, pc);
+ trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc);
out:
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
@@ -2913,6 +3387,9 @@ static struct ftrace_ops trace_ops __initdata =
static __init void event_trace_self_test_with_function(void)
{
int ret;
+ event_tr = top_trace_array();
+ if (WARN_ON(!event_tr))
+ return;
ret = register_ftrace_function(&trace_ops);
if (WARN_ON(ret < 0)) {
pr_info("Failed to enable function tracer for event tests\n");
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d81d6f302b14..f93a219b18da 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
return match;
}
+/* Filter predicate for CPUs. */
+static int filter_pred_cpu(struct filter_pred *pred, void *event)
+{
+ int cpu, cmp;
+ int match = 0;
+
+ cpu = raw_smp_processor_id();
+ cmp = pred->val;
+
+ switch (pred->op) {
+ case OP_EQ:
+ match = cpu == cmp;
+ break;
+ case OP_LT:
+ match = cpu < cmp;
+ break;
+ case OP_LE:
+ match = cpu <= cmp;
+ break;
+ case OP_GT:
+ match = cpu > cmp;
+ break;
+ case OP_GE:
+ match = cpu >= cmp;
+ break;
+ default:
+ break;
+ }
+
+ return !!match == !pred->not;
+}
+
+/* Filter predicate for COMM. */
+static int filter_pred_comm(struct filter_pred *pred, void *event)
+{
+ int cmp, match;
+
+ cmp = pred->regex.match(current->comm, &pred->regex,
+ pred->regex.field_len);
+ match = cmp ^ pred->not;
+
+ return match;
+}
+
static int filter_pred_none(struct filter_pred *pred, void *event)
{
return 0;
@@ -929,15 +973,15 @@ static bool is_string_field(struct ftrace_event_field *field)
field->filter_type == FILTER_PTR_STRING;
}
-static int is_legal_op(struct ftrace_event_field *field, int op)
+static bool is_legal_op(struct ftrace_event_field *field, int op)
{
if (is_string_field(field) &&
(op != OP_EQ && op != OP_NE && op != OP_GLOB))
- return 0;
+ return false;
if (!is_string_field(field) && op == OP_GLOB)
- return 0;
+ return false;
- return 1;
+ return true;
}
static filter_pred_fn_t select_comparison_fn(int op, int field_size,
@@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps,
if (is_string_field(field)) {
filter_build_regex(pred);
- if (field->filter_type == FILTER_STATIC_STRING) {
+ if (!strcmp(field->name, "comm")) {
+ fn = filter_pred_comm;
+ pred->regex.field_len = TASK_COMM_LEN;
+ } else if (field->filter_type == FILTER_STATIC_STRING) {
fn = filter_pred_string;
pred->regex.field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING)
@@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps,
}
pred->val = val;
- fn = select_comparison_fn(pred->op, field->size,
+ if (!strcmp(field->name, "cpu"))
+ fn = filter_pred_cpu;
+ else
+ fn = select_comparison_fn(pred->op, field->size,
field->is_signed);
if (!fn) {
parse_error(ps, FILT_ERR_INVALID_OP, 0);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index adabf7da9113..39aa7aa66468 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
FTRACE_ENTRY_REG(call, struct_name, etype, \
PARAMS(tstruct), PARAMS(print), filter, NULL)
-int ftrace_event_is_function(struct trace_event_call *call)
+bool ftrace_event_is_function(struct trace_event_call *call)
{
return call == &event_function;
}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8968bf720c12..92382af7a213 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -83,13 +83,18 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
/* Display function name after trailing } */
{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
+ /* Include sleep time (scheduled out) between entry and return */
+ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
+ /* Include time within nested functions */
+ { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
/* Don't display overruns, proc, or tail by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
- TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
+ TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS |
+ TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME,
.opts = trace_opts
};
@@ -107,8 +112,8 @@ enum {
};
static void
-print_graph_duration(unsigned long long duration, struct trace_seq *s,
- u32 flags);
+print_graph_duration(struct trace_array *tr, unsigned long long duration,
+ struct trace_seq *s, u32 flags);
/* Add a function return address to the trace stack on thread info.*/
int
@@ -653,6 +658,7 @@ static void
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid, u32 flags)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -660,7 +666,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
addr >= (unsigned long)__irqentry_text_end)
return;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
/* Absolute time */
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
print_graph_abs_time(iter->ts, s);
@@ -676,19 +682,19 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
}
/* Latency format */
- if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
print_graph_lat_fmt(s, ent);
}
/* No overhead */
- print_graph_duration(0, s, flags | FLAGS_FILL_START);
+ print_graph_duration(tr, 0, s, flags | FLAGS_FILL_START);
if (type == TRACE_GRAPH_ENT)
trace_seq_puts(s, "==========>");
else
trace_seq_puts(s, "<==========");
- print_graph_duration(0, s, flags | FLAGS_FILL_END);
+ print_graph_duration(tr, 0, s, flags | FLAGS_FILL_END);
trace_seq_putc(s, '\n');
}
@@ -715,22 +721,22 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
trace_seq_printf(s, ".%s", nsecs_str);
- len += strlen(nsecs_str);
+ len += strlen(nsecs_str) + 1;
}
trace_seq_puts(s, " us ");
/* Print remaining spaces to fit the row's width */
- for (i = len; i < 7; i++)
+ for (i = len; i < 8; i++)
trace_seq_putc(s, ' ');
}
static void
-print_graph_duration(unsigned long long duration, struct trace_seq *s,
- u32 flags)
+print_graph_duration(struct trace_array *tr, unsigned long long duration,
+ struct trace_seq *s, u32 flags)
{
if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
- !(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
/* No real adata, just filling the column with spaces */
@@ -764,6 +770,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct trace_seq *s, u32 flags)
{
struct fgraph_data *data = iter->private;
+ struct trace_array *tr = iter->tr;
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
@@ -792,7 +799,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
/* Overhead and duration */
- print_graph_duration(duration, s, flags);
+ print_graph_duration(tr, duration, s, flags);
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
@@ -810,6 +817,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
{
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
+ struct trace_array *tr = iter->tr;
int i;
if (data) {
@@ -825,7 +833,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
}
/* No time */
- print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
+ print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL);
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
@@ -849,6 +857,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
{
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
+ struct trace_array *tr = iter->tr;
int cpu = iter->cpu;
/* Pid */
@@ -858,7 +867,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
/* Interrupt */
print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
- if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
/* Absolute time */
@@ -876,7 +885,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
}
/* Latency format */
- if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
print_graph_lat_fmt(s, ent);
return;
@@ -1027,6 +1036,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
{
unsigned long long duration = trace->rettime - trace->calltime;
struct fgraph_data *data = iter->private;
+ struct trace_array *tr = iter->tr;
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
@@ -1058,7 +1068,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
print_graph_prologue(iter, s, 0, 0, flags);
/* Overhead and duration */
- print_graph_duration(duration, s, flags);
+ print_graph_duration(tr, duration, s, flags);
/* Closing brace */
for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
@@ -1091,7 +1101,8 @@ static enum print_line_t
print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
struct trace_iterator *iter, u32 flags)
{
- unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ struct trace_array *tr = iter->tr;
+ unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK);
struct fgraph_data *data = iter->private;
struct trace_event *event;
int depth = 0;
@@ -1104,7 +1115,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
print_graph_prologue(iter, s, 0, 0, flags);
/* No time */
- print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
+ print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL);
/* Indentation */
if (depth > 0)
@@ -1245,9 +1256,10 @@ static void print_lat_header(struct seq_file *s, u32 flags)
seq_printf(s, "#%.*s||| / \n", size, spaces);
}
-static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
+static void __print_graph_headers_flags(struct trace_array *tr,
+ struct seq_file *s, u32 flags)
{
- int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
+ int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT;
if (lat)
print_lat_header(s, flags);
@@ -1289,11 +1301,12 @@ static void print_graph_headers(struct seq_file *s)
void print_graph_headers_flags(struct seq_file *s, u32 flags)
{
struct trace_iterator *iter = s->private;
+ struct trace_array *tr = iter->tr;
- if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
- if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+ if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) {
/* print nothing if the buffers are empty */
if (trace_empty(iter))
return;
@@ -1301,7 +1314,7 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
print_trace_header(s, iter);
}
- __print_graph_headers_flags(s, flags);
+ __print_graph_headers_flags(tr, s, flags);
}
void graph_trace_open(struct trace_iterator *iter)
@@ -1362,6 +1375,12 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
if (bit == TRACE_GRAPH_PRINT_IRQS)
ftrace_graph_skip_irqs = !set;
+ if (bit == TRACE_GRAPH_SLEEP_TIME)
+ ftrace_graph_sleep_time_control(set);
+
+ if (bit == TRACE_GRAPH_GRAPH_TIME)
+ ftrace_graph_graph_time_control(set);
+
return 0;
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 8523ea345f2b..e4e56589ec1d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -31,7 +31,6 @@ enum {
static int trace_type __read_mostly;
static int save_flags;
-static bool function_enabled;
static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -57,22 +56,16 @@ irq_trace(void)
# define irq_trace() (0)
#endif
-#define TRACE_DISPLAY_GRAPH 1
-
-static struct tracer_opt trace_opts[] = {
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- /* display latency trace as call graph */
- { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+static int irqsoff_display_graph(struct trace_array *tr, int set);
+# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
+#else
+static inline int irqsoff_display_graph(struct trace_array *tr, int set)
+{
+ return -EINVAL;
+}
+# define is_graph(tr) false
#endif
- { } /* Empty entry */
-};
-
-static struct tracer_flags tracer_flags = {
- .val = 0,
- .opts = trace_opts,
-};
-
-#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
/*
* Sequence count - we record it when starting a measurement and
@@ -152,15 +145,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int
-irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+static int irqsoff_display_graph(struct trace_array *tr, int set)
{
int cpu;
- if (!(bit & TRACE_DISPLAY_GRAPH))
- return -EINVAL;
-
- if (!(is_graph() ^ set))
+ if (!(is_graph(tr) ^ set))
return 0;
stop_irqsoff_tracer(irqsoff_trace, !set);
@@ -209,7 +198,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
static void irqsoff_trace_open(struct trace_iterator *iter)
{
- if (is_graph())
+ if (is_graph(iter->tr))
graph_trace_open(iter);
}
@@ -231,7 +220,7 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
* In graph mode call the graph tracer output function,
* otherwise go with the TRACE_FN event handler
*/
- if (is_graph())
+ if (is_graph(iter->tr))
return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
return TRACE_TYPE_UNHANDLED;
@@ -239,7 +228,9 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
static void irqsoff_print_header(struct seq_file *s)
{
- if (is_graph())
+ struct trace_array *tr = irqsoff_trace;
+
+ if (is_graph(tr))
print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
else
trace_default_header(s);
@@ -250,7 +241,7 @@ __trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
unsigned long flags, int pc)
{
- if (is_graph())
+ if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, flags, pc);
else
trace_function(tr, ip, parent_ip, flags, pc);
@@ -259,27 +250,23 @@ __trace_function(struct trace_array *tr,
#else
#define __trace_function trace_function
-static int
-irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
-{
- return -EINVAL;
-}
-
+#ifdef CONFIG_FUNCTION_TRACER
static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
{
return -1;
}
+#endif
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
{
return TRACE_TYPE_UNHANDLED;
}
-static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
static void irqsoff_trace_open(struct trace_iterator *iter) { }
static void irqsoff_trace_close(struct trace_iterator *iter) { }
#ifdef CONFIG_FUNCTION_TRACER
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
static void irqsoff_print_header(struct seq_file *s)
{
trace_default_header(s);
@@ -295,16 +282,16 @@ static void irqsoff_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static int report_latency(struct trace_array *tr, cycle_t delta)
+static bool report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
- return 0;
+ return false;
} else {
if (delta <= tr->max_latency)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static void
@@ -523,12 +510,15 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
+#ifdef CONFIG_FUNCTION_TRACER
+static bool function_enabled;
+
static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
{
int ret;
/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
- if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
return 0;
if (graph)
@@ -556,20 +546,40 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph)
function_enabled = false;
}
-static void irqsoff_function_set(struct trace_array *tr, int set)
+static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set)
{
+ if (!(mask & TRACE_ITER_FUNCTION))
+ return 0;
+
if (set)
- register_irqsoff_function(tr, is_graph(), 1);
+ register_irqsoff_function(tr, is_graph(tr), 1);
else
- unregister_irqsoff_function(tr, is_graph());
+ unregister_irqsoff_function(tr, is_graph(tr));
+ return 1;
+}
+#else
+static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
+{
+ return 0;
+}
+static void unregister_irqsoff_function(struct trace_array *tr, int graph) { }
+static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set)
+{
+ return 0;
}
+#endif /* CONFIG_FUNCTION_TRACER */
static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
{
struct tracer *tracer = tr->current_trace;
- if (mask & TRACE_ITER_FUNCTION)
- irqsoff_function_set(tr, set);
+ if (irqsoff_function_set(tr, mask, set))
+ return 0;
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (mask & TRACE_ITER_DISPLAY_GRAPH)
+ return irqsoff_display_graph(tr, set);
+#endif
return trace_keep_overwrite(tracer, mask, set);
}
@@ -602,7 +612,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
if (irqsoff_busy)
return -EBUSY;
- save_flags = trace_flags;
+ save_flags = tr->trace_flags;
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
@@ -618,7 +628,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
/* Only toplevel instance supports graph tracing */
if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
- is_graph())))
+ is_graph(tr))))
printk(KERN_ERR "failed to start irqsoff tracer\n");
irqsoff_busy = true;
@@ -630,7 +640,7 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
- stop_irqsoff_tracer(tr, is_graph());
+ stop_irqsoff_tracer(tr, is_graph(tr));
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
@@ -666,8 +676,6 @@ static struct tracer irqsoff_tracer __read_mostly =
.print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
- .flags = &tracer_flags,
- .set_flag = irqsoff_set_flag,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
@@ -700,8 +708,6 @@ static struct tracer preemptoff_tracer __read_mostly =
.print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
- .flags = &tracer_flags,
- .set_flag = irqsoff_set_flag,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
@@ -736,8 +742,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
- .flags = &tracer_flags,
- .set_flag = irqsoff_set_flag,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3ccf5c2c1320..57149bce6aad 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -21,20 +21,22 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
+ struct trace_array *tr;
unsigned int old_userobj;
int cnt = 0, cpu;
trace_init_global_iter(&iter);
iter.buffer_iter = buffer_iter;
+ tr = iter.tr;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- old_userobj = trace_flags;
+ old_userobj = tr->trace_flags;
/* don't look at user memory in panic mode */
- trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
kdb_printf("Dumping ftrace buffer:\n");
@@ -82,7 +84,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
kdb_printf("---------------------------------\n");
out:
- trace_flags = old_userobj;
+ tr->trace_flags = old_userobj;
for_each_tracing_cpu(cpu) {
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b7d0cdd9906c..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
void *addr, void *dest)
{
- long ret;
int maxlen = get_rloc_len(*(u32 *)dest);
u8 *dst = get_rloc_data(dest);
- u8 *src = addr;
- mm_segment_t old_fs = get_fs();
+ long ret;
if (!maxlen)
return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
* Try to get string again, since the string can be changed while
* probing.
*/
- set_fs(KERNEL_DS);
- pagefault_disable();
-
- do
- ret = __copy_from_user_inatomic(dst++, src++, 1);
- while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
-
- dst[-1] = '\0';
- pagefault_enable();
- set_fs(old_fs);
+ ret = strncpy_from_unsafe(dst, addr, maxlen);
if (ret < 0) { /* Failed to fetch string */
- ((u8 *)get_rloc_data(dest))[0] = '\0';
+ dst[0] = '\0';
*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
} else {
- *(u32 *)dest = make_data_rloc(src - (u8 *)addr,
- get_rloc_offs(*(u32 *)dest));
+ *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
}
}
NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 638e110c5bfd..2be8c4f2403d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -314,7 +314,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry->rw = *rw;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -344,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry->map = *map;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index dfab253727dc..282982195e09 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -322,8 +322,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
# define IP_FMT "%016lx"
#endif
-int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
- unsigned long ip, unsigned long sym_flags)
+static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+ unsigned long ip, unsigned long sym_flags)
{
struct file *file = NULL;
unsigned long vmstart = 0;
@@ -355,50 +355,6 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
}
int
-seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
- unsigned long sym_flags)
-{
- struct mm_struct *mm = NULL;
- unsigned int i;
-
- if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
- struct task_struct *task;
- /*
- * we do the lookup on the thread group leader,
- * since individual threads might have already quit!
- */
- rcu_read_lock();
- task = find_task_by_vpid(entry->tgid);
- if (task)
- mm = get_task_mm(task);
- rcu_read_unlock();
- }
-
- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- unsigned long ip = entry->caller[i];
-
- if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
- break;
-
- trace_seq_puts(s, " => ");
-
- if (!ip) {
- trace_seq_puts(s, "??");
- trace_seq_putc(s, '\n');
- continue;
- }
-
- seq_print_user_ip(s, mm, ip, sym_flags);
- trace_seq_putc(s, '\n');
- }
-
- if (mm)
- mmput(mm);
-
- return !trace_seq_has_overflowed(s);
-}
-
-int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
{
if (!ip) {
@@ -496,6 +452,8 @@ static const struct trace_mark {
char sym;
} mark[] = {
MARK(1000000000ULL , '$'), /* 1 sec */
+ MARK(100000000ULL , '@'), /* 100 msec */
+ MARK(10000000ULL , '*'), /* 10 msec */
MARK(1000000ULL , '#'), /* 1000 usecs */
MARK(100000ULL , '!'), /* 100 usecs */
MARK(10000ULL , '+'), /* 10 usecs */
@@ -508,7 +466,7 @@ char trace_find_mark(unsigned long long d)
int size = ARRAY_SIZE(mark);
for (i = 0; i < size; i++) {
- if (d >= mark[i].val)
+ if (d > mark[i].val)
break;
}
@@ -518,7 +476,8 @@ char trace_find_mark(unsigned long long d)
static int
lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
{
- unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
+ struct trace_array *tr = iter->tr;
+ unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE;
unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
unsigned long long rel_ts = next_ts - iter->ts;
@@ -561,6 +520,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
int trace_print_context(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
unsigned long long t;
@@ -572,7 +532,7 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "%16s-%-5d [%03d] ",
comm, entry->pid, iter->cpu);
- if (trace_flags & TRACE_ITER_IRQ_INFO)
+ if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
@@ -588,14 +548,15 @@ int trace_print_context(struct trace_iterator *iter)
int trace_print_lat_context(struct trace_iterator *iter)
{
- u64 next_ts;
+ struct trace_array *tr = iter->tr;
/* trace_find_next_entry will reset ent_size */
int ent_size = iter->ent_size;
struct trace_seq *s = &iter->seq;
+ u64 next_ts;
struct trace_entry *entry = iter->ent,
*next_entry = trace_find_next_entry(iter, NULL,
&next_ts);
- unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+ unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE);
/* Restore the original ent_size */
iter->ent_size = ent_size;
@@ -1077,13 +1038,49 @@ static struct trace_event trace_stack_event = {
static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
int flags, struct trace_event *event)
{
+ struct trace_array *tr = iter->tr;
struct userstack_entry *field;
struct trace_seq *s = &iter->seq;
+ struct mm_struct *mm = NULL;
+ unsigned int i;
trace_assign_type(field, iter->ent);
trace_seq_puts(s, "<user stack trace>\n");
- seq_print_userip_objs(field, s, flags);
+
+ if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) {
+ struct task_struct *task;
+ /*
+ * we do the lookup on the thread group leader,
+ * since individual threads might have already quit!
+ */
+ rcu_read_lock();
+ task = find_task_by_vpid(field->tgid);
+ if (task)
+ mm = get_task_mm(task);
+ rcu_read_unlock();
+ }
+
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ unsigned long ip = field->caller[i];
+
+ if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
+ break;
+
+ trace_seq_puts(s, " => ");
+
+ if (!ip) {
+ trace_seq_puts(s, "??");
+ trace_seq_putc(s, '\n');
+ continue;
+ }
+
+ seq_print_user_ip(s, mm, ip, flags);
+ trace_seq_putc(s, '\n');
+ }
+
+ if (mm)
+ mmput(mm);
return trace_handle_return(s);
}
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 4cbfe85b99c8..fabc49bcd493 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -14,10 +14,6 @@ trace_print_printk_msg_only(struct trace_iterator *iter);
extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
-extern int seq_print_userip_objs(const struct userstack_entry *entry,
- struct trace_seq *s, unsigned long sym_flags);
-extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
- unsigned long ip, unsigned long sym_flags);
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 36c1455b7567..1c2b28536feb 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -178,6 +178,12 @@ static inline void format_mod_start(void) { }
static inline void format_mod_stop(void) { }
#endif /* CONFIG_MODULES */
+static bool __read_mostly trace_printk_enabled = true;
+
+void trace_printk_control(bool enabled)
+{
+ trace_printk_enabled = enabled;
+}
__initdata_or_module static
struct notifier_block module_trace_bprintk_format_nb = {
@@ -192,7 +198,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...)
if (unlikely(!fmt))
return 0;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!trace_printk_enabled)
return 0;
va_start(ap, fmt);
@@ -207,7 +213,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
if (unlikely(!fmt))
return 0;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!trace_printk_enabled)
return 0;
return trace_vbprintk(ip, fmt, ap);
@@ -219,7 +225,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...)
int ret;
va_list ap;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!trace_printk_enabled)
return 0;
va_start(ap, fmt);
@@ -231,7 +237,7 @@ EXPORT_SYMBOL_GPL(__trace_printk);
int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
{
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!trace_printk_enabled)
return 0;
return trace_vprintk(ip, fmt, ap);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index b98dee914542..f6398db09114 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -302,15 +302,15 @@ static nokprobe_inline void call_fetch(struct fetch_param *fprm,
}
/* Check the name is good for event/group/fields */
-static inline int is_good_name(const char *name)
+static inline bool is_good_name(const char *name)
{
if (!isalpha(*name) && *name != '_')
- return 0;
+ return false;
while (*++name != '\0') {
if (!isalpha(*name) && !isdigit(*name) && *name != '_')
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static inline struct event_file_link *
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 419ca37e72c9..4c896a0101bd 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@ static int sched_ref;
static DEFINE_MUTEX(sched_register_mutex);
static void
-probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
+probe_sched_switch(void *ignore, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
{
if (unlikely(!sched_ref))
return;
@@ -26,7 +27,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
}
static void
-probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
+probe_sched_wakeup(void *ignore, struct task_struct *wakee)
{
if (unlikely(!sched_ref))
return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9b33dd117f3f..9d4399b553a3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -34,31 +34,28 @@ static arch_spinlock_t wakeup_lock =
static void wakeup_reset(struct trace_array *tr);
static void __wakeup_reset(struct trace_array *tr);
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
-static void wakeup_graph_return(struct ftrace_graph_ret *trace);
static int save_flags;
-static bool function_enabled;
-
-#define TRACE_DISPLAY_GRAPH 1
-static struct tracer_opt trace_opts[] = {
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- /* display latency trace as call graph */
- { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+static int wakeup_display_graph(struct trace_array *tr, int set);
+# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
+#else
+static inline int wakeup_display_graph(struct trace_array *tr, int set)
+{
+ return 0;
+}
+# define is_graph(tr) false
#endif
- { } /* Empty entry */
-};
-
-static struct tracer_flags tracer_flags = {
- .val = 0,
- .opts = trace_opts,
-};
-#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
#ifdef CONFIG_FUNCTION_TRACER
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
+
+static bool function_enabled;
+
/*
* Prologue for the wakeup function tracers.
*
@@ -128,14 +125,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
atomic_dec(&data->disabled);
preempt_enable_notrace();
}
-#endif /* CONFIG_FUNCTION_TRACER */
static int register_wakeup_function(struct trace_array *tr, int graph, int set)
{
int ret;
/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
- if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
return 0;
if (graph)
@@ -163,20 +159,40 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph)
function_enabled = false;
}
-static void wakeup_function_set(struct trace_array *tr, int set)
+static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
{
+ if (!(mask & TRACE_ITER_FUNCTION))
+ return 0;
+
if (set)
- register_wakeup_function(tr, is_graph(), 1);
+ register_wakeup_function(tr, is_graph(tr), 1);
else
- unregister_wakeup_function(tr, is_graph());
+ unregister_wakeup_function(tr, is_graph(tr));
+ return 1;
+}
+#else
+static int register_wakeup_function(struct trace_array *tr, int graph, int set)
+{
+ return 0;
+}
+static void unregister_wakeup_function(struct trace_array *tr, int graph) { }
+static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
+{
+ return 0;
}
+#endif /* CONFIG_FUNCTION_TRACER */
static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
{
struct tracer *tracer = tr->current_trace;
- if (mask & TRACE_ITER_FUNCTION)
- wakeup_function_set(tr, set);
+ if (wakeup_function_set(tr, mask, set))
+ return 0;
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (mask & TRACE_ITER_DISPLAY_GRAPH)
+ return wakeup_display_graph(tr, set);
+#endif
return trace_keep_overwrite(tracer, mask, set);
}
@@ -203,14 +219,9 @@ static void stop_func_tracer(struct trace_array *tr, int graph)
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int
-wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+static int wakeup_display_graph(struct trace_array *tr, int set)
{
-
- if (!(bit & TRACE_DISPLAY_GRAPH))
- return -EINVAL;
-
- if (!(is_graph() ^ set))
+ if (!(is_graph(tr) ^ set))
return 0;
stop_func_tracer(tr, !set);
@@ -259,7 +270,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
static void wakeup_trace_open(struct trace_iterator *iter)
{
- if (is_graph())
+ if (is_graph(iter->tr))
graph_trace_open(iter);
}
@@ -279,7 +290,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
* In graph mode call the graph tracer output function,
* otherwise go with the TRACE_FN event handler
*/
- if (is_graph())
+ if (is_graph(iter->tr))
return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
return TRACE_TYPE_UNHANDLED;
@@ -287,7 +298,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
static void wakeup_print_header(struct seq_file *s)
{
- if (is_graph())
+ if (is_graph(wakeup_trace))
print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
else
trace_default_header(s);
@@ -298,7 +309,7 @@ __trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
unsigned long flags, int pc)
{
- if (is_graph())
+ if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, flags, pc);
else
trace_function(tr, ip, parent_ip, flags, pc);
@@ -306,27 +317,20 @@ __trace_function(struct trace_array *tr,
#else
#define __trace_function trace_function
-static int
-wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
-{
- return -EINVAL;
-}
-
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
-{
- return -1;
-}
-
static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
{
return TRACE_TYPE_UNHANDLED;
}
-static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
static void wakeup_trace_open(struct trace_iterator *iter) { }
static void wakeup_trace_close(struct trace_iterator *iter) { }
#ifdef CONFIG_FUNCTION_TRACER
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+ return -1;
+}
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
static void wakeup_print_header(struct seq_file *s)
{
trace_default_header(s);
@@ -342,16 +346,16 @@ static void wakeup_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static int report_latency(struct trace_array *tr, cycle_t delta)
+static bool report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
- return 0;
+ return false;
} else {
if (delta <= tr->max_latency)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static void
@@ -388,7 +392,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
}
static void
@@ -416,11 +420,11 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
}
static void notrace
-probe_wakeup_sched_switch(void *ignore,
+probe_wakeup_sched_switch(void *ignore, bool preempt,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
@@ -514,7 +518,7 @@ static void wakeup_reset(struct trace_array *tr)
}
static void
-probe_wakeup(void *ignore, struct task_struct *p, int success)
+probe_wakeup(void *ignore, struct task_struct *p)
{
struct trace_array_cpu *data;
int cpu = smp_processor_id();
@@ -635,7 +639,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
*/
smp_wmb();
- if (start_func_tracer(tr, is_graph()))
+ if (start_func_tracer(tr, is_graph(tr)))
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
@@ -648,7 +652,7 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
- stop_func_tracer(tr, is_graph());
+ stop_func_tracer(tr, is_graph(tr));
unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -659,7 +663,7 @@ static bool wakeup_busy;
static int __wakeup_tracer_init(struct trace_array *tr)
{
- save_flags = trace_flags;
+ save_flags = tr->trace_flags;
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
@@ -740,8 +744,6 @@ static struct tracer wakeup_tracer __read_mostly =
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
- .flags = &tracer_flags,
- .set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
@@ -762,8 +764,6 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
- .flags = &tracer_flags,
- .set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
@@ -784,8 +784,6 @@ static struct tracer wakeup_dl_tracer __read_mostly =
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
- .flags = &tracer_flags,
- .set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3f34496244e9..dda9e6742950 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -16,30 +16,22 @@
#include "trace.h"
-#define STACK_TRACE_ENTRIES 500
-
-#ifdef CC_USING_FENTRY
-# define fentry 1
-#else
-# define fentry 0
-#endif
-
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
{ [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
-static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
+unsigned stack_trace_index[STACK_TRACE_ENTRIES];
/*
* Reserve one entry for the passed in ip. This will allow
* us to remove most or all of the stack size overhead
* added by the stack tracer itself.
*/
-static struct stack_trace max_stack_trace = {
+struct stack_trace stack_trace_max = {
.max_entries = STACK_TRACE_ENTRIES - 1,
- .entries = &stack_dump_trace[1],
+ .entries = &stack_dump_trace[0],
};
-static unsigned long max_stack_size;
-static arch_spinlock_t max_stack_lock =
+unsigned long stack_trace_max_size;
+arch_spinlock_t stack_trace_max_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
static DEFINE_PER_CPU(int, trace_active);
@@ -48,83 +40,95 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
static int last_stack_tracer_enabled;
-static inline void print_max_stack(void)
+void stack_trace_print(void)
{
long i;
int size;
pr_emerg(" Depth Size Location (%d entries)\n"
" ----- ---- --------\n",
- max_stack_trace.nr_entries - 1);
+ stack_trace_max.nr_entries);
- for (i = 0; i < max_stack_trace.nr_entries; i++) {
+ for (i = 0; i < stack_trace_max.nr_entries; i++) {
if (stack_dump_trace[i] == ULONG_MAX)
break;
- if (i+1 == max_stack_trace.nr_entries ||
+ if (i+1 == stack_trace_max.nr_entries ||
stack_dump_trace[i+1] == ULONG_MAX)
- size = stack_dump_index[i];
+ size = stack_trace_index[i];
else
- size = stack_dump_index[i] - stack_dump_index[i+1];
+ size = stack_trace_index[i] - stack_trace_index[i+1];
- pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i],
+ pr_emerg("%3ld) %8d %5d %pS\n", i, stack_trace_index[i],
size, (void *)stack_dump_trace[i]);
}
}
-static inline void
+/*
+ * When arch-specific code overrides this function, the following
+ * data should be filled up, assuming stack_trace_max_lock is held to
+ * prevent concurrent updates.
+ * stack_trace_index[]
+ * stack_trace_max
+ * stack_trace_max_size
+ */
+void __weak
check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
int frame_size = ACCESS_ONCE(tracer_frame);
- int i;
+ int i, x;
this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
this_size = THREAD_SIZE - this_size;
/* Remove the frame of the tracer */
this_size -= frame_size;
- if (this_size <= max_stack_size)
+ if (this_size <= stack_trace_max_size)
return;
/* we do not handle interrupt stacks yet */
if (!object_is_on_stack(stack))
return;
+ /* Can't do this from NMI context (can cause deadlocks) */
+ if (in_nmi())
+ return;
+
local_irq_save(flags);
- arch_spin_lock(&max_stack_lock);
+ arch_spin_lock(&stack_trace_max_lock);
+
+ /*
+ * RCU may not be watching, make it see us.
+ * The stack trace code uses rcu_sched.
+ */
+ rcu_irq_enter();
/* In case another CPU set the tracer_frame on us */
if (unlikely(!frame_size))
this_size -= tracer_frame;
/* a race could have already updated it */
- if (this_size <= max_stack_size)
+ if (this_size <= stack_trace_max_size)
goto out;
- max_stack_size = this_size;
+ stack_trace_max_size = this_size;
- max_stack_trace.nr_entries = 0;
+ stack_trace_max.nr_entries = 0;
+ stack_trace_max.skip = 3;
- if (using_ftrace_ops_list_func())
- max_stack_trace.skip = 4;
- else
- max_stack_trace.skip = 3;
+ save_stack_trace(&stack_trace_max);
- save_stack_trace(&max_stack_trace);
-
- /*
- * Add the passed in ip from the function tracer.
- * Searching for this on the stack will skip over
- * most of the overhead from the stack tracer itself.
- */
- stack_dump_trace[0] = ip;
- max_stack_trace.nr_entries++;
+ /* Skip over the overhead of the stack tracer itself */
+ for (i = 0; i < stack_trace_max.nr_entries; i++) {
+ if (stack_dump_trace[i] == ip)
+ break;
+ }
/*
* Now find where in the stack these are.
*/
- i = 0;
+ x = 0;
start = stack;
top = (unsigned long *)
(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -136,15 +140,18 @@ check_stack(unsigned long ip, unsigned long *stack)
* loop will only happen once. This code only takes place
* on a new max, so it is far from a fast path.
*/
- while (i < max_stack_trace.nr_entries) {
+ while (i < stack_trace_max.nr_entries) {
int found = 0;
- stack_dump_index[i] = this_size;
+ stack_trace_index[x] = this_size;
p = start;
- for (; p < top && i < max_stack_trace.nr_entries; p++) {
+ for (; p < top && i < stack_trace_max.nr_entries; p++) {
+ if (stack_dump_trace[i] == ULONG_MAX)
+ break;
if (*p == stack_dump_trace[i]) {
- this_size = stack_dump_index[i++] =
+ stack_dump_trace[x] = stack_dump_trace[i++];
+ this_size = stack_trace_index[x++] =
(top - p) * sizeof(unsigned long);
found = 1;
/* Start the search from here */
@@ -156,10 +163,10 @@ check_stack(unsigned long ip, unsigned long *stack)
* out what that is, then figure it out
* now.
*/
- if (unlikely(!tracer_frame) && i == 1) {
+ if (unlikely(!tracer_frame)) {
tracer_frame = (p - stack) *
sizeof(unsigned long);
- max_stack_size -= tracer_frame;
+ stack_trace_max_size -= tracer_frame;
}
}
}
@@ -168,13 +175,18 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
+ stack_trace_max.nr_entries = x;
+ for (; x < i; x++)
+ stack_dump_trace[x] = ULONG_MAX;
+
if (task_stack_end_corrupted(current)) {
- print_max_stack();
+ stack_trace_print();
BUG();
}
out:
- arch_spin_unlock(&max_stack_lock);
+ rcu_irq_exit();
+ arch_spin_unlock(&stack_trace_max_lock);
local_irq_restore(flags);
}
@@ -192,24 +204,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (per_cpu(trace_active, cpu)++ != 0)
goto out;
- /*
- * When fentry is used, the traced function does not get
- * its stack frame set up, and we lose the parent.
- * The ip is pretty useless because the function tracer
- * was called before that function set up its stack frame.
- * In this case, we use the parent ip.
- *
- * By adding the return address of either the parent ip
- * or the current ip we can disregard most of the stack usage
- * caused by the stack tracer itself.
- *
- * The function tracer always reports the address of where the
- * mcount call was, but the stack will hold the return address.
- */
- if (fentry)
- ip = parent_ip;
- else
- ip += MCOUNT_INSN_SIZE;
+ ip += MCOUNT_INSN_SIZE;
check_stack(ip, &stack);
@@ -262,9 +257,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
cpu = smp_processor_id();
per_cpu(trace_active, cpu)++;
- arch_spin_lock(&max_stack_lock);
+ arch_spin_lock(&stack_trace_max_lock);
*ptr = val;
- arch_spin_unlock(&max_stack_lock);
+ arch_spin_unlock(&stack_trace_max_lock);
per_cpu(trace_active, cpu)--;
local_irq_restore(flags);
@@ -284,7 +279,7 @@ __next(struct seq_file *m, loff_t *pos)
{
long n = *pos - 1;
- if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+ if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
return NULL;
m->private = (void *)n;
@@ -307,7 +302,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
cpu = smp_processor_id();
per_cpu(trace_active, cpu)++;
- arch_spin_lock(&max_stack_lock);
+ arch_spin_lock(&stack_trace_max_lock);
if (*pos == 0)
return SEQ_START_TOKEN;
@@ -319,7 +314,7 @@ static void t_stop(struct seq_file *m, void *p)
{
int cpu;
- arch_spin_unlock(&max_stack_lock);
+ arch_spin_unlock(&stack_trace_max_lock);
cpu = smp_processor_id();
per_cpu(trace_active, cpu)--;
@@ -354,9 +349,9 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
- max_stack_trace.nr_entries - 1);
+ stack_trace_max.nr_entries);
- if (!stack_tracer_enabled && !max_stack_size)
+ if (!stack_tracer_enabled && !stack_trace_max_size)
print_disabled(m);
return 0;
@@ -364,17 +359,17 @@ static int t_show(struct seq_file *m, void *v)
i = *(long *)v;
- if (i >= max_stack_trace.nr_entries ||
+ if (i >= stack_trace_max.nr_entries ||
stack_dump_trace[i] == ULONG_MAX)
return 0;
- if (i+1 == max_stack_trace.nr_entries ||
+ if (i+1 == stack_trace_max.nr_entries ||
stack_dump_trace[i+1] == ULONG_MAX)
- size = stack_dump_index[i];
+ size = stack_trace_index[i];
else
- size = stack_dump_index[i] - stack_dump_index[i+1];
+ size = stack_trace_index[i] - stack_trace_index[i+1];
- seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size);
+ seq_printf(m, "%3ld) %8d %5d ", i, stack_trace_index[i], size);
trace_lookup_stack(m, i);
@@ -464,7 +459,7 @@ static __init int stack_trace_init(void)
return 0;
trace_create_file("stack_max_size", 0644, d_tracer,
- &max_stack_size, &stack_max_size_fops);
+ &stack_trace_max_size, &stack_max_size_fops);
trace_create_file("stack_trace", 0444, d_tracer,
NULL, &stack_trace_fops);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7d567a4b9fa7..0655afbea83f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -110,6 +110,7 @@ static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
@@ -136,7 +137,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
/* parameter types */
- if (trace_flags & TRACE_ITER_VERBOSE)
+ if (tr->trace_flags & TRACE_ITER_VERBOSE)
trace_seq_printf(s, "%s ", entry->types[i]);
/* parameter values */
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index aa1ea7b36fa8..d2f6d0be3503 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v)
seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
trace_event_name(&tu->tp.call));
- seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
+ seq_printf(m, " %s:", tu->filename);
+
+ /* Don't print "0x (null)" when offset is 0 */
+ if (tu->offset) {
+ seq_printf(m, "0x%p", (void *)tu->offset);
+ } else {
+ switch (sizeof(void *)) {
+ case 4:
+ seq_printf(m, "0x00000000");
+ break;
+ case 8:
+ default:
+ seq_printf(m, "0x0000000000000000");
+ break;
+ }
+ }
for (i = 0; i < tu->tp.nr_args; i++)
seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
@@ -1095,11 +1110,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
{
struct trace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
+ struct bpf_prog *prog = call->prog;
struct hlist_head *head;
void *data;
int size, esize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
size = esize + tu->tp.size + dsize;
@@ -1289,6 +1308,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
return -ENODEV;
}
+ call->flags = TRACE_EVENT_FL_UPROBE;
call->class->reg = trace_uprobe_register;
call->data = tu;
ret = trace_add_event_call(call);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 3490407dc7b7..ecd536de603a 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -91,11 +91,13 @@ static void debug_print_probes(struct tracepoint_func *funcs)
printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func);
}
-static struct tracepoint_func *func_add(struct tracepoint_func **funcs,
- struct tracepoint_func *tp_func)
+static struct tracepoint_func *
+func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
+ int prio)
{
- int nr_probes = 0;
struct tracepoint_func *old, *new;
+ int nr_probes = 0;
+ int pos = -1;
if (WARN_ON(!tp_func->func))
return ERR_PTR(-EINVAL);
@@ -104,18 +106,33 @@ static struct tracepoint_func *func_add(struct tracepoint_func **funcs,
old = *funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
- for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ /* Insert before probes of lower priority */
+ if (pos < 0 && old[nr_probes].prio < prio)
+ pos = nr_probes;
if (old[nr_probes].func == tp_func->func &&
old[nr_probes].data == tp_func->data)
return ERR_PTR(-EEXIST);
+ }
}
/* + 2 : one for new probe, one for NULL func */
new = allocate_probes(nr_probes + 2);
if (new == NULL)
return ERR_PTR(-ENOMEM);
- if (old)
- memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
- new[nr_probes] = *tp_func;
+ if (old) {
+ if (pos < 0) {
+ pos = nr_probes;
+ memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
+ } else {
+ /* Copy higher priority probes ahead of the new probe */
+ memcpy(new, old, pos * sizeof(struct tracepoint_func));
+ /* Copy the rest after it. */
+ memcpy(new + pos + 1, old + pos,
+ (nr_probes - pos) * sizeof(struct tracepoint_func));
+ }
+ } else
+ pos = 0;
+ new[pos] = *tp_func;
new[nr_probes + 1].func = NULL;
*funcs = new;
debug_print_probes(*funcs);
@@ -174,7 +191,7 @@ static void *func_remove(struct tracepoint_func **funcs,
* Add the probe function to a tracepoint.
*/
static int tracepoint_add_func(struct tracepoint *tp,
- struct tracepoint_func *func)
+ struct tracepoint_func *func, int prio)
{
struct tracepoint_func *old, *tp_funcs;
@@ -183,7 +200,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
- old = func_add(&tp_funcs, func);
+ old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
WARN_ON_ONCE(1);
return PTR_ERR(old);
@@ -240,6 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
* @tp: tracepoint
* @probe: probe handler
* @data: tracepoint data
+ * @prio: priority of this function over other registered functions
*
* Returns 0 if ok, error value on error.
* Note: if @tp is within a module, the caller is responsible for
@@ -247,7 +265,8 @@ static int tracepoint_remove_func(struct tracepoint *tp,
* performed either with a tracepoint module going notifier, or from
* within module exit functions.
*/
-int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
+int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
+ void *data, int prio)
{
struct tracepoint_func tp_func;
int ret;
@@ -255,10 +274,30 @@ int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
mutex_lock(&tracepoints_mutex);
tp_func.func = probe;
tp_func.data = data;
- ret = tracepoint_add_func(tp, &tp_func);
+ tp_func.prio = prio;
+ ret = tracepoint_add_func(tp, &tp_func, prio);
mutex_unlock(&tracepoints_mutex);
return ret;
}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio);
+
+/**
+ * tracepoint_probe_register - Connect a probe to a tracepoint
+ * @tp: tracepoint
+ * @probe: probe handler
+ * @data: tracepoint data
+ *
+ * The probe is registered with priority TRACEPOINT_DEFAULT_PRIO.
+ *
+ * Returns 0 if ok, error value on error.
+ * Note: if @tp is within a module, the caller is responsible for
+ * unregistering the probe before the module is gone. This can be
+ * performed either with a tracepoint module going notifier, or from
+ * within module exit functions.
+ */
+int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
+{
+ return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO);
+}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
/**
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4109f8320684..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->cap_inheritable = CAP_EMPTY_SET;
cred->cap_permitted = CAP_FULL_SET;
cred->cap_effective = CAP_FULL_SET;
+ cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
@@ -976,8 +977,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
if (user_ns == current_user_ns())
return -EINVAL;
- /* Threaded processes may not enter a different user namespace */
- if (atomic_read(&current->mm->mm_users) > 1)
+ /* Tasks that share a thread group must share a user namespace */
+ if (!thread_group_empty(current))
return -EINVAL;
if (current->fs->users != 1)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..18f34cf75f74 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
+#include <linux/kthread.h>
/*
* The run state of the lockup detectors is controlled by the content of the
@@ -56,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
+#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -66,7 +69,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+/*
+ * The 'watchdog_running' variable is set to 1 when the watchdog threads
+ * are registered/started and is set to 0 when the watchdog threads are
+ * unregistered/stopped, so it is an indicator whether the threads exist.
+ */
static int __read_mostly watchdog_running;
+/*
+ * If a subsystem has a need to deactivate the watchdog temporarily, it
+ * can use the suspend/resume interface to achieve this. The content of
+ * the 'watchdog_suspended' variable reflects this state. Existing threads
+ * are parked/unparked by the lockup_detector_{suspend|resume} functions
+ * (see comment blocks pertaining to those functions for further details).
+ *
+ * 'watchdog_suspended' also prevents threads from being registered/started
+ * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
+ * of 'watchdog_running' cannot change while the watchdog is deactivated
+ * temporarily (see related code in 'proc' handlers).
+ */
+static int __read_mostly watchdog_suspended;
+
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -90,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn;
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int hardlockup_panic =
+unsigned int __read_mostly hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -153,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_hardlockup_all_cpu_backtrace =
+ !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
#endif
/*
@@ -243,15 +273,15 @@ void touch_softlockup_watchdog_sync(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
-static int is_hardlockup(void)
+static bool is_hardlockup(void)
{
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
- return 1;
+ return true;
__this_cpu_write(hrtimer_interrupts_saved, hrint);
- return 0;
+ return false;
}
#endif
@@ -259,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
- if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
+ if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
/* Warn about unreasonable delays. */
if (time_after(now, touch_ts + get_softlockup_thresh()))
return now - touch_ts;
@@ -298,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
*/
if (is_hardlockup()) {
int this_cpu = smp_processor_id();
+ struct pt_regs *regs = get_irq_regs();
/* only print hardlockups once */
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %d",
- this_cpu);
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
else
- WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
- this_cpu);
+ dump_stack();
+
+ /*
+ * Perform all-CPU dump only once to avoid multiple hardlockups
+ * generating interleaving traces
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace &&
+ !test_and_set_bit(0, &hardlockup_allcpu_dumped))
+ trigger_allbutself_cpu_backtrace();
+
+ if (hardlockup_panic)
+ panic("Hard LOCKUP");
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -327,6 +370,9 @@ static void watchdog_interrupt_count(void)
static int watchdog_nmi_enable(unsigned int cpu);
static void watchdog_nmi_disable(unsigned int cpu);
+static int watchdog_enable_all_cpus(void);
+static void watchdog_disable_all_cpus(void);
+
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@@ -613,46 +659,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
}
}
-void watchdog_nmi_enable_all(void)
-{
- int cpu;
-
- mutex_lock(&watchdog_proc_mutex);
-
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto unlock;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- watchdog_nmi_enable(cpu);
- put_online_cpus();
-
-unlock:
- mutex_unlock(&watchdog_proc_mutex);
-}
-
-void watchdog_nmi_disable_all(void)
-{
- int cpu;
-
- mutex_lock(&watchdog_proc_mutex);
-
- if (!watchdog_running)
- goto unlock;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- watchdog_nmi_disable(cpu);
- put_online_cpus();
-
-unlock:
- mutex_unlock(&watchdog_proc_mutex);
-}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
-void watchdog_nmi_enable_all(void) {}
-void watchdog_nmi_disable_all(void) {}
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +675,107 @@ static struct smp_hotplug_thread watchdog_threads = {
.unpark = watchdog_enable,
};
-static void restart_watchdog_hrtimer(void *info)
+/*
+ * park all watchdog threads that are specified in 'watchdog_cpumask'
+ *
+ * This function returns an error if kthread_park() of a watchdog thread
+ * fails. In this situation, the watchdog threads of some CPUs can already
+ * be parked and the watchdog threads of other CPUs can still be runnable.
+ * Callers are expected to handle this special condition as appropriate in
+ * their context.
+ *
+ * This function may only be called in a context that is protected against
+ * races with CPU hotplug - for example, via get_online_cpus().
+ */
+static int watchdog_park_threads(void)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
- int ret;
+ int cpu, ret = 0;
+ for_each_watchdog_cpu(cpu) {
+ ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * unpark all watchdog threads that are specified in 'watchdog_cpumask'
+ *
+ * This function may only be called in a context that is protected against
+ * races with CPU hotplug - for example, via get_online_cpus().
+ */
+static void watchdog_unpark_threads(void)
+{
+ int cpu;
+
+ for_each_watchdog_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+}
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+ int ret = 0;
+
+ get_online_cpus();
+ mutex_lock(&watchdog_proc_mutex);
/*
- * No need to cancel and restart hrtimer if it is currently executing
- * because it will reprogram itself with the new period now.
- * We should never see it unqueued here because we are running per-cpu
- * with interrupts disabled.
+ * Multiple suspend requests can be active in parallel (counted by
+ * the 'watchdog_suspended' variable). If the watchdog threads are
+ * running, the first caller takes care that they will be parked.
+ * The state of 'watchdog_running' cannot change while a suspend
+ * request is active (see related code in 'proc' handlers).
*/
- ret = hrtimer_try_to_cancel(hrtimer);
- if (ret == 1)
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
+ if (watchdog_running && !watchdog_suspended)
+ ret = watchdog_park_threads();
+
+ if (ret == 0)
+ watchdog_suspended++;
+ else {
+ watchdog_disable_all_cpus();
+ pr_err("Failed to suspend lockup detectors, disabled\n");
+ watchdog_enabled = 0;
+ }
+
+ mutex_unlock(&watchdog_proc_mutex);
+
+ return ret;
}
-static void update_watchdog(int cpu)
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
{
+ mutex_lock(&watchdog_proc_mutex);
+
+ watchdog_suspended--;
/*
- * Make sure that perf event counter will adopt to a new
- * sampling period. Updating the sampling period directly would
- * be much nicer but we do not have an API for that now so
- * let's use a big hammer.
- * Hrtimer will adopt the new period on the next tick but this
- * might be late already so we have to restart the timer as well.
+ * The watchdog threads are unparked if they were previously running
+ * and if there is no more active suspend request.
*/
- watchdog_nmi_disable(cpu);
- smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
- watchdog_nmi_enable(cpu);
+ if (watchdog_running && !watchdog_suspended)
+ watchdog_unpark_threads();
+
+ mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
}
-static void update_watchdog_all_cpus(void)
+static int update_watchdog_all_cpus(void)
{
- int cpu;
+ int ret;
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- update_watchdog(cpu);
- put_online_cpus();
+ ret = watchdog_park_threads();
+ if (ret)
+ return ret;
+
+ watchdog_unpark_threads();
+
+ return 0;
}
static int watchdog_enable_all_cpus(void)
@@ -713,29 +783,31 @@ static int watchdog_enable_all_cpus(void)
int err = 0;
if (!watchdog_running) {
- err = smpboot_register_percpu_thread(&watchdog_threads);
+ err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+ &watchdog_cpumask);
if (err)
pr_err("Failed to create watchdog threads, disabled\n");
- else {
- if (smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask))
- pr_err("Failed to set cpumask for watchdog threads\n");
+ else
watchdog_running = 1;
- }
} else {
/*
* Enable/disable the lockup detectors or
* change the sample period 'on the fly'.
*/
- update_watchdog_all_cpus();
+ err = update_watchdog_all_cpus();
+
+ if (err) {
+ watchdog_disable_all_cpus();
+ pr_err("Failed to update lockup detectors, disabled\n");
+ }
}
+ if (err)
+ watchdog_enabled = 0;
+
return err;
}
-/* prepare/enable/disable routines */
-/* sysctl functions */
-#ifdef CONFIG_SYSCTL
static void watchdog_disable_all_cpus(void)
{
if (watchdog_running) {
@@ -744,6 +816,8 @@ static void watchdog_disable_all_cpus(void)
}
}
+#ifdef CONFIG_SYSCTL
+
/*
* Update the run state of the lockup detectors.
*/
@@ -785,8 +859,15 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
int err, old, new;
int *watchdog_param = (int *)table->data;
+ get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
/*
* If the parameter is being read return the state of the corresponding
* bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -820,15 +901,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
} while (cmpxchg(&watchdog_enabled, old, new) != old);
/*
- * Update the run state of the lockup detectors.
- * Restore 'watchdog_enabled' on failure.
+ * Update the run state of the lockup detectors. There is _no_
+ * need to check the value returned by proc_watchdog_update()
+ * and to restore the previous value of 'watchdog_enabled' as
+ * both lockup detectors are disabled if proc_watchdog_update()
+ * returns an error.
*/
err = proc_watchdog_update();
- if (err)
- watchdog_enabled = old;
}
out:
mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
return err;
}
@@ -870,8 +953,15 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
{
int err, old;
+ get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
old = ACCESS_ONCE(watchdog_thresh);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -879,15 +969,17 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
goto out;
/*
- * Update the sample period.
- * Restore 'watchdog_thresh' on failure.
+ * Update the sample period. Restore on failure.
*/
set_sample_period();
err = proc_watchdog_update();
- if (err)
+ if (err) {
watchdog_thresh = old;
+ set_sample_period();
+ }
out:
mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
return err;
}
@@ -902,7 +994,15 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
{
int err;
+ get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
+
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write) {
/* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,7 +1020,9 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
pr_err("cpumask update failed\n");
}
}
+out:
mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
return err;
}
@@ -932,10 +1034,8 @@ void __init lockup_detector_init(void)
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
- if (!cpumask_empty(tick_nohz_full_mask))
- pr_info("Disabling watchdog on nohz_full cores by default\n");
- cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
- tick_nohz_full_mask);
+ pr_info("Disabling watchdog on nohz_full cores by default\n");
+ cpumask_copy(&watchdog_cpumask, housekeeping_mask);
} else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4c4f06176f74..c579dbab2e36 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
#include <trace/events/workqueue.h>
#define assert_rcu_or_pool_mutex() \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
- lockdep_is_held(&wq_pool_mutex), \
- "sched RCU or wq_pool_mutex should be held")
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU or wq_pool_mutex should be held")
#define assert_rcu_or_wq_mutex(wq) \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
- lockdep_is_held(&wq->mutex), \
- "sched RCU or wq->mutex should be held")
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&wq->mutex), \
+ "sched RCU or wq->mutex should be held")
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
- lockdep_is_held(&wq->mutex) || \
- lockdep_is_held(&wq_pool_mutex), \
- "sched RCU, wq->mutex or wq_pool_mutex should be held")
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&wq->mutex) && \
+ !lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU, wq->mutex or wq_pool_mutex should be held")
#define for_each_cpu_worker_pool(pool, cpu) \
for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
@@ -1458,13 +1458,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
timer_stats_timer_set_start_info(&dwork->timer);
dwork->wq = wq;
+ /* timer isn't guaranteed to run in this cpu, record earlier */
+ if (cpu == WORK_CPU_UNBOUND)
+ cpu = raw_smp_processor_id();
dwork->cpu = cpu;
timer->expires = jiffies + delay;
- if (unlikely(cpu != WORK_CPU_UNBOUND))
- add_timer_on(timer, cpu);
- else
- add_timer(timer);
+ add_timer_on(timer, cpu);
}
/**
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool)
goto fail;
set_user_nice(worker->task, pool->attrs->nice);
-
- /* prevent userland from meddling with cpumask of workqueue workers */
- worker->task->flags |= PF_NO_SETAFFINITY;
+ kthread_bind_mask(worker->task, pool->attrs->cpumask);
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
@@ -2614,7 +2612,7 @@ void flush_workqueue(struct workqueue_struct *wq)
out_unlock:
mutex_unlock(&wq->mutex);
}
-EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue);
/**
* drain_workqueue - drain a workqueue
@@ -3201,6 +3199,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
u32 hash = wqattrs_hash(attrs);
struct worker_pool *pool;
int node;
+ int target_node = NUMA_NO_NODE;
lockdep_assert_held(&wq_pool_mutex);
@@ -3212,13 +3211,25 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
}
}
+ /* if cpumask is contained inside a NUMA node, we belong to that node */
+ if (wq_numa_enabled) {
+ for_each_node(node) {
+ if (cpumask_subset(attrs->cpumask,
+ wq_numa_possible_cpumask[node])) {
+ target_node = node;
+ break;
+ }
+ }
+ }
+
/* nope, create a new one */
- pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
if (!pool || init_worker_pool(pool) < 0)
goto fail;
lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
+ pool->node = target_node;
/*
* no_numa isn't a worker_pool attribute, always clear it. See
@@ -3226,17 +3237,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
*/
pool->attrs->no_numa = false;
- /* if cpumask is contained inside a NUMA node, we belong to that node */
- if (wq_numa_enabled) {
- for_each_node(node) {
- if (cpumask_subset(pool->attrs->cpumask,
- wq_numa_possible_cpumask[node])) {
- pool->node = node;
- break;
- }
- }
- }
-
if (worker_pool_assign_id(pool) < 0)
goto fail;
@@ -3856,7 +3856,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
}
wq->rescuer = rescuer;
- rescuer->task->flags |= PF_NO_SETAFFINITY;
+ kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task);
}