diff options
Diffstat (limited to 'kernel')
52 files changed, 1117 insertions, 706 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 94fabd534b03..2a202a846757 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) + def_bool HIGH_RES_TIMERS diff --git a/kernel/Makefile b/kernel/Makefile index 09a9c94f42bd..bbaf7d59c1bb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,8 +41,9 @@ ifneq ($(CONFIG_SMP),y) obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o +obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o +obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o @@ -122,19 +123,52 @@ targets += timeconst.h $(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE $(call if_changed,bc) -ifeq ($(CONFIG_MODULE_SIG),y) +############################################################################### +# +# Roll all the X.509 certificates that we can find together and pull them into +# the kernel so that they get loaded into the system trusted keyring during +# boot. # -# Pull the signing certificate and any extra certificates into the kernel +# We look in the source root and the build root for all files whose name ends +# in ".x509". Unfortunately, this will generate duplicate filenames, so we +# have make canonicalise the pathnames and then sort them to discard the +# duplicates. # +############################################################################### +ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y) +X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509) +X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += signing_key.x509 +X509_CERTIFICATES := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \ + $(or $(realpath $(CERT)),$(CERT)))) + +ifeq ($(X509_CERTIFICATES),) +$(warning *** No X.509 certificates found ***) +endif + +ifneq ($(wildcard $(obj)/.x509.list),) +ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES)) +$(info X.509 certificate list changed) +$(shell rm $(obj)/.x509.list) +endif +endif + +kernel/system_certificates.o: $(obj)/x509_certificate_list -quiet_cmd_touch = TOUCH $@ - cmd_touch = touch $@ +quiet_cmd_x509certs = CERTS $@ + cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; echo " - Including cert $(X509)") -extra_certificates: - $(call cmd,touch) +targets += $(obj)/x509_certificate_list +$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list + $(call if_changed,x509certs) -kernel/modsign_certificate.o: signing_key.x509 extra_certificates +targets += $(obj)/.x509.list +$(obj)/.x509.list: + @echo $(X509_CERTIFICATES) >$@ +clean-files := x509_certificate_list .x509.list +endif + +ifeq ($(CONFIG_MODULE_SIG),y) ############################################################################### # # If module signing is requested, say by allyesconfig, but a key has not been diff --git a/kernel/audit.c b/kernel/audit.c index 7b0e23a740ce..906ae5a0233a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -60,7 +60,6 @@ #ifdef CONFIG_SECURITY #include <linux/security.h> #endif -#include <net/netlink.h> #include <linux/freezer.h> #include <linux/tty.h> #include <linux/pid_namespace.h> @@ -140,6 +139,17 @@ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); +static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION, + .mask = -1, + .features = 0, + .lock = 0,}; + +static char *audit_feature_names[2] = { + "only_unset_loginuid", + "loginuid_immutable", +}; + + /* Serialize requests from userspace. */ DEFINE_MUTEX(audit_cmd_mutex); @@ -584,6 +594,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return -EOPNOTSUPP; case AUDIT_GET: case AUDIT_SET: + case AUDIT_GET_FEATURE: + case AUDIT_SET_FEATURE: case AUDIT_LIST_RULES: case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: @@ -613,7 +625,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); - if (!audit_enabled) { + if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; return rc; } @@ -628,6 +640,94 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) return rc; } +int is_audit_feature_set(int i) +{ + return af.features & AUDIT_FEATURE_TO_MASK(i); +} + + +static int audit_get_feature(struct sk_buff *skb) +{ + u32 seq; + + seq = nlmsg_hdr(skb)->nlmsg_seq; + + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, + &af, sizeof(af)); + + return 0; +} + +static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature, + u32 old_lock, u32 new_lock, int res) +{ + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); + audit_log_format(ab, "feature=%s new=%d old=%d old_lock=%d new_lock=%d res=%d", + audit_feature_names[which], !!old_feature, !!new_feature, + !!old_lock, !!new_lock, res); + audit_log_end(ab); +} + +static int audit_set_feature(struct sk_buff *skb) +{ + struct audit_features *uaf; + int i; + + BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); + uaf = nlmsg_data(nlmsg_hdr(skb)); + + /* if there is ever a version 2 we should handle that here */ + + for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { + u32 feature = AUDIT_FEATURE_TO_MASK(i); + u32 old_feature, new_feature, old_lock, new_lock; + + /* if we are not changing this feature, move along */ + if (!(feature & uaf->mask)) + continue; + + old_feature = af.features & feature; + new_feature = uaf->features & feature; + new_lock = (uaf->lock | af.lock) & feature; + old_lock = af.lock & feature; + + /* are we changing a locked feature? */ + if ((af.lock & feature) && (new_feature != old_feature)) { + audit_log_feature_change(i, old_feature, new_feature, + old_lock, new_lock, 0); + return -EPERM; + } + } + /* nothing invalid, do the changes */ + for (i = 0; i <= AUDIT_LAST_FEATURE; i++) { + u32 feature = AUDIT_FEATURE_TO_MASK(i); + u32 old_feature, new_feature, old_lock, new_lock; + + /* if we are not changing this feature, move along */ + if (!(feature & uaf->mask)) + continue; + + old_feature = af.features & feature; + new_feature = uaf->features & feature; + old_lock = af.lock & feature; + new_lock = (uaf->lock | af.lock) & feature; + + if (new_feature != old_feature) + audit_log_feature_change(i, old_feature, new_feature, + old_lock, new_lock, 1); + + if (new_feature) + af.features |= feature; + else + af.features &= ~feature; + af.lock |= new_lock; + } + + return 0; +} + static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; @@ -659,6 +759,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) switch (msg_type) { case AUDIT_GET: + memset(&status_set, 0, sizeof(status_set)); status_set.enabled = audit_enabled; status_set.failure = audit_failure; status_set.pid = audit_pid; @@ -670,7 +771,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) &status_set, sizeof(status_set)); break; case AUDIT_SET: - if (nlh->nlmsg_len < sizeof(struct audit_status)) + if (nlmsg_len(nlh) < sizeof(struct audit_status)) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { @@ -699,6 +800,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit); break; + case AUDIT_GET_FEATURE: + err = audit_get_feature(skb); + if (err) + return err; + break; + case AUDIT_SET_FEATURE: + err = audit_set_feature(skb); + if (err) + return err; + break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: @@ -715,7 +826,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } audit_log_common_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) - audit_log_format(ab, " msg='%.1024s'", + audit_log_format(ab, " msg='%.*s'", + AUDIT_MESSAGE_TEXT_MAX, (char *)data); else { int size; @@ -818,7 +930,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct task_struct *tsk = current; spin_lock(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty != 0; + s.enabled = tsk->signal->audit_tty; s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); @@ -832,7 +944,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ - memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len)); + memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh))); if ((s.enabled != 0 && s.enabled != 1) || (s.log_passwd != 0 && s.log_passwd != 1)) return -EINVAL; @@ -1067,13 +1179,6 @@ static void wait_for_auditd(unsigned long sleep_time) remove_wait_queue(&audit_backlog_wait, &wait); } -/* Obtain an audit buffer. This routine does locking to obtain the - * audit buffer, but then no locking is required for calls to - * audit_log_*format. If the tsk is a task that is currently in a - * syscall, then the syscall is marked as auditable and an audit record - * will be written at syscall exit. If there is no associated task, tsk - * should be NULL. */ - /** * audit_log_start - obtain an audit buffer * @ctx: audit_context (may be NULL) @@ -1389,7 +1494,7 @@ void audit_log_session_info(struct audit_buffer *ab) u32 sessionid = audit_get_sessionid(current); uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current)); - audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid); + audit_log_format(ab, " auid=%u ses=%u", auid, sessionid); } void audit_log_key(struct audit_buffer *ab, char *key) @@ -1536,6 +1641,26 @@ void audit_log_name(struct audit_context *context, struct audit_names *n, } } + /* log the audit_names record type */ + audit_log_format(ab, " nametype="); + switch(n->type) { + case AUDIT_TYPE_NORMAL: + audit_log_format(ab, "NORMAL"); + break; + case AUDIT_TYPE_PARENT: + audit_log_format(ab, "PARENT"); + break; + case AUDIT_TYPE_CHILD_DELETE: + audit_log_format(ab, "DELETE"); + break; + case AUDIT_TYPE_CHILD_CREATE: + audit_log_format(ab, "CREATE"); + break; + default: + audit_log_format(ab, "UNKNOWN"); + break; + } + audit_log_fcaps(ab, n); audit_log_end(ab); } diff --git a/kernel/audit.h b/kernel/audit.h index 123c9b7c3979..b779642b29af 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -197,6 +197,9 @@ struct audit_context { int fd; int flags; } mmap; + struct { + int argc; + } execve; }; int fds[2]; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f7aee8be7fb2..51f3fd4c1ed3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -343,6 +343,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: + case AUDIT_INODE: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; @@ -423,7 +424,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->lsm_rule = NULL; /* Support legacy tests for a valid loginuid */ - if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) { + if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { f->type = AUDIT_LOGINUID_SET; f->val = 0; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9845cb32b60a..90594c9f7552 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -95,13 +95,6 @@ struct audit_aux_data { /* Number of target pids per aux struct. */ #define AUDIT_AUX_PIDS 16 -struct audit_aux_data_execve { - struct audit_aux_data d; - int argc; - int envc; - struct mm_struct *mm; -}; - struct audit_aux_data_pids { struct audit_aux_data d; pid_t target_pid[AUDIT_AUX_PIDS]; @@ -121,12 +114,6 @@ struct audit_aux_data_bprm_fcaps { struct audit_cap_data new_pcap; }; -struct audit_aux_data_capset { - struct audit_aux_data d; - pid_t pid; - struct audit_cap_data cap; -}; - struct audit_tree_refs { struct audit_tree_refs *next; struct audit_chunk *c[31]; @@ -566,7 +553,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_INODE: if (name) - result = (name->ino == f->val); + result = audit_comparator(name->ino, f->op, f->val); else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { if (audit_comparator(n->ino, f->op, f->val)) { @@ -943,8 +930,10 @@ int audit_alloc(struct task_struct *tsk) return 0; /* Return if not auditing. */ state = audit_filter_task(tsk, &key); - if (state == AUDIT_DISABLED) + if (state == AUDIT_DISABLED) { + clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); return 0; + } if (!(context = audit_alloc_context(state))) { kfree(key); @@ -1149,20 +1138,16 @@ static int audit_log_single_execve_arg(struct audit_context *context, } static void audit_log_execve_info(struct audit_context *context, - struct audit_buffer **ab, - struct audit_aux_data_execve *axi) + struct audit_buffer **ab) { int i, len; size_t len_sent = 0; const char __user *p; char *buf; - if (axi->mm != current->mm) - return; /* execve failed, no additional info */ - - p = (const char __user *)axi->mm->arg_start; + p = (const char __user *)current->mm->arg_start; - audit_log_format(*ab, "argc=%d", axi->argc); + audit_log_format(*ab, "argc=%d", context->execve.argc); /* * we need some kernel buffer to hold the userspace args. Just @@ -1176,7 +1161,7 @@ static void audit_log_execve_info(struct audit_context *context, return; } - for (i = 0; i < axi->argc; i++) { + for (i = 0; i < context->execve.argc; i++) { len = audit_log_single_execve_arg(context, ab, i, &len_sent, p, buf); if (len <= 0) @@ -1279,6 +1264,9 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, context->mmap.flags); break; } + case AUDIT_EXECVE: { + audit_log_execve_info(context, &ab); + break; } } audit_log_end(ab); } @@ -1325,11 +1313,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts switch (aux->type) { - case AUDIT_EXECVE: { - struct audit_aux_data_execve *axi = (void *)aux; - audit_log_execve_info(context, &ab, axi); - break; } - case AUDIT_BPRM_FCAPS: { struct audit_aux_data_bprm_fcaps *axs = (void *)aux; audit_log_format(ab, "fver=%x", axs->fcap_ver); @@ -1964,6 +1947,43 @@ int auditsc_get_stamp(struct audit_context *ctx, /* global counter which is incremented every time something logs in */ static atomic_t session_id = ATOMIC_INIT(0); +static int audit_set_loginuid_perm(kuid_t loginuid) +{ + /* if we are unset, we don't need privs */ + if (!audit_loginuid_set(current)) + return 0; + /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ + if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) + return -EPERM; + /* it is set, you need permission */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + /* reject if this is not an unset and we don't allow that */ + if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) + return -EPERM; + return 0; +} + +static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, + unsigned int oldsessionid, unsigned int sessionid, + int rc) +{ + struct audit_buffer *ab; + uid_t uid, ologinuid, nloginuid; + + uid = from_kuid(&init_user_ns, task_uid(current)); + ologinuid = from_kuid(&init_user_ns, koldloginuid); + nloginuid = from_kuid(&init_user_ns, kloginuid), + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + audit_log_format(ab, "pid=%d uid=%u old auid=%u new auid=%u old " + "ses=%u new ses=%u res=%d", current->pid, uid, ologinuid, + nloginuid, oldsessionid, sessionid, !rc); + audit_log_end(ab); +} + /** * audit_set_loginuid - set current task's audit_context loginuid * @loginuid: loginuid value @@ -1975,37 +1995,26 @@ static atomic_t session_id = ATOMIC_INIT(0); int audit_set_loginuid(kuid_t loginuid) { struct task_struct *task = current; - struct audit_context *context = task->audit_context; - unsigned int sessionid; + unsigned int oldsessionid, sessionid = (unsigned int)-1; + kuid_t oldloginuid; + int rc; -#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE - if (audit_loginuid_set(task)) - return -EPERM; -#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; -#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ + oldloginuid = audit_get_loginuid(current); + oldsessionid = audit_get_sessionid(current); - sessionid = atomic_inc_return(&session_id); - if (context && context->in_syscall) { - struct audit_buffer *ab; + rc = audit_set_loginuid_perm(loginuid); + if (rc) + goto out; + + /* are we setting or clearing? */ + if (uid_valid(loginuid)) + sessionid = atomic_inc_return(&session_id); - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (ab) { - audit_log_format(ab, "login pid=%d uid=%u " - "old auid=%u new auid=%u" - " old ses=%u new ses=%u", - task->pid, - from_kuid(&init_user_ns, task_uid(task)), - from_kuid(&init_user_ns, task->loginuid), - from_kuid(&init_user_ns, loginuid), - task->sessionid, sessionid); - audit_log_end(ab); - } - } task->sessionid = sessionid; task->loginuid = loginuid; - return 0; +out: + audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); + return rc; } /** @@ -2126,22 +2135,12 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mo context->ipc.has_perm = 1; } -int __audit_bprm(struct linux_binprm *bprm) +void __audit_bprm(struct linux_binprm *bprm) { - struct audit_aux_data_execve *ax; struct audit_context *context = current->audit_context; - ax = kmalloc(sizeof(*ax), GFP_KERNEL); - if (!ax) - return -ENOMEM; - - ax->argc = bprm->argc; - ax->envc = bprm->envc; - ax->mm = bprm->mm; - ax->d.type = AUDIT_EXECVE; - ax->d.next = context->aux; - context->aux = (void *)ax; - return 0; + context->type = AUDIT_EXECVE; + context->execve.argc = bprm->argc; } diff --git a/kernel/bounds.c b/kernel/bounds.c index e8ca97b5c386..5253204afdca 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -11,6 +11,7 @@ #include <linux/kbuild.h> #include <linux/page_cgroup.h> #include <linux/log2.h> +#include <linux/spinlock_types.h> void foo(void) { @@ -21,5 +22,6 @@ void foo(void) #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif + DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int)); /* End of constants */ } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e0839bcd48c8..8b729c278b64 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -90,6 +90,14 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); /* + * cgroup destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. + */ +static struct workqueue_struct *cgroup_destroy_wq; + +/* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by @@ -191,6 +199,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +static int cgroup_file_release(struct inode *inode, struct file *file); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -871,7 +880,7 @@ static void cgroup_free_rcu(struct rcu_head *head) struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); - schedule_work(&cgrp->destroy_work); + queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } static void cgroup_diput(struct dentry *dentry, struct inode *inode) @@ -895,11 +904,6 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } -static int cgroup_delete(const struct dentry *d) -{ - return 1; -} - static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1486,7 +1490,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, - .d_delete = cgroup_delete, + .d_delete = always_delete_dentry, }; struct inode *inode = @@ -2426,7 +2430,7 @@ static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = single_release, + .release = cgroup_file_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) @@ -2487,6 +2491,8 @@ static int cgroup_file_release(struct inode *inode, struct file *file) ret = cft->release(inode, file); if (css->ss) css_put(css); + if (file->f_op == &cgroup_seqfile_operations) + single_release(inode, file); return ret; } @@ -4254,7 +4260,7 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } static void css_release(struct percpu_ref *ref) @@ -4544,7 +4550,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_killed_work_fn); - schedule_work(&css->destroy_work); + queue_work(cgroup_destroy_wq, &css->destroy_work); } /** @@ -5068,6 +5074,22 @@ out: return err; } +static int __init cgroup_wq_init(void) +{ + /* + * There isn't much point in executing destruction path in + * parallel. Good chunk is serialized with cgroup_mutex anyway. + * Use 1 for @max_active. + * + * We would prefer to do this in cgroup_init() above, but that + * is called before init_workqueues(): so leave this until after. + */ + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + BUG_ON(!cgroup_destroy_wq); + return 0; +} +core_initcall(cgroup_wq_init); + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6bf981e13c43..4772034b4b17 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1033,8 +1033,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) + if (need_loop) { + local_irq_disable(); write_seqcount_begin(&tsk->mems_allowed_seq); + } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -1042,8 +1044,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; - if (need_loop) + if (need_loop) { write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); + } task_unlock(tsk); } diff --git a/kernel/events/core.c b/kernel/events/core.c index d724e7757cd1..72348dc192c1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5680,11 +5680,6 @@ static void swevent_hlist_put(struct perf_event *event) { int cpu; - if (event->cpu != -1) { - swevent_hlist_put_cpu(event, event->cpu); - return; - } - for_each_possible_cpu(cpu) swevent_hlist_put_cpu(event, cpu); } @@ -5718,9 +5713,6 @@ static int swevent_hlist_get(struct perf_event *event) int err; int cpu, failed_cpu; - if (event->cpu != -1) - return swevent_hlist_get_cpu(event, event->cpu); - get_online_cpus(); for_each_possible_cpu(cpu) { err = swevent_hlist_get_cpu(event, cpu); diff --git a/kernel/extable.c b/kernel/extable.c index 832cb28105bb..763faf037ec1 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -61,7 +61,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) static inline int init_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_sinittext && - addr <= (unsigned long)_einittext) + addr < (unsigned long)_einittext) return 1; return 0; } @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && - addr <= (unsigned long)_etext) + addr < (unsigned long)_etext) return 1; if (system_state == SYSTEM_BOOTING && diff --git a/kernel/fork.c b/kernel/fork.c index f6d11fc67f72..728d5be9548c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->flags = (current->mm) ? (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; - mm->nr_ptes = 0; + atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); @@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON(mm->pmd_huge_pte); #endif } @@ -814,7 +814,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif if (!mm_init(mm, tsk)) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 8807061ca004..9328b80eaf14 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -207,6 +207,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +static atomic_t reset_hung_task = ATOMIC_INIT(0); + +void reset_hung_task_detector(void) +{ + atomic_set(&reset_hung_task, 1); +} +EXPORT_SYMBOL_GPL(reset_hung_task_detector); + /* * kthread which checks for tasks stuck in D state */ @@ -220,6 +228,9 @@ static int watchdog(void *dummy) while (schedule_timeout_interruptible(timeout_jiffies(timeout))) timeout = sysctl_hung_task_timeout_secs; + if (atomic_xchg(&reset_hung_task, 0)) + continue; + check_hung_uninterruptible_tasks(timeout); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3bb14fbe5c6..dc04c166c54d 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -214,7 +214,7 @@ void irq_enable(struct irq_desc *desc) } /** - * irq_disable - Mark interupt disabled + * irq_disable - Mark interrupt disabled * @desc: irq descriptor which should be disabled * * If the chip does not implement the irq_disable callback, we diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3e59f951d42f..481a13c43b17 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -786,7 +786,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) } /* - * Interrupts explicitely requested as threaded interupts want to be + * Interrupts explicitly requested as threaded interrupts want to be * preemtible - many of them need to sleep and wait for slow busses to * complete. */ diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 1162f1030f18..3320b84cc60f 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,6 +14,7 @@ enum { _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, + _IRQ_IS_POLLED = IRQ_IS_POLLED, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -26,6 +27,7 @@ enum { #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON #define IRQ_PER_CPU_DEVID GOT_YOU_MORON +#define IRQ_IS_POLLED GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -147,3 +149,8 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { return desc->status_use_accessors & _IRQ_NESTED_THREAD; } + +static inline bool irq_settings_is_polled(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_IS_POLLED; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7b5f012bde9d..a1d8cc63b56e 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -67,8 +67,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); - /* PER_CPU and nested thread interrupts are never polled */ - if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) + /* + * PER_CPU, nested thread interrupts and interrupts explicitely + * marked polled are excluded from polling. + */ + if (irq_settings_is_per_cpu(desc) || + irq_settings_is_nested_thread(desc) || + irq_settings_is_polled(desc)) goto out; /* @@ -268,7 +273,8 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { - if (desc->istate & IRQS_POLL_INPROGRESS) + if (desc->istate & IRQS_POLL_INPROGRESS || + irq_settings_is_polled(desc)) return; /* we get here again via the threaded handler */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 2a74f307c5ec..490afc03627e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -921,7 +921,7 @@ static int kimage_load_segment(struct kimage *image, * reinitialize them. * * - A machine specific part that includes the syscall number - * and the copies the image to it's final destination. And + * and then copies the image to it's final destination. And * jumps into the image at entry. * * kexec does not sync, or unmount filesystems so if you need diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S deleted file mode 100644 index 4a9a86d12c8b..000000000000 --- a/kernel/modsign_certificate.S +++ /dev/null @@ -1,12 +0,0 @@ -#include <linux/export.h> - -#define GLOBAL(name) \ - .globl VMLINUX_SYMBOL(name); \ - VMLINUX_SYMBOL(name): - - .section ".init.data","aw" - -GLOBAL(modsign_certificate_list) - .incbin "signing_key.x509" - .incbin "extra_certificates" -GLOBAL(modsign_certificate_list_end) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c deleted file mode 100644 index 7cbd4507a7e6..000000000000 --- a/kernel/modsign_pubkey.c +++ /dev/null @@ -1,104 +0,0 @@ -/* Public keys for module signature verification - * - * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/cred.h> -#include <linux/err.h> -#include <keys/asymmetric-type.h> -#include "module-internal.h" - -struct key *modsign_keyring; - -extern __initconst const u8 modsign_certificate_list[]; -extern __initconst const u8 modsign_certificate_list_end[]; - -/* - * We need to make sure ccache doesn't cache the .o file as it doesn't notice - * if modsign.pub changes. - */ -static __initconst const char annoy_ccache[] = __TIME__ "foo"; - -/* - * Load the compiled-in keys - */ -static __init int module_verify_init(void) -{ - pr_notice("Initialise module verification\n"); - - modsign_keyring = keyring_alloc(".module_sign", - KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), - ((KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ), - KEY_ALLOC_NOT_IN_QUOTA, NULL); - if (IS_ERR(modsign_keyring)) - panic("Can't allocate module signing keyring\n"); - - return 0; -} - -/* - * Must be initialised before we try and load the keys into the keyring. - */ -device_initcall(module_verify_init); - -/* - * Load the compiled-in keys - */ -static __init int load_module_signing_keys(void) -{ - key_ref_t key; - const u8 *p, *end; - size_t plen; - - pr_notice("Loading module verification certificates\n"); - - end = modsign_certificate_list_end; - p = modsign_certificate_list; - while (p < end) { - /* Each cert begins with an ASN.1 SEQUENCE tag and must be more - * than 256 bytes in size. - */ - if (end - p < 4) - goto dodgy_cert; - if (p[0] != 0x30 && - p[1] != 0x82) - goto dodgy_cert; - plen = (p[2] << 8) | p[3]; - plen += 4; - if (plen > end - p) - goto dodgy_cert; - - key = key_create_or_update(make_key_ref(modsign_keyring, 1), - "asymmetric", - NULL, - p, - plen, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW, - KEY_ALLOC_NOT_IN_QUOTA); - if (IS_ERR(key)) - pr_err("MODSIGN: Problem loading in-kernel X.509 certificate (%ld)\n", - PTR_ERR(key)); - else - pr_notice("MODSIGN: Loaded cert '%s'\n", - key_ref_to_ptr(key)->description); - p += plen; - } - - return 0; - -dodgy_cert: - pr_err("MODSIGN: Problem parsing in-kernel X.509 certificate list\n"); - return 0; -} -late_initcall(load_module_signing_keys); diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 24f9247b7d02..915e123a430f 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -9,6 +9,4 @@ * 2 of the Licence, or (at your option) any later version. */ -extern struct key *modsign_keyring; - extern int mod_verify_sig(const void *mod, unsigned long *_modlen); diff --git a/kernel/module.c b/kernel/module.c index af5ebd21d77b..f5a3b1e8ec51 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -641,8 +641,6 @@ static int module_unload_init(struct module *mod) /* Hold reference count during initialization. */ __this_cpu_write(mod->refptr->incs, 1); - /* Backwards compatibility macros put refcount during init. */ - mod->waiter = current; return 0; } @@ -768,16 +766,9 @@ static int __try_stop_module(void *_sref) static int try_stop_module(struct module *mod, int flags, int *forced) { - if (flags & O_NONBLOCK) { - struct stopref sref = { mod, flags, forced }; + struct stopref sref = { mod, flags, forced }; - return stop_machine(__try_stop_module, &sref, NULL); - } else { - /* We don't need to stop the machine for this. */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - return 0; - } + return stop_machine(__try_stop_module, &sref, NULL); } unsigned long module_refcount(struct module *mod) @@ -810,21 +801,6 @@ EXPORT_SYMBOL(module_refcount); /* This exists whether we can unload or not */ static void free_module(struct module *mod); -static void wait_for_zero_refcount(struct module *mod) -{ - /* Since we might sleep for some time, release the mutex first */ - mutex_unlock(&module_mutex); - for (;;) { - pr_debug("Looking at refcount...\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - if (module_refcount(mod) == 0) - break; - schedule(); - } - current->state = TASK_RUNNING; - mutex_lock(&module_mutex); -} - SYSCALL_DEFINE2(delete_module, const char __user *, name_user, unsigned int, flags) { @@ -839,6 +815,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; + if (!(flags & O_NONBLOCK)) { + printk(KERN_WARNING + "waiting module removal not supported: please upgrade"); + } + if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; @@ -856,8 +837,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, /* Doing init or already dying? */ if (mod->state != MODULE_STATE_LIVE) { - /* FIXME: if (force), slam module count and wake up - waiter --RR */ + /* FIXME: if (force), slam module count damn the torpedoes */ pr_debug("%s already dying\n", mod->name); ret = -EBUSY; goto out; @@ -873,18 +853,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, } } - /* Set this up before setting mod->state */ - mod->waiter = current; - /* Stop the machine so refcounts can't move and disable module. */ ret = try_stop_module(mod, flags, &forced); if (ret != 0) goto out; - /* Never wait if forced. */ - if (!forced && module_refcount(mod) != 0) - wait_for_zero_refcount(mod); - mutex_unlock(&module_mutex); /* Final destruction now no one is using it. */ if (mod->exit != NULL) @@ -1002,9 +975,6 @@ void module_put(struct module *module) __this_cpu_inc(module->refptr->decs); trace_module_put(module, _RET_IP_); - /* Maybe they're waiting for us to drop reference? */ - if (unlikely(!module_is_live(module))) - wake_up_process(module->waiter); preempt_enable(); } } @@ -2728,7 +2698,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return 0; } -static void find_module_sections(struct module *mod, struct load_info *info) +static int find_module_sections(struct module *mod, struct load_info *info) { mod->kp = section_objs(info, "__param", sizeof(*mod->kp), &mod->num_kp); @@ -2758,6 +2728,18 @@ static void find_module_sections(struct module *mod, struct load_info *info) #ifdef CONFIG_CONSTRUCTORS mod->ctors = section_objs(info, ".ctors", sizeof(*mod->ctors), &mod->num_ctors); + if (!mod->ctors) + mod->ctors = section_objs(info, ".init_array", + sizeof(*mod->ctors), &mod->num_ctors); + else if (find_sec(info, ".init_array")) { + /* + * This shouldn't happen with same compiler and binutils + * building all parts of the module. + */ + printk(KERN_WARNING "%s: has both .ctors and .init_array.\n", + mod->name); + return -EINVAL; + } #endif #ifdef CONFIG_TRACEPOINTS @@ -2795,6 +2777,8 @@ static void find_module_sections(struct module *mod, struct load_info *info) info->debug = section_objs(info, "__verbose", sizeof(*info->debug), &info->num_debug); + + return 0; } static int move_module(struct module *mod, struct load_info *info) @@ -3248,7 +3232,9 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Now we've got everything in the final locations, we can * find optional sections. */ - find_module_sections(mod, info); + err = find_module_sections(mod, info); + if (err) + goto free_unload; err = check_module_license_and_versions(mod); if (err) diff --git a/kernel/module_signing.c b/kernel/module_signing.c index f2970bddc5ea..be5b8fac4bd0 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -14,6 +14,7 @@ #include <crypto/public_key.h> #include <crypto/hash.h> #include <keys/asymmetric-type.h> +#include <keys/system_keyring.h> #include "module-internal.h" /* @@ -28,7 +29,7 @@ */ struct module_signature { u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ - u8 hash; /* Digest algorithm [enum pkey_hash_algo] */ + u8 hash; /* Digest algorithm [enum hash_algo] */ u8 id_type; /* Key identifier type [enum pkey_id_type] */ u8 signer_len; /* Length of signer's name */ u8 key_id_len; /* Length of key identifier */ @@ -39,7 +40,7 @@ struct module_signature { /* * Digest the module contents. */ -static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, +static struct public_key_signature *mod_make_digest(enum hash_algo hash, const void *mod, unsigned long modlen) { @@ -54,7 +55,7 @@ static struct public_key_signature *mod_make_digest(enum pkey_hash_algo hash, /* Allocate the hashing algorithm we're going to need and find out how * big the hash operational data will be. */ - tfm = crypto_alloc_shash(pkey_hash_algo[hash], 0, 0); + tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0); if (IS_ERR(tfm)) return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm); @@ -157,7 +158,7 @@ static struct key *request_asymmetric_key(const char *signer, size_t signer_len, pr_debug("Look up: \"%s\"\n", id); - key = keyring_search(make_key_ref(modsign_keyring, 1), + key = keyring_search(make_key_ref(system_trusted_keyring, 1), &key_type_asymmetric, id); if (IS_ERR(key)) pr_warn("Request for unknown module key '%s' err %ld\n", @@ -217,7 +218,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen) return -ENOPKG; if (ms.hash >= PKEY_HASH__LAST || - !pkey_hash_algo[ms.hash]) + !hash_algo_name[ms.hash]) return -ENOPKG; key = request_asymmetric_key(sig, ms.signer_len, diff --git a/kernel/padata.c b/kernel/padata.c index 07af2c95dcfe..2abd25d79cc8 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -46,6 +46,7 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) static int padata_cpu_hash(struct parallel_data *pd) { + unsigned int seq_nr; int cpu_index; /* @@ -53,10 +54,8 @@ static int padata_cpu_hash(struct parallel_data *pd) * seq_nr mod. number of cpus in use. */ - spin_lock(&pd->seq_lock); - cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); - pd->seq_nr++; - spin_unlock(&pd->seq_lock); + seq_nr = atomic_inc_return(&pd->seq_nr); + cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } @@ -429,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - pd->seq_nr = 0; + atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 10c22cae83a0..b38109e204af 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -792,7 +792,8 @@ void free_basic_memory_bitmaps(void) { struct memory_bitmap *bm1, *bm2; - BUG_ON(!(forbidden_pages_map && free_pages_map)); + if (WARN_ON(!(forbidden_pages_map && free_pages_map))) + return; bm1 = forbidden_pages_map; bm2 = free_pages_map; diff --git a/kernel/power/user.c b/kernel/power/user.c index 24850270c802..98d357584cd6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -70,6 +70,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; + data->free_bitmaps = false; error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0c9a934cfec1..1254f312d024 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(rcu_irq_enter); /* * Test whether RCU thinks that the current CPU is idle. */ -bool __rcu_is_watching(void) +bool notrace __rcu_is_watching(void) { return rcu_dynticks_nesting; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c06ddfea7cd..dd081987a8ec 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -664,7 +664,7 @@ void rcu_nmi_exit(void) * rcu_is_watching(), the caller of __rcu_is_watching() must have at * least disabled preemption. */ -bool __rcu_is_watching(void) +bool notrace __rcu_is_watching(void) { return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; } @@ -675,7 +675,7 @@ bool __rcu_is_watching(void) * If the current CPU is in its idle loop and is neither in an interrupt * or NMI handler, return true. */ -bool rcu_is_watching(void) +bool notrace rcu_is_watching(void) { int ret; diff --git a/kernel/smp.c b/kernel/smp.c index 46116100f0ee..bd9f94028838 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -15,7 +15,6 @@ #include "smpboot.h" -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS enum { CSD_FLAG_LOCK = 0x01, CSD_FLAG_WAIT = 0x02, @@ -140,8 +139,7 @@ static void csd_unlock(struct call_single_data *csd) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static -void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); unsigned long flags; @@ -464,7 +462,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); -#endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; diff --git a/kernel/softirq.c b/kernel/softirq.c index b24988353458..11025ccc06dd 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,8 +6,6 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) - * - * Remote softirq infrastructure is by Jens Axboe. */ #include <linux/export.h> @@ -627,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, } EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); -/* - * Remote softirq bits - */ - -DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); -EXPORT_PER_CPU_SYMBOL(softirq_work_list); - -static void __local_trigger(struct call_single_data *cp, int softirq) -{ - struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); - - list_add_tail(&cp->list, head); - - /* Trigger the softirq only if the list was previously empty. */ - if (head->next == &cp->list) - raise_softirq_irqoff(softirq); -} - -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static void remote_softirq_receive(void *data) -{ - struct call_single_data *cp = data; - unsigned long flags; - int softirq; - - softirq = *(int *)cp->info; - local_irq_save(flags); - __local_trigger(cp, softirq); - local_irq_restore(flags); -} - -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - if (cpu_online(cpu)) { - cp->func = remote_softirq_receive; - cp->info = &softirq; - cp->flags = 0; - - __smp_call_function_single(cpu, cp, 0); - return 0; - } - return 1; -} -#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - return 1; -} -#endif - -/** - * __send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @this_cpu: the currently executing cpu - * @softirq: the softirq for the work - * - * Attempt to schedule softirq work on a remote cpu. If this cannot be - * done, the work is instead queued up on the local cpu. - * - * Interrupts must be disabled. - */ -void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) -{ - if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) - __local_trigger(cp, softirq); -} -EXPORT_SYMBOL(__send_remote_softirq); - -/** - * send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @softirq: the softirq for the work - * - * Like __send_remote_softirq except that disabling interrupts and - * computing the current cpu is done for the caller. - */ -void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - unsigned long flags; - int this_cpu; - - local_irq_save(flags); - this_cpu = smp_processor_id(); - __send_remote_softirq(cp, cpu, this_cpu, softirq); - local_irq_restore(flags); -} -EXPORT_SYMBOL(send_remote_softirq); - -static int remote_softirq_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - int i; - - local_irq_disable(); - for (i = 0; i < NR_SOFTIRQS; i++) { - struct list_head *head = &per_cpu(softirq_work_list[i], cpu); - struct list_head *local_head; - - if (list_empty(head)) - continue; - - local_head = &__get_cpu_var(softirq_work_list[i]); - list_splice_init(head, local_head); - raise_softirq_irqoff(i); - } - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block remote_softirq_cpu_notifier = { - .notifier_call = remote_softirq_cpu_notify, -}; - void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { - int i; - per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; - for (i = 0; i < NR_SOFTIRQS; i++) - INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } - register_hotcpu_notifier(&remote_softirq_cpu_notifier); - open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S new file mode 100644 index 000000000000..4aef390671cb --- /dev/null +++ b/kernel/system_certificates.S @@ -0,0 +1,10 @@ +#include <linux/export.h> +#include <linux/init.h> + + __INITRODATA + + .globl VMLINUX_SYMBOL(system_certificate_list) +VMLINUX_SYMBOL(system_certificate_list): + .incbin "kernel/x509_certificate_list" + .globl VMLINUX_SYMBOL(system_certificate_list_end) +VMLINUX_SYMBOL(system_certificate_list_end): diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c new file mode 100644 index 000000000000..564dd93430a2 --- /dev/null +++ b/kernel/system_keyring.c @@ -0,0 +1,105 @@ +/* System trusted keyring for trusted public keys + * + * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/err.h> +#include <keys/asymmetric-type.h> +#include <keys/system_keyring.h> +#include "module-internal.h" + +struct key *system_trusted_keyring; +EXPORT_SYMBOL_GPL(system_trusted_keyring); + +extern __initconst const u8 system_certificate_list[]; +extern __initconst const u8 system_certificate_list_end[]; + +/* + * Load the compiled-in keys + */ +static __init int system_trusted_keyring_init(void) +{ + pr_notice("Initialise system trusted keyring\n"); + + system_trusted_keyring = + keyring_alloc(".system_keyring", + KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH), + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(system_trusted_keyring)) + panic("Can't allocate system trusted keyring\n"); + + set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags); + return 0; +} + +/* + * Must be initialised before we try and load the keys into the keyring. + */ +device_initcall(system_trusted_keyring_init); + +/* + * Load the compiled-in list of X.509 certificates. + */ +static __init int load_system_certificate_list(void) +{ + key_ref_t key; + const u8 *p, *end; + size_t plen; + + pr_notice("Loading compiled-in X.509 certificates\n"); + + end = system_certificate_list_end; + p = system_certificate_list; + while (p < end) { + /* Each cert begins with an ASN.1 SEQUENCE tag and must be more + * than 256 bytes in size. + */ + if (end - p < 4) + goto dodgy_cert; + if (p[0] != 0x30 && + p[1] != 0x82) + goto dodgy_cert; + plen = (p[2] << 8) | p[3]; + plen += 4; + if (plen > end - p) + goto dodgy_cert; + + key = key_create_or_update(make_key_ref(system_trusted_keyring, 1), + "asymmetric", + NULL, + p, + plen, + ((KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ), + KEY_ALLOC_NOT_IN_QUOTA | + KEY_ALLOC_TRUSTED); + if (IS_ERR(key)) { + pr_err("Problem loading in-kernel X.509 certificate (%ld)\n", + PTR_ERR(key)); + } else { + pr_notice("Loaded X.509 cert '%s'\n", + key_ref_to_ptr(key)->description); + key_ref_put(key); + } + p += plen; + } + + return 0; + +dodgy_cert: + pr_err("Problem parsing in-kernel X.509 certificate list\n"); + return 0; +} +late_initcall(load_system_certificate_list); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 9f4618eb51c8..13d2f7cd65db 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -673,17 +673,18 @@ err: nlmsg_free(rep_skb); } -static struct genl_ops taskstats_ops = { - .cmd = TASKSTATS_CMD_GET, - .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, -}; - -static struct genl_ops cgroupstats_ops = { - .cmd = CGROUPSTATS_CMD_GET, - .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, +static const struct genl_ops taskstats_ops[] = { + { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_user_cmd, + .policy = taskstats_cmd_get_policy, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = CGROUPSTATS_CMD_GET, + .doit = cgroupstats_user_cmd, + .policy = cgroupstats_cmd_get_policy, + }, }; /* Needed early in initialization */ @@ -702,26 +703,13 @@ static int __init taskstats_init(void) { int rc; - rc = genl_register_family(&family); + rc = genl_register_family_with_ops(&family, taskstats_ops); if (rc) return rc; - rc = genl_register_ops(&family, &taskstats_ops); - if (rc < 0) - goto err; - - rc = genl_register_ops(&family, &cgroupstats_ops); - if (rc < 0) - goto err_cgroup_ops; - family_registered = 1; pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; -err_cgroup_ops: - genl_unregister_ops(&family, &taskstats_ops); -err: - genl_unregister_family(&family); - return rc; } /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 03cf44ac54d3..0e9f9eaade2f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -367,9 +367,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { - if (unlikely(ftrace_disabled)) - return -ENODEV; - if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; @@ -428,9 +425,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) { int ret; - if (ftrace_disabled) - return -ENODEV; - if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; @@ -2088,10 +2082,15 @@ static void ftrace_startup_enable(int command) static int ftrace_startup(struct ftrace_ops *ops, int command) { bool hash_enable = true; + int ret; if (unlikely(ftrace_disabled)) return -ENODEV; + ret = __register_ftrace_function(ops); + if (ret) + return ret; + ftrace_start_up++; command |= FTRACE_UPDATE_CALLS; @@ -2113,12 +2112,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return 0; } -static void ftrace_shutdown(struct ftrace_ops *ops, int command) +static int ftrace_shutdown(struct ftrace_ops *ops, int command) { bool hash_disable = true; + int ret; if (unlikely(ftrace_disabled)) - return; + return -ENODEV; + + ret = __unregister_ftrace_function(ops); + if (ret) + return ret; ftrace_start_up--; /* @@ -2153,9 +2157,10 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command) } if (!command || !ftrace_enabled) - return; + return 0; ftrace_run_update_code(command); + return 0; } static void ftrace_startup_sysctl(void) @@ -3060,16 +3065,13 @@ static void __enable_ftrace_function_probe(void) if (i == FTRACE_FUNC_HASHSIZE) return; - ret = __register_ftrace_function(&trace_probe_ops); - if (!ret) - ret = ftrace_startup(&trace_probe_ops, 0); + ret = ftrace_startup(&trace_probe_ops, 0); ftrace_probe_registered = 1; } static void __disable_ftrace_function_probe(void) { - int ret; int i; if (!ftrace_probe_registered) @@ -3082,9 +3084,7 @@ static void __disable_ftrace_function_probe(void) } /* no more funcs left */ - ret = __unregister_ftrace_function(&trace_probe_ops); - if (!ret) - ftrace_shutdown(&trace_probe_ops, 0); + ftrace_shutdown(&trace_probe_ops, 0); ftrace_probe_registered = 0; } @@ -3307,7 +3307,11 @@ void unregister_ftrace_function_probe_all(char *glob) static LIST_HEAD(ftrace_commands); static DEFINE_MUTEX(ftrace_cmd_mutex); -int register_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only register ftrace commands from __init, so mark this + * __init too. + */ +__init int register_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p; int ret = 0; @@ -3326,7 +3330,11 @@ int register_ftrace_command(struct ftrace_func_command *cmd) return ret; } -int unregister_ftrace_command(struct ftrace_func_command *cmd) +/* + * Currently we only unregister ftrace commands from __init, so mark + * this __init too. + */ +__init int unregister_ftrace_command(struct ftrace_func_command *cmd) { struct ftrace_func_command *p, *n; int ret = -ENODEV; @@ -3641,7 +3649,7 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); +static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); static int __init set_graph_function(char *str) { @@ -3659,7 +3667,7 @@ static void __init set_ftrace_early_graph(char *buf) func = strsep(&buf, ","); /* we allow only one expression at a time */ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - func); + FTRACE_GRAPH_MAX_FUNCS, func); if (ret) printk(KERN_DEBUG "ftrace: function %s not " "traceable\n", func); @@ -3776,15 +3784,25 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); int ftrace_graph_count; -int ftrace_graph_filter_enabled; +int ftrace_graph_notrace_count; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; +unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +struct ftrace_graph_data { + unsigned long *table; + size_t size; + int *count; + const struct seq_operations *seq_ops; +}; static void * __g_next(struct seq_file *m, loff_t *pos) { - if (*pos >= ftrace_graph_count) + struct ftrace_graph_data *fgd = m->private; + + if (*pos >= *fgd->count) return NULL; - return &ftrace_graph_funcs[*pos]; + return &fgd->table[*pos]; } static void * @@ -3796,10 +3814,12 @@ g_next(struct seq_file *m, void *v, loff_t *pos) static void *g_start(struct seq_file *m, loff_t *pos) { + struct ftrace_graph_data *fgd = m->private; + mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_filter_enabled && !*pos) + if (!*fgd->count && !*pos) return (void *)1; return __g_next(m, pos); @@ -3835,38 +3855,88 @@ static const struct seq_operations ftrace_graph_seq_ops = { }; static int -ftrace_graph_open(struct inode *inode, struct file *file) +__ftrace_graph_open(struct inode *inode, struct file *file, + struct ftrace_graph_data *fgd) { int ret = 0; - if (unlikely(ftrace_disabled)) - return -ENODEV; - mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - ftrace_graph_filter_enabled = 0; - ftrace_graph_count = 0; - memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); + *fgd->count = 0; + memset(fgd->table, 0, fgd->size * sizeof(*fgd->table)); } mutex_unlock(&graph_lock); - if (file->f_mode & FMODE_READ) - ret = seq_open(file, &ftrace_graph_seq_ops); + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, fgd->seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = fgd; + } + } else + file->private_data = fgd; return ret; } static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int +ftrace_graph_notrace_open(struct inode *inode, struct file *file) +{ + struct ftrace_graph_data *fgd; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + fgd = kmalloc(sizeof(*fgd), GFP_KERNEL); + if (fgd == NULL) + return -ENOMEM; + + fgd->table = ftrace_graph_notrace_funcs; + fgd->size = FTRACE_GRAPH_MAX_FUNCS; + fgd->count = &ftrace_graph_notrace_count; + fgd->seq_ops = &ftrace_graph_seq_ops; + + return __ftrace_graph_open(inode, file, fgd); +} + +static int ftrace_graph_release(struct inode *inode, struct file *file) { - if (file->f_mode & FMODE_READ) + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + + kfree(m->private); seq_release(inode, file); + } else { + kfree(file->private_data); + } + return 0; } static int -ftrace_set_func(unsigned long *array, int *idx, char *buffer) +ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -3879,7 +3949,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); - if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + if (!not && *idx >= size) return -EBUSY; search_len = strlen(search); @@ -3907,7 +3977,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) fail = 0; if (!exists) { array[(*idx)++] = rec->ip; - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + if (*idx >= size) goto out; } } else { @@ -3925,8 +3995,6 @@ out: if (fail) return -EINVAL; - ftrace_graph_filter_enabled = !!(*idx); - return 0; } @@ -3935,36 +4003,33 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - ssize_t read, ret; + ssize_t read, ret = 0; + struct ftrace_graph_data *fgd = file->private_data; if (!cnt) return 0; - mutex_lock(&graph_lock); - - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { - ret = -ENOMEM; - goto out_unlock; - } + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) + return -ENOMEM; read = trace_get_user(&parser, ubuf, cnt, ppos); if (read >= 0 && trace_parser_loaded((&parser))) { parser.buffer[parser.idx] = 0; + mutex_lock(&graph_lock); + /* we allow only one expression at a time */ - ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, - parser.buffer); - if (ret) - goto out_free; + ret = ftrace_set_func(fgd->table, fgd->count, fgd->size, + parser.buffer); + + mutex_unlock(&graph_lock); } - ret = read; + if (!ret) + ret = read; -out_free: trace_parser_put(&parser); -out_unlock: - mutex_unlock(&graph_lock); return ret; } @@ -3976,6 +4041,14 @@ static const struct file_operations ftrace_graph_fops = { .llseek = ftrace_filter_lseek, .release = ftrace_graph_release, }; + +static const struct file_operations ftrace_graph_notrace_fops = { + .open = ftrace_graph_notrace_open, + .read = seq_read, + .write = ftrace_graph_write, + .llseek = ftrace_filter_lseek, + .release = ftrace_graph_release, +}; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) @@ -3997,6 +4070,9 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); + trace_create_file("set_graph_notrace", 0444, d_tracer, + NULL, + &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; @@ -4290,12 +4366,15 @@ core_initcall(ftrace_nodyn_init); static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } /* Keep as macros so we do not need to define the commands */ -# define ftrace_startup(ops, command) \ - ({ \ - (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ - 0; \ +# define ftrace_startup(ops, command) \ + ({ \ + int ___ret = __register_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + ___ret; \ }) -# define ftrace_shutdown(ops, command) do { } while (0) +# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) + # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) @@ -4320,12 +4399,21 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, */ preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); + + /* + * Control funcs (perf) uses RCU. Only trace if + * RCU is currently active. + */ + if (!rcu_is_watching()) + goto out; + do_for_each_ftrace_op(op, ftrace_control_list) { if (!(op->flags & FTRACE_OPS_FL_STUB) && !ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip, regs)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); + out: trace_recursion_clear(TRACE_CONTROL_BIT); preempt_enable_notrace(); } @@ -4695,9 +4783,7 @@ int register_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_lock); - ret = __register_ftrace_function(ops); - if (!ret) - ret = ftrace_startup(ops, 0); + ret = ftrace_startup(ops, 0); mutex_unlock(&ftrace_lock); @@ -4716,9 +4802,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_lock); - ret = __unregister_ftrace_function(ops); - if (!ret) - ftrace_shutdown(ops, 0); + ret = ftrace_shutdown(ops, 0); mutex_unlock(&ftrace_lock); return ret; @@ -4912,6 +4996,13 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, return NOTIFY_DONE; } +/* Just a place holder for function graph */ +static struct ftrace_ops fgraph_ops __read_mostly = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL | + FTRACE_OPS_FL_RECURSION_SAFE, +}; + int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) { @@ -4938,7 +5029,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_graph_return = retfunc; ftrace_graph_entry = entryfunc; - ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); out: mutex_unlock(&ftrace_lock); @@ -4955,7 +5046,7 @@ void unregister_ftrace_graph(void) ftrace_graph_active--; ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; - ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d9fea7dfd5d3..9d20cd9743ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -235,13 +235,33 @@ void trace_array_put(struct trace_array *this_tr) mutex_unlock(&trace_types_lock); } -int filter_current_check_discard(struct ring_buffer *buffer, - struct ftrace_event_call *call, void *rec, - struct ring_buffer_event *event) +int filter_check_discard(struct ftrace_event_file *file, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) { - return filter_check_discard(call, rec, buffer, event); + if (unlikely(file->flags & FTRACE_EVENT_FL_FILTERED) && + !filter_match_preds(file->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} +EXPORT_SYMBOL_GPL(filter_check_discard); + +int call_filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; } -EXPORT_SYMBOL_GPL(filter_current_check_discard); +EXPORT_SYMBOL_GPL(call_filter_check_discard); cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu) { @@ -843,9 +863,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, if (isspace(ch)) { parser->buffer[parser->idx] = 0; parser->cont = false; - } else { + } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; + } else { + ret = -EINVAL; + goto out; } *ppos += read; @@ -1261,21 +1284,6 @@ int is_tracing_stopped(void) } /** - * ftrace_off_permanent - disable all ftrace code permanently - * - * This should only be called when a serious anomally has - * been detected. This will turn off the function tracing, - * ring buffers, and other tracing utilites. It takes no - * locks and can be called from any context. - */ -void ftrace_off_permanent(void) -{ - tracing_disabled = 1; - ftrace_stop(); - tracing_off_permanent(); -} - -/** * tracing_start - quick start of the tracer * * If tracing is enabled but was stopped by tracing_stop, @@ -1631,7 +1639,7 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); } @@ -1715,7 +1723,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, entry->size = trace.nr_entries; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out: @@ -1817,7 +1825,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) trace.entries = entry->caller; save_stack_trace_user(&trace); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out_drop_count: @@ -2009,7 +2017,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, tbuffer, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) { + if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } @@ -2064,7 +2072,7 @@ __trace_array_vprintk(struct ring_buffer *buffer, memcpy(&entry->buf, tbuffer, len); entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) { + if (!call_filter_check_discard(call, entry, buffer, event)) { __buffer_unlock_commit(buffer, event); ftrace_trace_stack(buffer, flags, 6, pc); } @@ -2761,7 +2769,7 @@ static void show_snapshot_main_help(struct seq_file *m) seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); seq_printf(m, "# Takes a snapshot of the main buffer.\n"); - seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate or free)\n"); seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); seq_printf(m, "# is not a '0' or '1')\n"); } @@ -2965,6 +2973,11 @@ int tracing_open_generic(struct inode *inode, struct file *filp) return 0; } +bool tracing_is_disabled(void) +{ + return (tracing_disabled) ? true: false; +} + /* * Open and update trace_array ref count. * Must have the current trace_array passed to it. @@ -5455,12 +5468,12 @@ static struct ftrace_func_command ftrace_snapshot_cmd = { .func = ftrace_trace_snapshot_callback, }; -static int register_snapshot_cmd(void) +static __init int register_snapshot_cmd(void) { return register_ftrace_command(&ftrace_snapshot_cmd); } #else -static inline int register_snapshot_cmd(void) { return 0; } +static inline __init int register_snapshot_cmd(void) { return 0; } #endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ struct dentry *tracing_init_dentry_tr(struct trace_array *tr) @@ -6254,6 +6267,17 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; iter->trace_buffer = &global_trace.trace_buffer; + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->trace_buffer->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* Output in nanoseconds only if we are using a clock in nanoseconds. */ + if (trace_clocks[iter->tr->clock_id].in_ns) + iter->iter_flags |= TRACE_FILE_TIME_IN_NS; } void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 73d08aa25b55..ea189e027b80 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -193,8 +193,8 @@ struct trace_array { #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); - DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); + struct ftrace_event_file __rcu *enter_syscall_files[NR_syscalls]; + struct ftrace_event_file __rcu *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; @@ -515,6 +515,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); +bool tracing_is_disabled(void); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, @@ -712,6 +713,8 @@ extern unsigned long trace_flags; #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_FILL_SHIFT 28 +#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); @@ -731,15 +734,16 @@ extern void __trace_graph_return(struct trace_array *tr, #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 -extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; +extern int ftrace_graph_notrace_count; +extern unsigned long ftrace_graph_notrace_funcs[FTRACE_GRAPH_MAX_FUNCS]; static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_filter_enabled) + if (!ftrace_graph_count) return 1; for (i = 0; i < ftrace_graph_count; i++) { @@ -759,11 +763,31 @@ static inline int ftrace_graph_addr(unsigned long addr) return 0; } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_notrace_count) + return 0; + + for (i = 0; i < ftrace_graph_notrace_count; i++) { + if (addr == ftrace_graph_notrace_funcs[i]) + return 1; + } + + return 0; +} #else static inline int ftrace_graph_addr(unsigned long addr) { return 1; } + +static inline int ftrace_graph_notrace_addr(unsigned long addr) +{ + return 0; +} #endif /* CONFIG_DYNAMIC_FTRACE */ #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t @@ -987,9 +1011,9 @@ struct filter_pred { extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); -extern void print_event_filter(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s); -extern int apply_event_filter(struct ftrace_event_call *call, +extern int apply_event_filter(struct ftrace_event_file *file, char *filter_string); extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string); @@ -1000,20 +1024,6 @@ extern int filter_assign_type(const char *type); struct ftrace_event_field * trace_find_event_field(struct ftrace_event_call *call, char *name); -static inline int -filter_check_discard(struct ftrace_event_call *call, void *rec, - struct ring_buffer *buffer, - struct ring_buffer_event *event) -{ - if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && - !filter_match_preds(call->filter, rec)) { - ring_buffer_discard_commit(buffer, event); - return 1; - } - - return 0; -} - extern void trace_event_enable_cmd_record(bool enable); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index d594da0dc03c..697fb9bac8f0 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -78,7 +78,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); out: diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 78e27e3b52ac..e854f420e033 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,12 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + if (tp_event->perf_perm) { + int ret = tp_event->perf_perm(tp_event, p_event); + if (ret) + return ret; + } + /* The ftrace function trace is allowed only for root. */ if (ftrace_event_is_function(tp_event) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) @@ -173,7 +179,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; - int event_id = p_event->attr.config; + u64 event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 368a4d50cc30..f919a2e21bf3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -989,7 +989,7 @@ static ssize_t event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; struct trace_seq *s; int r = -ENODEV; @@ -1004,12 +1004,12 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); mutex_lock(&event_mutex); - call = event_file_data(filp); - if (call) - print_event_filter(call, s); + file = event_file_data(filp); + if (file) + print_event_filter(file, s); mutex_unlock(&event_mutex); - if (call) + if (file) r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -1021,7 +1021,7 @@ static ssize_t event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; char *buf; int err = -ENODEV; @@ -1039,9 +1039,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, buf[cnt] = '\0'; mutex_lock(&event_mutex); - call = event_file_data(filp); - if (call) - err = apply_event_filter(call, buf); + file = event_file_data(filp); + if (file) + err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); free_page((unsigned long) buf); @@ -1062,6 +1062,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct trace_array *tr; int ret; + if (tracing_is_disabled()) + return -ENODEV; + /* Make sure the system still exists */ mutex_lock(&trace_types_lock); mutex_lock(&event_mutex); @@ -1108,6 +1111,9 @@ static int system_tr_open(struct inode *inode, struct file *filp) struct trace_array *tr = inode->i_private; int ret; + if (tracing_is_disabled()) + return -ENODEV; + if (trace_array_get(tr) < 0) return -ENODEV; @@ -1124,11 +1130,12 @@ static int system_tr_open(struct inode *inode, struct file *filp) if (ret < 0) { trace_array_put(tr); kfree(dir); + return ret; } filp->private_data = dir; - return ret; + return 0; } static int subsystem_release(struct inode *inode, struct file *file) @@ -1539,7 +1546,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) return -1; } } - trace_create_file("filter", 0644, file->dir, call, + trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); trace_create_file("format", 0444, file->dir, call, @@ -1577,6 +1584,7 @@ static void event_remove(struct ftrace_event_call *call) if (file->event_call != call) continue; ftrace_event_enable_disable(file, 0); + destroy_preds(file); /* * The do_for_each_event_file() is * a double loop. After finding the call for this @@ -1700,7 +1708,7 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) { event_remove(call); trace_destroy_fields(call); - destroy_preds(call); + destroy_call_preds(call); } static int probe_remove_event_call(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 97daa8cf958d..2468f56dc5db 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -637,10 +637,18 @@ static void append_filter_err(struct filter_parse_state *ps, free_page((unsigned long) buf); } +static inline struct event_filter *event_filter(struct ftrace_event_file *file) +{ + if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + return file->event_call->filter; + else + return file->filter; +} + /* caller must hold event_mutex */ -void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +void print_event_filter(struct ftrace_event_file *file, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter = event_filter(file); if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); @@ -766,11 +774,21 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void filter_disable(struct ftrace_event_call *call) +static void call_filter_disable(struct ftrace_event_call *call) { call->flags &= ~TRACE_EVENT_FL_FILTERED; } +static void filter_disable(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call_filter_disable(call); + else + file->flags &= ~FTRACE_EVENT_FL_FILTERED; +} + static void __free_filter(struct event_filter *filter) { if (!filter) @@ -781,16 +799,30 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +void destroy_call_preds(struct ftrace_event_call *call) +{ + __free_filter(call->filter); + call->filter = NULL; +} + +static void destroy_file_preds(struct ftrace_event_file *file) +{ + __free_filter(file->filter); + file->filter = NULL; +} + /* - * Called when destroying the ftrace_event_call. - * The call is being freed, so we do not need to worry about - * the call being currently used. This is for module code removing + * Called when destroying the ftrace_event_file. + * The file is being freed, so we do not need to worry about + * the file being currently used. This is for module code removing * the tracepoints from within it. */ -void destroy_preds(struct ftrace_event_call *call) +void destroy_preds(struct ftrace_event_file *file) { - __free_filter(call->filter); - call->filter = NULL; + if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + destroy_call_preds(file->event_call); + else + destroy_file_preds(file); } static struct event_filter *__alloc_filter(void) @@ -825,28 +857,56 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static inline void __remove_filter(struct ftrace_event_file *file) { + struct ftrace_event_call *call = file->event_call; + + filter_disable(file); + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + remove_filter_string(call->filter); + else + remove_filter_string(file->filter); +} + +static void filter_free_subsystem_preds(struct event_subsystem *system, + struct trace_array *tr) +{ + struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; - filter_disable(call); - remove_filter_string(call->filter); + __remove_filter(file); } } -static void filter_free_subsystem_filters(struct event_subsystem *system) +static inline void __free_subsystem_filter(struct ftrace_event_file *file) { + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) { + __free_filter(call->filter); + call->filter = NULL; + } else { + __free_filter(file->filter); + file->filter = NULL; + } +} + +static void filter_free_subsystem_filters(struct event_subsystem *system, + struct trace_array *tr) +{ + struct ftrace_event_file *file; struct ftrace_event_call *call; - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; - __free_filter(call->filter); - call->filter = NULL; + __free_subsystem_filter(file); } } @@ -1617,15 +1677,85 @@ fail: return err; } +static inline void event_set_filtered_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_FILTERED; + else + file->flags |= FTRACE_EVENT_FL_FILTERED; +} + +static inline void event_set_filter(struct ftrace_event_file *file, + struct event_filter *filter) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + rcu_assign_pointer(call->filter, filter); + else + rcu_assign_pointer(file->filter, filter); +} + +static inline void event_clear_filter(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + RCU_INIT_POINTER(call->filter, NULL); + else + RCU_INIT_POINTER(file->filter, NULL); +} + +static inline void +event_set_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags |= FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline void +event_clear_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; + else + file->flags &= ~FTRACE_EVENT_FL_NO_SET_FILTER; +} + +static inline bool +event_no_set_filter_flag(struct ftrace_event_file *file) +{ + struct ftrace_event_call *call = file->event_call; + + if (file->flags & FTRACE_EVENT_FL_NO_SET_FILTER) + return true; + + if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) && + (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)) + return true; + + return false; +} + struct filter_list { struct list_head list; struct event_filter *filter; }; static int replace_system_preds(struct event_subsystem *system, + struct trace_array *tr, struct filter_parse_state *ps, char *filter_string) { + struct ftrace_event_file *file; struct ftrace_event_call *call; struct filter_list *filter_item; struct filter_list *tmp; @@ -1633,8 +1763,8 @@ static int replace_system_preds(struct event_subsystem *system, bool fail = true; int err; - list_for_each_entry(call, &ftrace_events, list) { - + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (strcmp(call->class->system, system->name) != 0) continue; @@ -1644,18 +1774,20 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + event_set_no_set_filter_flag(file); else - call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; + event_clear_no_set_filter_flag(file); } - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { struct event_filter *filter; + call = file->event_call; + if (strcmp(call->class->system, system->name) != 0) continue; - if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + if (event_no_set_filter_flag(file)) continue; filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); @@ -1676,17 +1808,17 @@ static int replace_system_preds(struct event_subsystem *system, err = replace_preds(call, filter, ps, filter_string, false); if (err) { - filter_disable(call); + filter_disable(file); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); append_filter_err(ps, filter); } else - call->flags |= TRACE_EVENT_FL_FILTERED; + event_set_filtered_flag(file); /* * Regardless of if this returned an error, we still * replace the filter for the call. */ - filter = call->filter; - rcu_assign_pointer(call->filter, filter_item->filter); + filter = event_filter(file); + event_set_filter(file, filter_item->filter); filter_item->filter = filter; fail = false; @@ -1816,6 +1948,7 @@ static int create_filter(struct ftrace_event_call *call, * and always remembers @filter_str. */ static int create_system_filter(struct event_subsystem *system, + struct trace_array *tr, char *filter_str, struct event_filter **filterp) { struct event_filter *filter = NULL; @@ -1824,7 +1957,7 @@ static int create_system_filter(struct event_subsystem *system, err = create_filter_start(filter_str, true, &ps, &filter); if (!err) { - err = replace_system_preds(system, ps, filter_str); + err = replace_system_preds(system, tr, ps, filter_str); if (!err) { /* System filters just show a default message */ kfree(filter->filter_string); @@ -1840,20 +1973,25 @@ static int create_system_filter(struct event_subsystem *system, } /* caller must hold event_mutex */ -int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +int apply_event_filter(struct ftrace_event_file *file, char *filter_string) { + struct ftrace_event_call *call = file->event_call; struct event_filter *filter; int err; if (!strcmp(strstrip(filter_string), "0")) { - filter_disable(call); - filter = call->filter; + filter_disable(file); + filter = event_filter(file); + if (!filter) return 0; - RCU_INIT_POINTER(call->filter, NULL); + + event_clear_filter(file); + /* Make sure the filter is not being used */ synchronize_sched(); __free_filter(filter); + return 0; } @@ -1866,14 +2004,15 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) * string */ if (filter) { - struct event_filter *tmp = call->filter; + struct event_filter *tmp; + tmp = event_filter(file); if (!err) - call->flags |= TRACE_EVENT_FL_FILTERED; + event_set_filtered_flag(file); else - filter_disable(call); + filter_disable(file); - rcu_assign_pointer(call->filter, filter); + event_set_filter(file, filter); if (tmp) { /* Make sure the call is done with the filter */ @@ -1889,6 +2028,7 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string) { struct event_subsystem *system = dir->subsystem; + struct trace_array *tr = dir->tr; struct event_filter *filter; int err = 0; @@ -1901,18 +2041,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, } if (!strcmp(strstrip(filter_string), "0")) { - filter_free_subsystem_preds(system); + filter_free_subsystem_preds(system, tr); remove_filter_string(system->filter); filter = system->filter; system->filter = NULL; /* Ensure all filters are no longer used */ synchronize_sched(); - filter_free_subsystem_filters(system); + filter_free_subsystem_filters(system, tr); __free_filter(filter); goto out_unlock; } - err = create_system_filter(system, filter_string, &filter); + err = create_system_filter(system, tr, filter_string, &filter); if (filter) { /* * No event actually uses the system filter diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d21a74670088..7c3e3e72e2b6 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -180,7 +180,7 @@ struct ftrace_event_call __used event_##call = { \ .event.type = etype, \ .class = &event_class_ftrace_##call, \ .print_fmt = print, \ - .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b5c09242683d..0b99120d395c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -82,9 +82,9 @@ static struct trace_array *graph_array; * to fill in space into DURATION column. */ enum { - DURATION_FILL_FULL = -1, - DURATION_FILL_START = -2, - DURATION_FILL_END = -3, + FLAGS_FILL_FULL = 1 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_START = 2 << TRACE_GRAPH_PRINT_FILL_SHIFT, + FLAGS_FILL_END = 3 << TRACE_GRAPH_PRINT_FILL_SHIFT, }; static enum print_line_t @@ -114,16 +114,37 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, return -EBUSY; } + /* + * The curr_ret_stack is an index to ftrace return stack of + * current task. Its value should be in [0, FTRACE_RETFUNC_ + * DEPTH) when the function graph tracer is used. To support + * filtering out specific functions, it makes the index + * negative by subtracting huge value (FTRACE_NOTRACE_DEPTH) + * so when it sees a negative index the ftrace will ignore + * the record. And the index gets recovered when returning + * from the filtered function by adding the FTRACE_NOTRACE_ + * DEPTH and then it'll continue to record functions normally. + * + * The curr_ret_stack is initialized to -1 and get increased + * in this function. So it can be less than -1 only if it was + * filtered out via ftrace_graph_notrace_addr() which can be + * set from set_graph_notrace file in debugfs by user. + */ + if (current->curr_ret_stack < -1) + return -EBUSY; + calltime = trace_clock_local(); index = ++current->curr_ret_stack; + if (ftrace_graph_notrace_addr(func)) + current->curr_ret_stack -= FTRACE_NOTRACE_DEPTH; barrier(); current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; current->ret_stack[index].fp = frame_pointer; - *depth = index; + *depth = current->curr_ret_stack; return 0; } @@ -137,7 +158,17 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, index = current->curr_ret_stack; - if (unlikely(index < 0)) { + /* + * A negative index here means that it's just returned from a + * notrace'd function. Recover index to get an original + * return address. See ftrace_push_return_trace(). + * + * TODO: Need to check whether the stack gets corrupted. + */ + if (index < 0) + index += FTRACE_NOTRACE_DEPTH; + + if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) { ftrace_graph_stop(); WARN_ON(1); /* Might as well panic, otherwise we have no where to go */ @@ -193,6 +224,15 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) trace.rettime = trace_clock_local(); barrier(); current->curr_ret_stack--; + /* + * The curr_ret_stack can be less than -1 only if it was + * filtered out and it's about to return from the function. + * Recover the index and continue to trace normal functions. + */ + if (current->curr_ret_stack < -1) { + current->curr_ret_stack += FTRACE_NOTRACE_DEPTH; + return ret; + } /* * The trace should run after decrementing the ret counter @@ -230,7 +270,7 @@ int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); return 1; @@ -259,10 +299,20 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) /* trace it when it is-nested-in or is a function enabled. */ if ((!(trace->depth || ftrace_graph_addr(trace->func)) || - ftrace_graph_ignore_irqs()) || + ftrace_graph_ignore_irqs()) || (trace->depth < 0) || (max_depth && trace->depth >= max_depth)) return 0; + /* + * Do not trace a function if it's filtered by set_graph_notrace. + * Make the index of ret stack negative to indicate that it should + * ignore further functions. But it needs its own ret stack entry + * to recover the original index in order to continue tracing after + * returning from the function. + */ + if (ftrace_graph_notrace_addr(trace->func)) + return 1; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -335,7 +385,7 @@ void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); } @@ -652,7 +702,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, } /* No overhead */ - ret = print_graph_duration(DURATION_FILL_START, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_START); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -664,7 +714,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (!ret) return TRACE_TYPE_PARTIAL_LINE; - ret = print_graph_duration(DURATION_FILL_END, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_END); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -729,14 +779,14 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s, return TRACE_TYPE_HANDLED; /* No real adata, just filling the column with spaces */ - switch (duration) { - case DURATION_FILL_FULL: + switch (flags & TRACE_GRAPH_PRINT_FILL_MASK) { + case FLAGS_FILL_FULL: ret = trace_seq_puts(s, " | "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_START: + case FLAGS_FILL_START: ret = trace_seq_puts(s, " "); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; - case DURATION_FILL_END: + case FLAGS_FILL_END: ret = trace_seq_puts(s, " |"); return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; } @@ -852,7 +902,7 @@ print_graph_entry_nested(struct trace_iterator *iter, } /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); if (ret != TRACE_TYPE_HANDLED) return ret; @@ -1172,7 +1222,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, return TRACE_TYPE_PARTIAL_LINE; /* No time */ - ret = print_graph_duration(DURATION_FILL_FULL, s, flags); + ret = print_graph_duration(0, s, flags | FLAGS_FILL_FULL); if (ret != TRACE_TYPE_HANDLED) return ret; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 243f6834d026..dae9541ada9e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -835,7 +835,7 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs, entry->ip = (unsigned long)tp->rp.kp.addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!filter_check_discard(ftrace_file, entry, buffer, event)) trace_buffer_unlock_commit_regs(buffer, event, irq_flags, pc, regs); } @@ -884,7 +884,7 @@ __kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!filter_check_discard(ftrace_file, entry, buffer, event)) trace_buffer_unlock_commit_regs(buffer, event, irq_flags, pc, regs); } diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b3dcfb2f0fef..0abd9b863474 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -323,7 +323,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->rw = *rw; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } @@ -353,7 +353,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->map = *map; - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, pc); } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 4e98e3b257a3..3f34dc9b40f3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -45,7 +45,7 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, flags, pc); } @@ -101,7 +101,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - if (!filter_check_discard(call, entry, buffer, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, flags, pc); } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 847f88a6194b..7af67360b330 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -43,46 +43,15 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ static struct dentry *stat_dir; -/* - * Iterate through the rbtree using a post order traversal path - * to release the next node. - * It won't necessary release one at each iteration - * but it will at least advance closer to the next one - * to be released. - */ -static struct rb_node *release_next(struct tracer_stat *ts, - struct rb_node *node) +static void __reset_stat_session(struct stat_session *session) { - struct stat_node *snode; - struct rb_node *parent = rb_parent(node); - - if (node->rb_left) - return node->rb_left; - else if (node->rb_right) - return node->rb_right; - else { - if (!parent) - ; - else if (parent->rb_left == node) - parent->rb_left = NULL; - else - parent->rb_right = NULL; + struct stat_node *snode, *n; - snode = container_of(node, struct stat_node, node); - if (ts->stat_release) - ts->stat_release(snode->stat); + rbtree_postorder_for_each_entry_safe(snode, n, &session->stat_root, node) { + if (session->ts->stat_release) + session->ts->stat_release(snode->stat); kfree(snode); - - return parent; } -} - -static void __reset_stat_session(struct stat_session *session) -{ - struct rb_node *node = session->stat_root.rb_node; - - while (node) - node = release_next(session->ts, node); session->stat_root = RB_ROOT; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 559329d9bd2f..e4b6d11bdf78 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -302,6 +302,7 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call) static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { struct trace_array *tr = data; + struct ftrace_event_file *ftrace_file; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -314,7 +315,13 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ + ftrace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -336,8 +343,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); - if (!filter_current_check_discard(buffer, sys_data->enter_event, - entry, event)) + if (!filter_check_discard(ftrace_file, entry, buffer, event)) trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc); } @@ -345,6 +351,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { struct trace_array *tr = data; + struct ftrace_event_file *ftrace_file; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -356,7 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) + + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ + ftrace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); + if (!ftrace_file) + return; + + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &ftrace_file->flags)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -377,8 +390,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) entry->nr = syscall_nr; entry->ret = syscall_get_return_value(current, regs); - if (!filter_current_check_discard(buffer, sys_data->exit_event, - entry, event)) + if (!filter_check_discard(ftrace_file, entry, buffer, event)) trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc); } @@ -397,7 +409,7 @@ static int reg_event_syscall_enter(struct ftrace_event_file *file, if (!tr->sys_refcount_enter) ret = register_trace_sys_enter(ftrace_syscall_enter, tr); if (!ret) { - set_bit(num, tr->enabled_enter_syscalls); + rcu_assign_pointer(tr->enter_syscall_files[num], file); tr->sys_refcount_enter++; } mutex_unlock(&syscall_trace_lock); @@ -415,10 +427,15 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_enter--; - clear_bit(num, tr->enabled_enter_syscalls); + rcu_assign_pointer(tr->enter_syscall_files[num], NULL); if (!tr->sys_refcount_enter) unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); + /* + * Callers expect the event to be completely disabled on + * return, so wait for current handlers to finish. + */ + synchronize_sched(); } static int reg_event_syscall_exit(struct ftrace_event_file *file, @@ -435,7 +452,7 @@ static int reg_event_syscall_exit(struct ftrace_event_file *file, if (!tr->sys_refcount_exit) ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - set_bit(num, tr->enabled_exit_syscalls); + rcu_assign_pointer(tr->exit_syscall_files[num], file); tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); @@ -453,10 +470,15 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, return; mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - clear_bit(num, tr->enabled_exit_syscalls); + rcu_assign_pointer(tr->exit_syscall_files[num], NULL); if (!tr->sys_refcount_exit) unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); + /* + * Callers expect the event to be completely disabled on + * return, so wait for current handlers to finish. + */ + synchronize_sched(); } static int __init init_syscall_trace(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 272261b5f94f..b6dcc42ef7f5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -128,6 +128,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); + tu->call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER; return tu; error: @@ -561,7 +562,7 @@ static void uprobe_trace_print(struct trace_uprobe *tu, for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); - if (!filter_current_check_discard(buffer, call, entry, event)) + if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, 0); } diff --git a/kernel/up.c b/kernel/up.c index 630d72bf7e41..509403e3fbc6 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +void __smp_call_function_single(int cpu, struct call_single_data *csd, + int wait) +{ + unsigned long flags; + + local_irq_save(flags); + csd->func(csd->info); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__smp_call_function_single); + int on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; diff --git a/kernel/user.c b/kernel/user.c index 5bbb91988e69..a3a0dbfda329 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,10 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, +#ifdef CONFIG_KEYS_KERBEROS_CACHE + .krb_cache_register_sem = + __RWSEM_INITIALIZER(init_user_ns.krb_cache_register_sem), +#endif }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 13fb1134ba58..240fb62cf394 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -101,6 +101,9 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); +#ifdef CONFIG_PERSISTENT_KEYRINGS + init_rwsem(&ns->persistent_keyring_register_sem); +#endif return 0; } @@ -130,6 +133,9 @@ void free_user_ns(struct user_namespace *ns) do { parent = ns->parent; +#ifdef CONFIG_PERSISTENT_KEYRINGS + key_put(ns->persistent_keyring_register); +#endif proc_free_inum(ns->proc_inum); kmem_cache_free(user_ns_cachep, ns); ns = parent; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 987293d03ebc..c66912be990f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -305,6 +305,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; +/* I: attributes used when instantiating ordered pools on demand */ +static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -518,14 +521,21 @@ static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif -/* allocate ID and assign it to @pool */ +/** + * worker_pool_assign_id - allocate ID and assing it to @pool + * @pool: the pool pointer of interest + * + * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned + * successfully, -errno on failure. + */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; lockdep_assert_held(&wq_pool_mutex); - ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); + ret = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE, + GFP_KERNEL); if (ret >= 0) { pool->id = ret; return 0; @@ -1320,7 +1330,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, debug_work_activate(work); - /* if dying, only works from the same workqueue are allowed */ + /* if draining, only works from the same workqueue are allowed */ if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; @@ -1736,16 +1746,17 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; + set_user_nice(worker->task, pool->attrs->nice); + + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; + /* * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any * online CPUs. It'll be re-applied when any of the CPUs come up. */ - set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* prevent userland from meddling with cpumask of workqueue workers */ - worker->task->flags |= PF_NO_SETAFFINITY; - /* * The caller is responsible for ensuring %POOL_DISASSOCIATED * remains stable across this function. See the comments above the @@ -4106,7 +4117,7 @@ out_unlock: static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; - int cpu; + int cpu, ret; if (!(wq->flags & WQ_UNBOUND)) { wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); @@ -4126,6 +4137,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) mutex_unlock(&wq->mutex); } return 0; + } else if (wq->flags & __WQ_ORDERED) { + ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]); + /* there should only be single pwq for ordering guarantee */ + WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node || + wq->pwqs.prev != &wq->dfl_pwq->pwqs_node), + "ordering guarantee broken for workqueue %s\n", wq->name); + return ret; } else { return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } @@ -5009,10 +5027,6 @@ static int __init init_workqueues(void) int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; int i, cpu; - /* make sure we have enough bits for OFFQ pool ID */ - BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < - WORK_CPU_END * NR_STD_WORKER_POOLS); - WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); @@ -5051,13 +5065,23 @@ static int __init init_workqueues(void) } } - /* create default unbound wq attrs */ + /* create default unbound and ordered wq attrs */ for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; + + /* + * An ordered wq should have only one pwq as ordering is + * guaranteed by max_active which is enforced by pwqs. + * Turn off NUMA so that dfl_pwq is used for all nodes. + */ + BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + attrs->nice = std_nice[i]; + attrs->no_numa = true; + ordered_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0); |