diff options
Diffstat (limited to 'kernel')
103 files changed, 2957 insertions, 1719 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index 1a9f929fe629..986c8214dabf 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -246,7 +246,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = __mnt_want_write(internal); + err = mnt_get_write_access(internal); if (err) { mntput(internal); kfree(acct); @@ -271,7 +271,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); mntput(mnt); return 0; } diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e867c17d3f84..85a5b306733b 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -34,7 +34,7 @@ struct audit_chunk { struct list_head list; struct audit_tree *owner; unsigned index; /* index; upper bit indicates 'will prune' */ - } owners[]; + } owners[] __counted_by(count); }; struct audit_tree_mark { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 65075f1e4ac8..91e82e34b51e 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -527,11 +527,18 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - exe_file = get_task_exe_file(tsk); + /* only do exe filtering if we are recording @current events/records */ + if (tsk != current) + return 0; + + if (WARN_ON_ONCE(!current->mm)) + return 0; + exe_file = get_mm_exe_file(current->mm); if (!exe_file) return 0; ino = file_inode(exe_file)->i_ino; dev = file_inode(exe_file)->i_sb->s_dev; fput(exe_file); + return audit_mark_compare(mark, ino, dev); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 21d2fa815e78..6f0d6fb6523f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2212,7 +2212,7 @@ __audit_reusename(const __user char *uptr) if (!n->name) continue; if (n->name->uptr == uptr) { - n->name->refcnt++; + atomic_inc(&n->name->refcnt); return n->name; } } @@ -2241,7 +2241,7 @@ void 
__audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - name->refcnt++; + atomic_inc(&name->refcnt); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2373,7 +2373,7 @@ out_alloc: return; if (name) { n->name = name; - name->refcnt++; + atomic_inc(&name->refcnt); } out: @@ -2500,7 +2500,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - found_child->name->refcnt++; + atomic_inc(&found_child->name->refcnt); } } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1095bbe29859..8090d7fb11ef 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8501,7 +8501,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, tname = btf_name_by_offset(btf, walk_type->name_off); ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix); - if (ret < 0) + if (ret >= sizeof(safe_tname)) return false; safe_id = btf_find_by_name_kind(btf, safe_tname, BTF_INFO_KIND(walk_type->info)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 5b2741aa0d9b..03b3d4492980 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -785,7 +785,8 @@ found: * to descendants * @cgrp: The cgroup which descendants to traverse * @link: A link for which to replace BPF program - * @type: Type of attach operation + * @new_prog: &struct bpf_prog for the target BPF program with its refcnt + * incremented * * Must be called with cgroup_mutex held. */ @@ -1334,7 +1335,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received - * @type: The type of program to be executed + * @atype: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. 
@@ -1424,7 +1425,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); /** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate - * @type: The type of program to be executed + * @atype: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * @@ -1449,7 +1450,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user - * @type: The type of program to be executed + * @atype: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). @@ -1496,7 +1497,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. - * @type: The type of program to be executed + * @atype: The type of program to be executed * * socket passed is expected to be of type INET or INET6. * @@ -1670,7 +1671,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @type: type of program to be executed + * @atype: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and * can allow or deny such access. 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 99d0625b6c82..1aafb2ff2e95 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -118,8 +118,7 @@ static struct inode *bpf_get_inode(struct super_block *sb, return ERR_PTR(-ENOSPC); inode->i_ino = get_next_ino(); - inode->i_atime = inode_set_ctime_current(inode); - inode->i_mtime = inode->i_atime; + simple_inode_init_ts(inode); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); @@ -147,7 +146,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 9c49ae53deaf..d93ddac283d4 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -459,8 +459,7 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c) * Typical case will be between 11K and 116K closer to 11K. * bpf progs can and should share bpf_mem_cache when possible. */ - -static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +static void init_refill_work(struct bpf_mem_cache *c) { init_irq_work(&c->refill_work, bpf_mem_refill); if (c->unit_size <= 256) { @@ -476,7 +475,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) c->high_watermark = max(96 * 256 / c->unit_size, 3); } c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); +} +static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) +{ /* To avoid consuming memory assume that 1st run of bpf * prog won't be doing more than 4 map_update_elem from * irq disabled region @@ -484,6 +486,31 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) alloc_bulk(c, c->unit_size <= 256 ? 
4 : 1, cpu_to_node(cpu), false); } +static int check_obj_size(struct bpf_mem_cache *c, unsigned int idx) +{ + struct llist_node *first; + unsigned int obj_size; + + /* For per-cpu allocator, the size of free objects in free list doesn't + * match with unit_size and now there is no way to get the size of + * per-cpu pointer saved in free object, so just skip the checking. + */ + if (c->percpu_size) + return 0; + + first = c->free_llist.first; + if (!first) + return 0; + + obj_size = ksize(first); + if (obj_size != c->unit_size) { + WARN_ONCE(1, "bpf_mem_cache[%u]: unexpected object size %u, expect %u\n", + idx, obj_size, c->unit_size); + return -EINVAL; + } + return 0; +} + /* When size != 0 bpf_mem_cache for each cpu. * This is typical bpf hash map use case when all elements have equal size. * @@ -494,10 +521,10 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) { static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; + int cpu, i, err, unit_size, percpu_size = 0; struct bpf_mem_caches *cc, __percpu *pcc; struct bpf_mem_cache *c, __percpu *pc; struct obj_cgroup *objcg = NULL; - int cpu, i, unit_size, percpu_size = 0; if (size) { pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); @@ -521,6 +548,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c->objcg = objcg; c->percpu_size = percpu_size; c->tgt = c; + init_refill_work(c); prefill_mem_cache(c, cpu); } ma->cache = pc; @@ -534,6 +562,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); if (!pcc) return -ENOMEM; + err = 0; #ifdef CONFIG_MEMCG_KMEM objcg = get_obj_cgroup_from_current(); #endif @@ -544,11 +573,30 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) c->unit_size = sizes[i]; c->objcg = objcg; c->tgt = c; + + init_refill_work(c); + /* Another bpf_mem_cache will 
be used when allocating + * c->unit_size in bpf_mem_alloc(), so doesn't prefill + * for the bpf_mem_cache because these free objects will + * never be used. + */ + if (i != bpf_mem_cache_idx(c->unit_size)) + continue; prefill_mem_cache(c, cpu); + err = check_obj_size(c, i); + if (err) + goto out; } } + +out: ma->caches = pcc; - return 0; + /* refill_work is either zeroed or initialized, so it is safe to + * call irq_work_sync(). + */ + if (err) + bpf_mem_alloc_destroy(ma); + return err; } static void drain_mem_cache(struct bpf_mem_cache *c) @@ -916,3 +964,35 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } + +static __init int bpf_mem_cache_adjust_size(void) +{ + unsigned int size; + + /* Adjusting the indexes in size_index() according to the object_size + * of underlying slab cache, so bpf_mem_alloc() will select a + * bpf_mem_cache with unit_size equal to the object_size of + * the underlying slab cache. + * + * The maximal value of KMALLOC_MIN_SIZE and __kmalloc_minalign() is + * 256-bytes, so only do adjustment for [8-bytes, 192-bytes]. 
+ */ + for (size = 192; size >= 8; size -= 8) { + unsigned int kmalloc_size, index; + + kmalloc_size = kmalloc_size_roundup(size); + if (kmalloc_size == size) + continue; + + if (kmalloc_size <= 192) + index = size_index[(kmalloc_size - 1) / 8]; + else + index = fls(kmalloc_size - 1) - 1; + /* Only overwrite if necessary */ + if (size_index[(size - 1) / 8] != index) + size_index[(size - 1) / 8] = index; + } + + return 0; +} +subsys_initcall(bpf_mem_cache_adjust_size); diff --git a/kernel/bpf/mprog.c b/kernel/bpf/mprog.c index 32d2c4829eb8..1394168062e8 100644 --- a/kernel/bpf/mprog.c +++ b/kernel/bpf/mprog.c @@ -253,6 +253,9 @@ int bpf_mprog_attach(struct bpf_mprog_entry *entry, goto out; } idx = tidx; + } else if (bpf_mprog_total(entry) == bpf_mprog_max()) { + ret = -ERANGE; + goto out; } if (flags & BPF_F_BEFORE) { tidx = bpf_mprog_pos_before(entry, &rtuple); @@ -398,14 +401,16 @@ int bpf_mprog_query(const union bpf_attr *attr, union bpf_attr __user *uattr, struct bpf_mprog_cp *cp; struct bpf_prog *prog; const u32 flags = 0; + u32 id, count = 0; + u64 revision = 1; int i, ret = 0; - u32 id, count; - u64 revision; if (attr->query.query_flags || attr->query.attach_flags) return -EINVAL; - revision = bpf_mprog_revision(entry); - count = bpf_mprog_total(entry); + if (entry) { + revision = bpf_mprog_revision(entry); + count = bpf_mprog_total(entry); + } if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) return -EFAULT; if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 3e4f2ec1af06..87d6693d8233 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -199,12 +199,14 @@ static int __bpf_prog_dev_bound_init(struct bpf_prog *prog, struct net_device *n offload->netdev = netdev; ondev = bpf_offload_find_netdev(offload->netdev); + /* When program is offloaded require presence of "true" + * bpf_offload_netdev, avoid the one created for !ondev case below. 
+ */ + if (bpf_prog_is_offloaded(prog->aux) && (!ondev || !ondev->offdev)) { + err = -EINVAL; + goto err_free; + } if (!ondev) { - if (bpf_prog_is_offloaded(prog->aux)) { - err = -EINVAL; - goto err_free; - } - /* When only binding to the device, explicitly * create an entry in the hashtable. */ diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8d2ddcb7566b..d869f51ea93a 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -98,7 +98,12 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete) int err = 0; void *ptr; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -128,7 +133,12 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete) void *ptr; u32 index; - raw_spin_lock_irqsave(&qs->lock, flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, flags); + } if (queue_stack_map_is_empty(qs)) { memset(value, 0, qs->map.value_size); @@ -193,7 +203,12 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value, if (flags & BPF_NOEXIST || flags > BPF_EXIST) return -EINVAL; - raw_spin_lock_irqsave(&qs->lock, irq_flags); + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) + return -EBUSY; + } else { + raw_spin_lock_irqsave(&qs->lock, irq_flags); + } if (queue_stack_map_is_full(qs)) { if (!replace) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index eb01c31ed591..d77b2f8b9364 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3796,7 +3796,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) { enum bpf_prog_type ptype; struct bpf_prog *prog; - u32 mask; int ret; if (CHECK_ATTR(BPF_PROG_ATTACH)) @@ -3805,10 
+3804,16 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = attach_type_to_prog_type(attr->attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC) return -EINVAL; - mask = bpf_mprog_supported(ptype) ? - BPF_F_ATTACH_MASK_MPROG : BPF_F_ATTACH_MASK_BASE; - if (attr->attach_flags & ~mask) - return -EINVAL; + if (bpf_mprog_supported(ptype)) { + if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) + return -EINVAL; + } else { + if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) + return -EINVAL; + if (attr->relative_fd || + attr->expected_revision) + return -EINVAL; + } prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) @@ -3878,6 +3883,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); } + } else if (attr->attach_flags || + attr->relative_fd || + attr->expected_revision) { + return -EINVAL; } switch (ptype) { @@ -3913,7 +3922,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) return ret; } -#define BPF_PROG_QUERY_LAST_FIELD query.link_attach_flags +#define BPF_PROG_QUERY_LAST_FIELD query.revision static int bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index c4ab9d6cdbe9..82ad23b1d257 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -308,11 +308,9 @@ again: rcu_read_lock(); for (;; curr_fd++) { struct file *f; - f = task_lookup_next_fd_rcu(curr_task, &curr_fd); + f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); if (!f) break; - if (!get_file_rcu(f)) - continue; /* set info->fd */ info->fd = curr_fd; diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c index 13f0b5dc8262..1338a13a8b64 100644 --- a/kernel/bpf/tcx.c +++ b/kernel/bpf/tcx.c @@ -123,7 +123,6 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) { bool ingress = attr->query.attach_type == BPF_TCX_INGRESS; struct net *net = current->nsproxy->net_ns; - struct bpf_mprog_entry *entry; struct 
net_device *dev; int ret; @@ -133,12 +132,7 @@ int tcx_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr) ret = -ENODEV; goto out; } - entry = tcx_entry_fetch(dev, ingress); - if (!entry) { - ret = -ENOENT; - goto out; - } - ret = bpf_mprog_query(attr, uattr, entry); + ret = bpf_mprog_query(attr, uattr, tcx_entry_fetch(dev, ingress)); out: rtnl_unlock(); return ret; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb78212fa5b2..873ade146f3d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4047,11 +4047,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) bitmap_from_u64(mask, bt_reg_mask(bt)); for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; - if (reg->type != SCALAR_VALUE) { - bt_clear_reg(bt, i); - continue; - } - reg->precise = true; + bt_clear_reg(bt, i); + if (reg->type == SCALAR_VALUE) + reg->precise = true; } return 0; } @@ -14481,7 +14479,7 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum enforce_attach_type_range = tnum_unknown; const struct bpf_prog *prog = env->prog; struct bpf_reg_state *reg; - struct tnum range = tnum_range(0, 1); + struct tnum range = tnum_range(0, 1), const_0 = tnum_const(0); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; struct bpf_func_state *frame = env->cur_state->frame[0]; @@ -14529,8 +14527,8 @@ static int check_return_code(struct bpf_verifier_env *env) return -EINVAL; } - if (!tnum_in(tnum_const(0), reg->var_off)) { - verbose_invalid_scalar(env, reg, &range, "async callback", "R0"); + if (!tnum_in(const_0, reg->var_off)) { + verbose_invalid_scalar(env, reg, &const_0, "async callback", "R0"); return -EINVAL; } return 0; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c487ffef6652..76db6c67e39a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -360,10 +360,9 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, } 
css_task_iter_end(&it); length = n; - /* now sort & (if procs) strip out duplicates */ + /* now sort & strip out duplicates (tgids or recycled thread PIDs) */ sort(array, length, sizeof(pid_t), cmppid, NULL); - if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(array, length); + length = pidlist_uniq(array, length); l = cgroup_pidlist_find_create(cgrp, type); if (!l) { diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config new file mode 100644 index 000000000000..95a400f042b1 --- /dev/null +++ b/kernel/configs/hardening.config @@ -0,0 +1,98 @@ +# Help: Basic kernel hardening options +# +# These are considered the basic kernel hardening, self-protection, and +# attack surface reduction options. They are expected to have low (or +# no) performance impact on most workloads, and have a reasonable level +# of legacy API removals. + +# Make sure reporting of various hardening actions is possible. +CONFIG_BUG=y + +# Basic kernel memory permission enforcement. +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_VMAP_STACK=y + +# Kernel image and memory ASLR. +CONFIG_RANDOMIZE_BASE=y +CONFIG_RANDOMIZE_MEMORY=y + +# Randomize allocator freelists, harden metadata. +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_RANDOM_KMALLOC_CACHES=y + +# Randomize kernel stack offset on syscall entry. +CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT=y + +# Basic stack frame overflow protection. +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y + +# Basic buffer length bounds checking. +CONFIG_HARDENED_USERCOPY=y +CONFIG_FORTIFY_SOURCE=y + +# Basic array index bounds checking. +CONFIG_UBSAN=y +CONFIG_UBSAN_TRAP=y +CONFIG_UBSAN_BOUNDS=y +# CONFIG_UBSAN_SHIFT is not set +# CONFIG_UBSAN_DIV_ZERO +# CONFIG_UBSAN_UNREACHABLE +# CONFIG_UBSAN_BOOL +# CONFIG_UBSAN_ENUM +# CONFIG_UBSAN_ALIGNMENT +CONFIG_UBSAN_SANITIZE_ALL=y + +# Linked list integrity checking. 
+CONFIG_LIST_HARDENED=y + +# Initialize all heap variables to zero on allocation. +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y + +# Initialize all stack variables to zero on function entry. +CONFIG_INIT_STACK_ALL_ZERO=y + +# Wipe RAM at reboot via EFI. For more details, see: +# https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/ +# https://bugzilla.redhat.com/show_bug.cgi?id=1532058 +CONFIG_RESET_ATTACK_MITIGATION=y + +# Disable DMA between EFI hand-off and the kernel's IOMMU setup. +CONFIG_EFI_DISABLE_PCI_DMA=y + +# Force IOMMU TLB invalidation so devices will never be able to access stale +# data content. +CONFIG_IOMMU_SUPPORT=y +CONFIG_IOMMU_DEFAULT_DMA_STRICT=y + +# Do not allow direct physical memory access to non-device memory. +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y + +# Provide userspace with seccomp BPF API for syscall attack surface reduction. +CONFIG_SECCOMP=y +CONFIG_SECCOMP_FILTER=y + +# Provides some protections against SYN flooding. +CONFIG_SYN_COOKIES=y + +# Attack surface reduction: do not autoload TTY line disciplines. +# CONFIG_LDISC_AUTOLOAD is not set + +# Dangerous; enabling this disables userspace brk ASLR. +# CONFIG_COMPAT_BRK is not set + +# Dangerous; exposes kernel text image layout. +# CONFIG_PROC_KCORE is not set + +# Dangerous; enabling this disables userspace VDSO ASLR. +# CONFIG_COMPAT_VDSO is not set + +# Attack surface reduction: Use the modern PTY interface (devpts) only. +# CONFIG_LEGACY_PTYS is not set + +# Attack surface reduction: Use only modesetting video drivers. 
+# CONFIG_DRM_LEGACY is not set diff --git a/kernel/cpu.c b/kernel/cpu.c index 6de7c6bb74ee..69e92ddef5dd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -659,11 +659,19 @@ static inline bool cpu_smt_thread_allowed(unsigned int cpu) #endif } -static inline bool cpu_smt_allowed(unsigned int cpu) +static inline bool cpu_bootable(unsigned int cpu) { if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) return true; + /* All CPUs are bootable if controls are not configured */ + if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED) + return true; + + /* All CPUs are bootable if CPU is not SMT capable */ + if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) + return true; + if (topology_is_primary_thread(cpu)) return true; @@ -685,7 +693,7 @@ bool cpu_smt_possible(void) EXPORT_SYMBOL_GPL(cpu_smt_possible); #else -static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } +static inline bool cpu_bootable(unsigned int cpu) { return true; } #endif static inline enum cpuhp_state @@ -788,10 +796,10 @@ static int bringup_wait_for_ap_online(unsigned int cpu) * SMT soft disabling on X86 requires to bring the CPU out of the * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The * CPU marked itself as booted_once in notify_cpu_starting() so the - * cpu_smt_allowed() check will now return false if this is not the + * cpu_bootable() check will now return false if this is not the * primary sibling. */ - if (!cpu_smt_allowed(cpu)) + if (!cpu_bootable(cpu)) return -ECANCELED; return 0; } @@ -1372,7 +1380,14 @@ static int takedown_cpu(unsigned int cpu) cpuhp_bp_sync_dead(cpu); tick_cleanup_dead_cpu(cpu); + + /* + * Callbacks must be re-integrated right away to the RCU state machine. + * Otherwise an RCU callback could block a further teardown function + * waiting for its completion. 
+ */ rcutree_migrate_callbacks(cpu); + return 0; } @@ -1388,10 +1403,10 @@ void cpuhp_report_idle_dead(void) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); - rcu_report_dead(smp_processor_id()); + rcutree_report_cpu_dead(); st->state = CPUHP_AP_IDLE_DEAD; /* - * We cannot call complete after rcu_report_dead() so we delegate it + * We cannot call complete after rcutree_report_cpu_dead() so we delegate it * to an online cpu. */ smp_call_function_single(cpumask_first(cpu_online_mask), @@ -1515,11 +1530,14 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target) /* * Ensure that the control task does not run on the to be offlined * CPU to prevent a deadlock against cfs_b->period_timer. + * Also keep at least one housekeeping cpu onlined to avoid generating + * an empty sched_domain span. */ - cpu = cpumask_any_but(cpu_online_mask, cpu); - if (cpu >= nr_cpu_ids) - return -EBUSY; - return work_on_cpu(cpu, __cpu_down_maps_locked, &work); + for_each_cpu_and(cpu, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) { + if (cpu != work.cpu) + return work_on_cpu(cpu, __cpu_down_maps_locked, &work); + } + return -EBUSY; } static int cpu_down(unsigned int cpu, enum cpuhp_state target) @@ -1617,7 +1635,7 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ + rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. 
*/ cpumask_set_cpu(cpu, &cpus_booted_once_mask); /* @@ -1741,7 +1759,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target) err = -EBUSY; goto out; } - if (!cpu_smt_allowed(cpu)) { + if (!cpu_bootable(cpu)) { err = -EPERM; goto out; } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 03a7932cde0a..2f675ef045d4 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -740,6 +740,17 @@ subsys_initcall(crash_notes_memory_init); #define pr_fmt(fmt) "crash hp: " fmt /* + * Different than kexec/kdump loading/unloading/jumping/shrinking which + * usually rarely happen, there will be many crash hotplug events notified + * during one short period, e.g one memory board is hot added and memory + * regions are online. So mutex lock __crash_hotplug_lock is used to + * serialize the crash hotplug handling specifically. + */ +DEFINE_MUTEX(__crash_hotplug_lock); +#define crash_hotplug_lock() mutex_lock(&__crash_hotplug_lock) +#define crash_hotplug_unlock() mutex_unlock(&__crash_hotplug_lock) + +/* * This routine utilized when the crash_hotplug sysfs node is read. * It reflects the kernel's ability/permission to update the crash * elfcorehdr directly. 
@@ -748,9 +759,11 @@ int crash_check_update_elfcorehdr(void) { int rc = 0; + crash_hotplug_lock(); /* Obtain lock while reading crash information */ if (!kexec_trylock()) { pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n"); + crash_hotplug_unlock(); return 0; } if (kexec_crash_image) { @@ -761,6 +774,7 @@ int crash_check_update_elfcorehdr(void) } /* Release lock now that update complete */ kexec_unlock(); + crash_hotplug_unlock(); return rc; } @@ -783,9 +797,11 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) { struct kimage *image; + crash_hotplug_lock(); /* Obtain lock while changing crash information */ if (!kexec_trylock()) { pr_info("kexec_trylock() failed, elfcorehdr may be inaccurate\n"); + crash_hotplug_unlock(); return; } @@ -852,6 +868,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) out: /* Release lock now that update complete */ kexec_unlock(); + crash_hotplug_unlock(); } static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v) diff --git a/kernel/cred.c b/kernel/cred.c index 98cb4eca23fb..3c714cb31660 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -36,7 +36,7 @@ do { \ static struct kmem_cache *cred_jar; /* init to 2 - one for init_task, one to ensure it is never freed */ -static struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; +static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; /* * The initial credentials for the initial task @@ -162,23 +162,29 @@ EXPORT_SYMBOL(__put_cred); */ void exit_creds(struct task_struct *tsk) { - struct cred *cred; + struct cred *real_cred, *cred; kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred, atomic_read(&tsk->cred->usage), read_cred_subscribers(tsk->cred)); - cred = (struct cred *) tsk->real_cred; + real_cred = (struct cred *) tsk->real_cred; tsk->real_cred = NULL; - validate_creds(cred); - alter_cred_subscribers(cred, -1); - put_cred(cred); 
cred = (struct cred *) tsk->cred; tsk->cred = NULL; + validate_creds(cred); - alter_cred_subscribers(cred, -1); - put_cred(cred); + if (real_cred == cred) { + alter_cred_subscribers(cred, -2); + put_cred_many(cred, 2); + } else { + validate_creds(real_cred); + alter_cred_subscribers(real_cred, -1); + put_cred(real_cred); + alter_cred_subscribers(cred, -1); + put_cred(cred); + } #ifdef CONFIG_KEYS_REQUEST_CACHE key_put(tsk->cached_requested_key); @@ -355,8 +361,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) #endif clone_flags & CLONE_THREAD ) { - p->real_cred = get_cred(p->cred); - get_cred(p->cred); + p->real_cred = get_cred_many(p->cred, 2); alter_cred_subscribers(p->cred, 2); kdebug("share_creds(%p{%d,%d})", p->cred, atomic_read(&p->cred->usage), @@ -520,8 +525,7 @@ int commit_creds(struct cred *new) proc_id_connector(task, PROC_EVENT_GID); /* release the old obj and subj refs both */ - put_cred(old); - put_cred(old); + put_cred_many(old, 2); return 0; } EXPORT_SYMBOL(commit_creds); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 394494a6b1f3..dff067bd56b1 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -399,14 +399,13 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, } mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), - default_nareas), SMP_CACHE_BYTES); + nareas), SMP_CACHE_BYTES); if (!mem->areas) { pr_warn("%s: Failed to allocate mem->areas.\n", __func__); return; } - swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, - default_nareas); + swiotlb_init_io_tlb_pool(mem, __pa(tlb), nslabs, false, nareas); add_mem_pool(&io_tlb_default_mem, mem); if (flags & SWIOTLB_VERBOSE) @@ -679,6 +678,11 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev, size_t pool_size; size_t tlb_size; + if (nslabs > SLABS_PER_PAGE << MAX_ORDER) { + nslabs = SLABS_PER_PAGE << MAX_ORDER; + nareas = limit_nareas(nareas, nslabs); + } + pool_size = sizeof(*pool) + 
array_size(sizeof(*pool->areas), nareas); pool = kzalloc(pool_size, gfp); if (!pool) @@ -729,9 +733,6 @@ static void swiotlb_dyn_alloc(struct work_struct *work) } add_mem_pool(mem, pool); - - /* Pairs with smp_rmb() in is_swiotlb_buffer(). */ - smp_wmb(); } /** @@ -1152,9 +1153,26 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr, spin_unlock_irqrestore(&dev->dma_io_tlb_lock, flags); found: - dev->dma_uses_io_tlb = true; - /* Pairs with smp_rmb() in is_swiotlb_buffer() */ - smp_wmb(); + WRITE_ONCE(dev->dma_uses_io_tlb, true); + + /* + * The general barrier orders reads and writes against a presumed store + * of the SWIOTLB buffer address by a device driver (to a driver private + * data structure). It serves two purposes. + * + * First, the store to dev->dma_uses_io_tlb must be ordered before the + * presumed store. This guarantees that the returned buffer address + * cannot be passed to another CPU before updating dev->dma_uses_io_tlb. + * + * Second, the load from mem->pools must be ordered before the same + * presumed store. This guarantees that the returned buffer address + * cannot be observed by another CPU before an update of the RCU list + * that was made by swiotlb_dyn_alloc() on a third CPU (cf. multicopy + * atomicity). + * + * See also the comment in is_swiotlb_buffer(). 
+ */ + smp_mb(); *retpool = pool; return index; diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c72a41f11af..683dc086ef10 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -375,6 +375,7 @@ enum event_type_t { EVENT_TIME = 0x4, /* see ctx_resched() for details */ EVENT_CPU = 0x8, + EVENT_CGROUP = 0x10, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -449,8 +450,8 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); -int perf_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +int perf_event_max_sample_rate_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -684,20 +685,26 @@ do { \ ___p; \ }) -static void perf_ctx_disable(struct perf_event_context *ctx) +static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; perf_pmu_disable(pmu_ctx->pmu); + } } -static void perf_ctx_enable(struct perf_event_context *ctx) +static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; perf_pmu_enable(pmu_ctx->pmu); + } } static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); @@ -856,9 +863,9 @@ static void perf_cgroup_switch(struct task_struct *task) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_ctx_disable(&cpuctx->ctx); + perf_ctx_disable(&cpuctx->ctx, true); - ctx_sched_out(&cpuctx->ctx, EVENT_ALL); + ctx_sched_out(&cpuctx->ctx, 
EVENT_ALL|EVENT_CGROUP); /* * must not be done before ctxswout due * to update_cgrp_time_from_cpuctx() in @@ -870,9 +877,9 @@ static void perf_cgroup_switch(struct task_struct *task) * perf_cgroup_set_timestamp() in ctx_sched_in() * to not have to pass task around */ - ctx_sched_in(&cpuctx->ctx, EVENT_ALL); + ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx); + perf_ctx_enable(&cpuctx->ctx, true); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } @@ -965,6 +972,8 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct if (!is_cgroup_event(event)) return; + event->pmu_ctx->nr_cgroups++; + /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. @@ -985,6 +994,8 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c if (!is_cgroup_event(event)) return; + event->pmu_ctx->nr_cgroups--; + /* * Because cgroup events are always per-cpu events, * @ctx == &cpuctx->ctx. @@ -1954,6 +1965,7 @@ static void perf_group_attach(struct perf_event *event) list_add_tail(&event->sibling_list, &group_leader->sibling_list); group_leader->nr_siblings++; + group_leader->group_generation++; perf_event__header_size(group_leader); @@ -2144,6 +2156,7 @@ static void perf_group_detach(struct perf_event *event) if (leader != event) { list_del_init(&event->sibling_list); event->group_leader->nr_siblings--; + event->group_leader->group_generation++; goto out; } @@ -2677,9 +2690,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - perf_ctx_disable(&cpuctx->ctx); + perf_ctx_disable(&cpuctx->ctx, false); if (task_ctx) { - perf_ctx_disable(task_ctx); + perf_ctx_disable(task_ctx, false); task_ctx_sched_out(task_ctx, event_type); } @@ -2697,9 +2710,9 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, perf_event_sched_in(cpuctx, task_ctx); - perf_ctx_enable(&cpuctx->ctx); + perf_ctx_enable(&cpuctx->ctx, false); if (task_ctx) - perf_ctx_enable(task_ctx); + 
perf_ctx_enable(task_ctx, false); } void perf_pmu_resched(struct pmu *pmu) @@ -3244,6 +3257,9 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3290,8 +3306,11 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) is_active ^= ctx->is_active; /* changed bits */ - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; __pmu_ctx_sched_out(pmu_ctx, is_active); + } } /* @@ -3482,7 +3501,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); /* PMIs are disabled; ctx->nr_pending is stable. 
*/ if (local_read(&ctx->nr_pending) || @@ -3502,7 +3521,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, false); perf_event_swap_task_ctx_data(ctx, next_ctx); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); /* * RCU_INIT_POINTER here is safe because we've not @@ -3526,13 +3545,13 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); inside_switch: perf_ctx_sched_task_cb(ctx, false); task_ctx_sched_out(ctx, EVENT_ALL); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); raw_spin_unlock(&ctx->lock); } } @@ -3818,47 +3837,32 @@ static int merge_sched_in(struct perf_event *event, void *data) return 0; } -static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +static void pmu_groups_sched_in(struct perf_event_context *ctx, + struct perf_event_groups *groups, + struct pmu *pmu) { - struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; - - if (pmu) { - visit_groups_merge(ctx, &ctx->pinned_groups, - smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); - } else { - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - can_add_hw = 1; - visit_groups_merge(ctx, &ctx->pinned_groups, - smp_processor_id(), pmu_ctx->pmu, - merge_sched_in, &can_add_hw); - } - } + visit_groups_merge(ctx, groups, smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); } -static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +static void ctx_groups_sched_in(struct perf_event_context *ctx, + struct perf_event_groups *groups, + bool cgroup) { struct perf_event_pmu_context *pmu_ctx; - int can_add_hw = 1; - if (pmu) { - visit_groups_merge(ctx, &ctx->flexible_groups, - smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); - } else { - list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { - can_add_hw = 1; - visit_groups_merge(ctx, &ctx->flexible_groups, - smp_processor_id(), 
pmu_ctx->pmu, - merge_sched_in, &can_add_hw); - } + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (cgroup && !pmu_ctx->nr_cgroups) + continue; + pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu); } } -static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +static void __pmu_ctx_sched_in(struct perf_event_context *ctx, + struct pmu *pmu) { - ctx_flexible_sched_in(ctx, pmu); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu); } static void @@ -3866,6 +3870,9 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); int is_active = ctx->is_active; + bool cgroup = event_type & EVENT_CGROUP; + + event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3898,11 +3905,11 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, NULL); + ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, NULL); + ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup); } static void perf_event_context_sched_in(struct task_struct *task) @@ -3917,11 +3924,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); perf_ctx_sched_task_cb(ctx, true); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -3934,7 +3941,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx); + perf_ctx_disable(ctx, false); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3944,7 +3951,7 @@ static void 
perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx); + perf_ctx_disable(&cpuctx->ctx, false); ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); } @@ -3953,9 +3960,9 @@ static void perf_event_context_sched_in(struct task_struct *task) perf_ctx_sched_task_cb(cpuctx->task_ctx, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx); + perf_ctx_enable(&cpuctx->ctx, false); - perf_ctx_enable(ctx); + perf_ctx_enable(ctx, false); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -4425,6 +4432,9 @@ static int __perf_event_read_cpu(struct perf_event *event, int event_cpu) { u16 local_pkg, event_pkg; + if ((unsigned)event_cpu >= nr_cpu_ids) + return event_cpu; + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { int local_cpu = smp_processor_id(); @@ -4527,6 +4537,8 @@ int perf_event_read_local(struct perf_event *event, u64 *value, u64 *enabled, u64 *running) { unsigned long flags; + int event_oncpu; + int event_cpu; int ret = 0; /* @@ -4551,15 +4563,22 @@ int perf_event_read_local(struct perf_event *event, u64 *value, goto out; } + /* + * Get the event CPU numbers, and adjust them to local if the event is + * a per-package event that can be read locally + */ + event_oncpu = __perf_event_read_cpu(event, event->oncpu); + event_cpu = __perf_event_read_cpu(event, event->cpu); + /* If this is a per-CPU event, it must be for this CPU */ if (!(event->attach_state & PERF_ATTACH_TASK) && - event->cpu != smp_processor_id()) { + event_cpu != smp_processor_id()) { ret = -EINVAL; goto out; } /* If this is a pinned event it must be running on this CPU */ - if (event->attr.pinned && event->oncpu != smp_processor_id()) { + if (event->attr.pinned && event_oncpu != smp_processor_id()) { ret = -EBUSY; goto out; } @@ -4569,7 +4588,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value, * or local to this CPU. 
Furthermore it means it's ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event_oncpu == smp_processor_id()) event->pmu->read(event); *value = local64_read(&event->count); @@ -5440,7 +5459,7 @@ static int __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) { struct perf_event_context *ctx = leader->ctx; - struct perf_event *sub; + struct perf_event *sub, *parent; unsigned long flags; int n = 1; /* skip @nr */ int ret; @@ -5450,6 +5469,33 @@ static int __perf_read_group_add(struct perf_event *leader, return ret; raw_spin_lock_irqsave(&ctx->lock, flags); + /* + * Verify the grouping between the parent and child (inherited) + * events is still intact. + * + * Specifically: + * - leader->ctx->lock pins leader->sibling_list + * - parent->child_mutex pins parent->child_list + * - parent->ctx->mutex pins parent->sibling_list + * + * Because parent->ctx != leader->ctx (and child_list nests inside + * ctx->mutex), group destruction is not atomic between children, also + * see perf_event_release_kernel(). Additionally, parent can grow the + * group. + * + * Therefore it is possible to have parent and child groups in a + * different configuration and summing over such a beast makes no sense + * whatsoever. + * + * Reject this. 
+ */ + parent = leader->parent; + if (parent && + (parent->group_generation != leader->group_generation || + parent->nr_siblings != leader->nr_siblings)) { + ret = -ECHILD; + goto unlock; + } /* * Since we co-schedule groups, {enabled,running} times of siblings @@ -5483,8 +5529,9 @@ static int __perf_read_group_add(struct perf_event *leader, values[n++] = atomic64_read(&sub->lost_samples); } +unlock: raw_spin_unlock_irqrestore(&ctx->lock, flags); - return 0; + return ret; } static int perf_read_group(struct perf_event *event, @@ -5503,10 +5550,6 @@ static int perf_read_group(struct perf_event *event, values[0] = 1 + leader->nr_siblings; - /* - * By locking the child_mutex of the leader we effectively - * lock the child list of all siblings.. XXX explain how. - */ mutex_lock(&leader->child_mutex); ret = __perf_read_group_add(leader, read_format, values); @@ -13346,6 +13389,8 @@ static int inherit_group(struct perf_event *parent_event, !perf_get_aux_event(child_ctr, leader)) return -EINVAL; } + if (leader) + leader->group_generation = parent_event->group_generation; return 0; } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index fb1e180b5f0a..e8d82c2f07d0 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -700,6 +700,12 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, watermark = 0; } + /* + * kcalloc_node() is unable to allocate buffer if the size is larger + * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case. 
+ */ + if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER) + return -ENOMEM; rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, node); if (!rb->aux_pages) diff --git a/kernel/fork.c b/kernel/fork.c index 3b6d20dfb9a8..640123767726 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1492,9 +1492,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm) struct file *exe_file; rcu_read_lock(); - exe_file = rcu_dereference(mm->exe_file); - if (exe_file && !get_file_rcu(exe_file)) - exe_file = NULL; + exe_file = get_file_rcu(&mm->exe_file); rcu_read_unlock(); return exe_file; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 4fad0e6fca64..c450fa8b8b5e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -71,7 +71,11 @@ bool __refrigerator(bool check_kthr_stop) for (;;) { bool freeze; + raw_spin_lock_irq(&current->pi_lock); set_current_state(TASK_FROZEN); + /* unstale saved_state so that __thaw_task() will wake us up */ + current->saved_state = TASK_RUNNING; + raw_spin_unlock_irq(&current->pi_lock); spin_lock_irq(&freezer_lock); freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); @@ -129,6 +133,7 @@ static int __set_task_frozen(struct task_struct *p, void *arg) WARN_ON_ONCE(debug_locks && p->lockdep_depth); #endif + p->saved_state = p->__state; WRITE_ONCE(p->__state, TASK_FROZEN); return TASK_FROZEN; } @@ -170,42 +175,34 @@ bool freeze_task(struct task_struct *p) } /* - * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical - * state in p->jobctl. If either of them got a wakeup that was missed because - * TASK_FROZEN, then their canonical state reflects that and the below will - * refuse to restore the special state and instead issue the wakeup. + * Restore the saved_state before the task entered freezer. For typical task + * in the __refrigerator(), saved_state == TASK_RUNNING so nothing happens + * here. 
For tasks which were TASK_NORMAL | TASK_FREEZABLE, their initial state + * is restored unless they got an expected wakeup (see ttwu_state_match()). + * Returns 1 if the task state was restored. */ -static int __set_task_special(struct task_struct *p, void *arg) +static int __restore_freezer_state(struct task_struct *p, void *arg) { - unsigned int state = 0; + unsigned int state = p->saved_state; - if (p->jobctl & JOBCTL_TRACED) - state = TASK_TRACED; - - else if (p->jobctl & JOBCTL_STOPPED) - state = TASK_STOPPED; - - if (state) + if (state != TASK_RUNNING) { WRITE_ONCE(p->__state, state); + return 1; + } - return state; + return 0; } void __thaw_task(struct task_struct *p) { - unsigned long flags, flags2; + unsigned long flags; spin_lock_irqsave(&freezer_lock, flags); if (WARN_ON_ONCE(freezing(p))) goto unlock; - if (lock_task_sighand(p, &flags2)) { - /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */ - bool ret = task_call_func(p, __set_task_special, NULL); - unlock_task_sighand(p, &flags2); - if (ret) - goto unlock; - } + if (task_call_func(p, __restore_freezer_state, NULL)) + goto unlock; wake_up_state(p, TASK_FROZEN); unlock: diff --git a/kernel/futex/core.c b/kernel/futex/core.c index f10587d1d481..52695c59d041 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -193,7 +193,7 @@ static u64 get_inode_sequence_number(struct inode *inode) /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex - * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED + * @flags: FLAGS_* * @key: address where result is stored. * @rw: mapping needs to be read/write (values: FUTEX_READ, * FUTEX_WRITE) @@ -217,14 +217,18 @@ static u64 get_inode_sequence_number(struct inode *inode) * * lock_page() might sleep, the caller should not hold a spinlock. 
*/ -int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, +int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page, *tail; + struct page *page; + struct folio *folio; struct address_space *mapping; int err, ro = 0; + bool fshared; + + fshared = flags & FLAGS_SHARED; /* * The futex address must be "naturally" aligned. @@ -248,7 +252,17 @@ int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, * but access_ok() should be faster than find_vma() */ if (!fshared) { - key->private.mm = mm; + /* + * On no-MMU, shared futexes are treated as private, therefore + * we must not include the current process in the key. Since + * there is only one address space, the address is a unique key + * on its own. + */ + if (IS_ENABLED(CONFIG_MMU)) + key->private.mm = mm; + else + key->private.mm = NULL; + key->private.address = address; return 0; } @@ -273,54 +287,52 @@ again: err = 0; /* - * The treatment of mapping from this point on is critical. The page - * lock protects many things but in this context the page lock + * The treatment of mapping from this point on is critical. The folio + * lock protects many things but in this context the folio lock * stabilizes mapping, prevents inode freeing in the shared * file-backed region case and guards against movement to swap cache. * - * Strictly speaking the page lock is not needed in all cases being - * considered here and page lock forces unnecessarily serialization + * Strictly speaking the folio lock is not needed in all cases being + * considered here and folio lock forces unnecessarily serialization. 
* From this point on, mapping will be re-verified if necessary and - * page lock will be acquired only if it is unavoidable + * folio lock will be acquired only if it is unavoidable * - * Mapping checks require the head page for any compound page so the - * head page and mapping is looked up now. For anonymous pages, it - * does not matter if the page splits in the future as the key is - * based on the address. For filesystem-backed pages, the tail is - * required as the index of the page determines the key. For - * base pages, there is no tail page and tail == page. + * Mapping checks require the folio so it is looked up now. For + * anonymous pages, it does not matter if the folio is split + * in the future as the key is based on the address. For + * filesystem-backed pages, the precise page is required as the + * index of the page determines the key. */ - tail = page; - page = compound_head(page); - mapping = READ_ONCE(page->mapping); + folio = page_folio(page); + mapping = READ_ONCE(folio->mapping); /* - * If page->mapping is NULL, then it cannot be a PageAnon + * If folio->mapping is NULL, then it cannot be an anonymous * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast * found it, but truncated or holepunched or subjected to - * invalidate_complete_page2 before we got the page lock (also + * invalidate_complete_page2 before we got the folio lock (also * cases which we are happy to fail). And we hold a reference, * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page->mapping. + * an unlikely race, but we do need to retry for folio->mapping. 
*/ if (unlikely(!mapping)) { int shmem_swizzled; /* - * Page lock is required to identify which special case above - * applies. If this is really a shmem page then the page lock + * Folio lock is required to identify which special case above + * applies. If this is really a shmem page then the folio lock * will prevent unexpected transitions. */ - lock_page(page); - shmem_swizzled = PageSwapCache(page) || page->mapping; - unlock_page(page); - put_page(page); + folio_lock(folio); + shmem_swizzled = folio_test_swapcache(folio) || folio->mapping; + folio_unlock(folio); + folio_put(folio); if (shmem_swizzled) goto again; @@ -331,14 +343,14 @@ again: /* * Private mappings are handled in a simple way. * - * If the futex key is stored on an anonymous page, then the associated + * If the futex key is stored in anonymous memory, then the associated * object is the mm which is implicitly pinned by the calling process. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (folio_test_anon(folio)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. @@ -357,10 +369,10 @@ again: /* * The associated futex object in this case is the inode and - * the page->mapping must be traversed. Ordinarily this should - * be stabilised under page lock but it's not strictly + * the folio->mapping must be traversed. Ordinarily this should + * be stabilised under folio lock but it's not strictly * necessary in this case as we just want to pin the inode, not - * update the radix tree or anything like that. + * update i_pages or anything like that. * * The RCU read lock is taken as the inode is finally freed * under RCU. 
If the mapping still matches expectations then the @@ -368,9 +380,9 @@ again: */ rcu_read_lock(); - if (READ_ONCE(page->mapping) != mapping) { + if (READ_ONCE(folio->mapping) != mapping) { rcu_read_unlock(); - put_page(page); + folio_put(folio); goto again; } @@ -378,19 +390,19 @@ again: inode = READ_ONCE(mapping->host); if (!inode) { rcu_read_unlock(); - put_page(page); + folio_put(folio); goto again; } key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); - key->shared.pgoff = page_to_pgoff(tail); + key->shared.pgoff = folio->index + folio_page_idx(folio, page); rcu_read_unlock(); } out: - put_page(page); + folio_put(folio); return err; } diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index b5379c0e6d6d..a06030a1a27b 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -5,6 +5,7 @@ #include <linux/futex.h> #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> +#include <linux/compat.h> #ifdef CONFIG_PREEMPT_RT #include <linux/rcuwait.h> @@ -16,17 +17,84 @@ * Futex flags used to encode options to functions and preserve them across * restarts. */ +#define FLAGS_SIZE_8 0x0000 +#define FLAGS_SIZE_16 0x0001 +#define FLAGS_SIZE_32 0x0002 +#define FLAGS_SIZE_64 0x0003 + +#define FLAGS_SIZE_MASK 0x0003 + #ifdef CONFIG_MMU -# define FLAGS_SHARED 0x01 +# define FLAGS_SHARED 0x0010 #else /* * NOMMU does not have per process address space. Let the compiler optimize * code away. 
*/ -# define FLAGS_SHARED 0x00 +# define FLAGS_SHARED 0x0000 #endif -#define FLAGS_CLOCKRT 0x02 -#define FLAGS_HAS_TIMEOUT 0x04 +#define FLAGS_CLOCKRT 0x0020 +#define FLAGS_HAS_TIMEOUT 0x0040 +#define FLAGS_NUMA 0x0080 +#define FLAGS_STRICT 0x0100 + +/* FUTEX_ to FLAGS_ */ +static inline unsigned int futex_to_flags(unsigned int op) +{ + unsigned int flags = FLAGS_SIZE_32; + + if (!(op & FUTEX_PRIVATE_FLAG)) + flags |= FLAGS_SHARED; + + if (op & FUTEX_CLOCK_REALTIME) + flags |= FLAGS_CLOCKRT; + + return flags; +} + +/* FUTEX2_ to FLAGS_ */ +static inline unsigned int futex2_to_flags(unsigned int flags2) +{ + unsigned int flags = flags2 & FUTEX2_SIZE_MASK; + + if (!(flags2 & FUTEX2_PRIVATE)) + flags |= FLAGS_SHARED; + + if (flags2 & FUTEX2_NUMA) + flags |= FLAGS_NUMA; + + return flags; +} + +static inline unsigned int futex_size(unsigned int flags) +{ + return 1 << (flags & FLAGS_SIZE_MASK); +} + +static inline bool futex_flags_valid(unsigned int flags) +{ + /* Only 64bit futexes for 64bit code */ + if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) { + if ((flags & FLAGS_SIZE_MASK) == FLAGS_SIZE_64) + return false; + } + + /* Only 32bit futexes are implemented -- for now */ + if ((flags & FLAGS_SIZE_MASK) != FLAGS_SIZE_32) + return false; + + return true; +} + +static inline bool futex_validate_input(unsigned int flags, u64 val) +{ + int bits = 8 * futex_size(flags); + + if (bits < 64 && (val >> bits)) + return false; + + return true; +} #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); @@ -116,7 +184,7 @@ enum futex_access { FUTEX_WRITE }; -extern int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, +extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, enum futex_access rw); extern struct hrtimer_sleeper * @@ -260,10 +328,14 @@ extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2); -extern int futex_requeue(u32 
__user *uaddr1, unsigned int flags, - u32 __user *uaddr2, int nr_wake, int nr_requeue, +extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1, + u32 __user *uaddr2, unsigned int flags2, + int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi); +extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + struct hrtimer_sleeper *to, u32 bitset); + extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset); diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c index ce2889f12375..90e5197f4e56 100644 --- a/kernel/futex/pi.c +++ b/kernel/futex/pi.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/slab.h> +#include <linux/sched/rt.h> #include <linux/sched/task.h> #include "futex.h" @@ -610,29 +611,16 @@ int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, /* * Caller must hold a reference on @pi_state. */ -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) +static int wake_futex_pi(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, + struct rt_mutex_waiter *top_waiter) { - struct rt_mutex_waiter *top_waiter; struct task_struct *new_owner; bool postunlock = false; DEFINE_RT_WAKE_Q(wqh); u32 curval, newval; int ret = 0; - top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); - if (WARN_ON_ONCE(!top_waiter)) { - /* - * As per the comment in futex_unlock_pi() this should not happen. - * - * When this happens, give up our locks and try again, giving - * the futex_lock_pi() instance time to complete, either by - * waiting on the rtmutex or removing itself from the futex - * queue. 
- */ - ret = -EAGAIN; - goto out_unlock; - } - new_owner = top_waiter->task; /* @@ -945,7 +933,7 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl to = futex_setup_timer(time, &timeout, flags, 0); retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); + ret = get_futex_key(uaddr, flags, &q.key, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; @@ -1002,6 +990,12 @@ retry_private: goto no_block; } + /* + * Must be done before we enqueue the waiter, here is unfortunately + * under the hb lock, but that *should* work because it does nothing. + */ + rt_mutex_pre_schedule(); + rt_mutex_init_waiter(&rt_waiter); /* @@ -1039,19 +1033,37 @@ retry_private: ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); cleanup: - spin_lock(q.lock_ptr); /* * If we failed to acquire the lock (deadlock/signal/timeout), we must - * first acquire the hb->lock before removing the lock from the - * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait - * lists consistent. + * unwind the above, however we cannot lock hb->lock because + * rt_mutex already has a waiter enqueued and hb->lock can itself try + * and enqueue an rt_waiter through rtlock. + * + * Doing the cleanup without holding hb->lock can cause inconsistent + * state between hb and pi_state, but only in the direction of not + * seeing a waiter that is leaving. + * + * See futex_unlock_pi(), it deals with this inconsistency. + * + * There be dragons here, since we must deal with the inconsistency on + * the way out (here), it is impossible to detect/warn about the race + * the other way around (missing an incoming waiter). * - * In particular; it is important that futex_unlock_pi() can not - * observe this inconsistency. + * What could possibly go wrong...
*/ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) ret = 0; + /* + * Now that the rt_waiter has been dequeued, it is safe to use + * spinlock/rtlock (which might enqueue its own rt_waiter) and fix up + * the + */ + spin_lock(q.lock_ptr); + /* + * Waiter is unqueued. + */ + rt_mutex_post_schedule(); no_block: /* * Fixup the pi_state owner and possibly acquire the lock if we @@ -1117,7 +1129,7 @@ retry: if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); + ret = get_futex_key(uaddr, flags, &key, FUTEX_WRITE); if (ret) return ret; @@ -1132,6 +1144,7 @@ retry: top_waiter = futex_top_waiter(hb, &key); if (top_waiter) { struct futex_pi_state *pi_state = top_waiter->pi_state; + struct rt_mutex_waiter *rt_waiter; ret = -EINVAL; if (!pi_state) @@ -1144,22 +1157,39 @@ retry: if (pi_state->owner != current) goto out_unlock; - get_pi_state(pi_state); /* * By taking wait_lock while still holding hb->lock, we ensure - * there is no point where we hold neither; and therefore - * wake_futex_p() must observe a state consistent with what we - * observed. + * there is no point where we hold neither; and thereby + * wake_futex_pi() must observe any new waiters. + * + * Since the cleanup: case in futex_lock_pi() removes the + * rt_waiter without holding hb->lock, it is possible for + * wake_futex_pi() to not find a waiter while the above does, + * in this case the waiter is on the way out and it can be + * ignored. * * In particular; this forces __rt_mutex_start_proxy() to * complete such that we're guaranteed to observe the - * rt_waiter. Also see the WARN in wake_futex_pi(). + * rt_waiter. */ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Futex vs rt_mutex waiter state -- if there are no rt_mutex + * waiters even though futex thinks there are, then the waiter + * is leaving and the uncontended path is safe to take. 
+ */ + rt_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex); + if (!rt_waiter) { + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + goto do_uncontended; + } + + get_pi_state(pi_state); spin_unlock(&hb->lock); /* drops pi_state->pi_mutex.wait_lock */ - ret = wake_futex_pi(uaddr, uval, pi_state); + ret = wake_futex_pi(uaddr, uval, pi_state, rt_waiter); put_pi_state(pi_state); @@ -1187,6 +1217,7 @@ retry: return ret; } +do_uncontended: /* * We have no kernel internal state, i.e. no waiters in the * kernel. Waiters which are about to queue themselves are stuck diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index cba8b1a6a4cc..16a3645bd786 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -269,7 +269,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, union futex_key *key2, struct futex_pi_state **ps, struct task_struct **exiting, int set_waiters) { - struct futex_q *top_waiter = NULL; + struct futex_q *top_waiter; u32 curval; int ret; @@ -346,8 +346,9 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 * @uaddr1: source futex user address - * @flags: futex flags (FLAGS_SHARED, etc.) + * @flags1: futex flags (FLAGS_SHARED, etc.) * @uaddr2: target futex user address + * @flags2: futex flags (FLAGS_SHARED, etc.) 
* @nr_wake: number of waiters to wake (must be 1 for requeue_pi) * @nr_requeue: number of waiters to requeue (0-INT_MAX) * @cmpval: @uaddr1 expected value (or %NULL) @@ -361,7 +362,8 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * - >=0 - on success, the number of tasks requeued or woken; * - <0 - on error */ -int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, +int futex_requeue(u32 __user *uaddr1, unsigned int flags1, + u32 __user *uaddr2, unsigned int flags2, int nr_wake, int nr_requeue, u32 *cmpval, int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -424,10 +426,10 @@ int futex_requeue(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, } retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + ret = get_futex_key(uaddr1, flags1, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, + ret = get_futex_key(uaddr2, flags2, &key2, requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) return ret; @@ -459,7 +461,7 @@ retry_private: if (ret) return ret; - if (!(flags & FLAGS_SHARED)) + if (!(flags1 & FLAGS_SHARED)) goto retry_private; goto retry; @@ -789,7 +791,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ rt_mutex_init_waiter(&rt_waiter); - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; @@ -850,11 +852,13 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); - /* Current is not longer pi_blocked_on */ - spin_lock(q.lock_ptr); + /* + * See futex_lock_pi()'s cleanup: comment.
+ */ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; + spin_lock(q.lock_ptr); debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index a8074079b09e..8200d86d30e1 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -#include <linux/compat.h> #include <linux/syscalls.h> #include <linux/time_namespace.h> @@ -85,15 +84,12 @@ err_unlock: long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { + unsigned int flags = futex_to_flags(op); int cmd = op & FUTEX_CMD_MASK; - unsigned int flags = 0; - if (!(op & FUTEX_PRIVATE_FLAG)) - flags |= FLAGS_SHARED; - - if (op & FUTEX_CLOCK_REALTIME) { - flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && + if (flags & FLAGS_CLOCKRT) { + if (cmd != FUTEX_WAIT_BITSET && + cmd != FUTEX_WAIT_REQUEUE_PI && cmd != FUTEX_LOCK_PI2) return -ENOSYS; } @@ -110,9 +106,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: - return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); + return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); + return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0); case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: @@ -129,7 +125,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, uaddr2); case FUTEX_CMP_REQUEUE_PI: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); + return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1); 
} return -ENOSYS; } @@ -183,8 +179,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } -/* Mask of available flags for each futex in futex_waitv list */ -#define FUTEXV_WAITER_MASK (FUTEX_32 | FUTEX_PRIVATE_FLAG) +#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE) /** * futex_parse_waitv - Parse a waitv array from userspace @@ -202,16 +197,22 @@ static int futex_parse_waitv(struct futex_vector *futexv, unsigned int i; for (i = 0; i < nr_futexes; i++) { + unsigned int flags; + if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) return -EFAULT; - if ((aux.flags & ~FUTEXV_WAITER_MASK) || aux.__reserved) + if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) return -EINVAL; - if (!(aux.flags & FUTEX_32)) + flags = futex2_to_flags(aux.flags); + if (!futex_flags_valid(flags)) return -EINVAL; - futexv[i].w.flags = aux.flags; + if (!futex_validate_input(flags, aux.val)) + return -EINVAL; + + futexv[i].w.flags = flags; futexv[i].w.val = aux.val; futexv[i].w.uaddr = aux.uaddr; futexv[i].q = futex_q_init; @@ -220,6 +221,46 @@ static int futex_parse_waitv(struct futex_vector *futexv, return 0; } +static int futex2_setup_timeout(struct __kernel_timespec __user *timeout, + clockid_t clockid, struct hrtimer_sleeper *to) +{ + int flag_clkid = 0, flag_init = 0; + struct timespec64 ts; + ktime_t time; + int ret; + + if (!timeout) + return 0; + + if (clockid == CLOCK_REALTIME) { + flag_clkid = FLAGS_CLOCKRT; + flag_init = FUTEX_CLOCK_REALTIME; + } + + if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) + return -EINVAL; + + if (get_timespec64(&ts, timeout)) + return -EFAULT; + + /* + * Since there's no opcode for futex_waitv, use + * FUTEX_WAIT_BITSET that uses absolute timeout as well + */ + ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); + if (ret) + return ret; + + futex_setup_timer(&time, to, flag_clkid, 0); + return 0; +} + +static inline void 
futex2_destroy_timeout(struct hrtimer_sleeper *to) +{ + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); +} + /** * sys_futex_waitv - Wait on a list of futexes * @waiters: List of futexes to wait on @@ -249,8 +290,6 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, { struct hrtimer_sleeper to; struct futex_vector *futexv; - struct timespec64 ts; - ktime_t time; int ret; /* This syscall supports no flags for now */ @@ -260,30 +299,8 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) return -EINVAL; - if (timeout) { - int flag_clkid = 0, flag_init = 0; - - if (clockid == CLOCK_REALTIME) { - flag_clkid = FLAGS_CLOCKRT; - flag_init = FUTEX_CLOCK_REALTIME; - } - - if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) - return -EINVAL; - - if (get_timespec64(&ts, timeout)) - return -EFAULT; - - /* - * Since there's no opcode for futex_waitv, use - * FUTEX_WAIT_BITSET that uses absolute timeout as well - */ - ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); - if (ret) - return ret; - - futex_setup_timer(&time, &to, flag_clkid, 0); - } + if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) + return ret; futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); if (!futexv) { @@ -298,13 +315,125 @@ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, kfree(futexv); destroy_timer: - if (timeout) { - hrtimer_cancel(&to.timer); - destroy_hrtimer_on_stack(&to.timer); - } + if (timeout) + futex2_destroy_timeout(&to); + return ret; +} + +/* + * sys_futex_wake - Wake a number of futexes + * @uaddr: Address of the futex(es) to wake + * @mask: bitmask + * @nr: Number of the futexes to wake + * @flags: FUTEX2 flags + * + * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the + * futex2 family of calls. 
+ */ + +SYSCALL_DEFINE4(futex_wake, + void __user *, uaddr, + unsigned long, mask, + int, nr, + unsigned int, flags) +{ + if (flags & ~FUTEX2_VALID_MASK) + return -EINVAL; + + flags = futex2_to_flags(flags); + if (!futex_flags_valid(flags)) + return -EINVAL; + + if (!futex_validate_input(flags, mask)) + return -EINVAL; + + return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); +} + +/* + * sys_futex_wait - Wait on a futex + * @uaddr: Address of the futex to wait on + * @val: Value of @uaddr + * @mask: bitmask + * @flags: FUTEX2 flags + * @timeout: Optional absolute timeout + * @clockid: Clock to be used for the timeout, realtime or monotonic + * + * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the + * futex2 family of calls. + */ + +SYSCALL_DEFINE6(futex_wait, + void __user *, uaddr, + unsigned long, val, + unsigned long, mask, + unsigned int, flags, + struct __kernel_timespec __user *, timeout, + clockid_t, clockid) +{ + struct hrtimer_sleeper to; + int ret; + + if (flags & ~FUTEX2_VALID_MASK) + return -EINVAL; + + flags = futex2_to_flags(flags); + if (!futex_flags_valid(flags)) + return -EINVAL; + + if (!futex_validate_input(flags, val) || + !futex_validate_input(flags, mask)) + return -EINVAL; + + if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) + return ret; + + ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask); + + if (timeout) + futex2_destroy_timeout(&to); + return ret; } +/* + * sys_futex_requeue - Requeue a waiter from one futex to another + * @waiters: array describing the source and destination futex + * @flags: unused + * @nr_wake: number of futexes to wake + * @nr_requeue: number of futexes to requeue + * + * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the + * futex2 family of calls.
+ */ + +SYSCALL_DEFINE4(futex_requeue, + struct futex_waitv __user *, waiters, + unsigned int, flags, + int, nr_wake, + int, nr_requeue) +{ + struct futex_vector futexes[2]; + u32 cmpval; + int ret; + + if (flags) + return -EINVAL; + + if (!waiters) + return -EINVAL; + + ret = futex_parse_waitv(futexes, waiters, 2); + if (ret) + return ret; + + cmpval = futexes[0].w.val; + + return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, + u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags, + nr_wake, nr_requeue, &cmpval, 0); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index ba01b9408203..37860f794bf7 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -145,16 +145,19 @@ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; - int ret; DEFINE_WAKE_Q(wake_q); + int ret; if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); + ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; + if ((flags & FLAGS_STRICT) && !nr_wake) + return 0; + hb = futex_hash(&key); /* Make sure we really have tasks to wakeup */ @@ -245,10 +248,10 @@ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, DEFINE_WAKE_Q(wake_q); retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); + ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; @@ -419,11 +422,11 @@ static int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *wo */ retry: for (i = 0; i < count; i++) { - if 
((vs[i].w.flags & FUTEX_PRIVATE_FLAG) && retry) + if (!(vs[i].w.flags & FLAGS_SHARED) && retry) continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), - !(vs[i].w.flags & FUTEX_PRIVATE_FLAG), + vs[i].w.flags, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) @@ -435,7 +438,7 @@ retry: for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; struct futex_q *q = &vs[i].q; - u32 val = (u32)vs[i].w.val; + u32 val = vs[i].w.val; hb = futex_q_lock(q); ret = futex_get_value_locked(&uval, uaddr); @@ -599,7 +602,7 @@ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * while the syscall executes. */ retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); + ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; @@ -629,20 +632,18 @@ retry_private: return ret; } -int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) +int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + struct hrtimer_sleeper *to, u32 bitset) { - struct hrtimer_sleeper timeout, *to; - struct restart_block *restart; - struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; + struct futex_hash_bucket *hb; int ret; if (!bitset) return -EINVAL; + q.bitset = bitset; - to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q @@ -650,18 +651,17 @@ retry: */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) - goto out; + return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_wait_queue(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. 
*/ - ret = 0; if (!futex_unqueue(&q)) - goto out; - ret = -ETIMEDOUT; + return 0; + if (to && !to->task) - goto out; + return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the @@ -670,24 +670,38 @@ retry: if (!signal_pending(current)) goto retry; - ret = -ERESTARTSYS; - if (!abs_time) - goto out; + return -ERESTARTSYS; +} - restart = &current->restart_block; - restart->futex.uaddr = uaddr; - restart->futex.val = val; - restart->futex.time = *abs_time; - restart->futex.bitset = bitset; - restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; +int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) +{ + struct hrtimer_sleeper timeout, *to; + struct restart_block *restart; + int ret; - ret = set_restart_fn(restart, futex_wait_restart); + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); -out: - if (to) { - hrtimer_cancel(&to->timer); - destroy_hrtimer_on_stack(&to->timer); + ret = __futex_wait(uaddr, flags, val, to, bitset); + + /* No timeout, nothing to clean up.
*/ + if (!to) + return ret; + + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + + if (ret == -ERESTARTSYS) { + restart = &current->restart_block; + restart->futex.uaddr = uaddr; + restart->futex.val = val; + restart->futex.time = *abs_time; + restart->futex.bitset = bitset; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; + + return set_restart_fn(restart, futex_wait_restart); } + return ret; } diff --git a/kernel/groups.c b/kernel/groups.c index 9aaed2a31073..9b43da22647d 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -19,7 +19,7 @@ struct group_info *groups_alloc(int gidsetsize) if (!gi) return NULL; - atomic_set(&gi->usage, 1); + refcount_set(&gi->usage, 1); gi->ngroups = gidsetsize; return gi; } diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 5971a66be034..aae0402507ed 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -121,7 +121,6 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), BIT_MASK_DESCR(IRQD_CAN_RESERVE), - BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c653cd31548d..d39a40bc542b 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -219,11 +219,15 @@ void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, void __iomem *reg_base, irq_flow_handler_t handler) { + struct irq_chip_type *ct = gc->chip_types; + int i; + raw_spin_lock_init(&gc->lock); gc->num_ct = num_ct; gc->irq_base = irq_base; gc->reg_base = reg_base; - gc->chip_types->chip.name = name; + for (i = 0; i < num_ct; i++) + ct[i].chip.name = name; gc->chip_types->handler = handler; } @@ -544,21 +548,34 @@ EXPORT_SYMBOL_GPL(irq_setup_alt_chip); void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, unsigned int clr, unsigned int set) { - unsigned int i = 
gc->irq_base; + unsigned int i, virq; raw_spin_lock(&gc_lock); list_del(&gc->list); raw_spin_unlock(&gc_lock); - for (; msk; msk >>= 1, i++) { + for (i = 0; msk; msk >>= 1, i++) { if (!(msk & 0x01)) continue; + /* + * Interrupt domain based chips store the base hardware + * interrupt number in gc::irq_base. Otherwise gc::irq_base + * contains the base Linux interrupt number. + */ + if (gc->domain) { + virq = irq_find_mapping(gc->domain, gc->irq_base + i); + if (!virq) + continue; + } else { + virq = gc->irq_base + i; + } + /* Remove handler first. That will mask the irq line */ - irq_set_handler(i, NULL); - irq_set_chip(i, &no_irq_chip); - irq_set_chip_data(i, NULL); - irq_modify_status(i, clr, set); + irq_set_handler(virq, NULL); + irq_set_chip(virq, &no_irq_chip); + irq_set_chip_data(virq, NULL); + irq_modify_status(virq, clr, set); } } EXPORT_SYMBOL_GPL(irq_remove_generic_chip); diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 1698e77645ac..75d0ae490e29 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -466,16 +466,16 @@ unsigned int irq_matrix_reserved(struct irq_matrix *m) } /** - * irq_matrix_allocated - Get the number of allocated irqs on the local cpu + * irq_matrix_allocated - Get the number of allocated non-managed irqs on the local CPU * @m: Pointer to the matrix to search * - * This returns number of allocated irqs + * This returns number of allocated non-managed interrupts. 
*/ unsigned int irq_matrix_allocated(struct irq_matrix *m) { struct cpumap *cm = this_cpu_ptr(m->maps); - return cm->allocated; + return cm->allocated - cm->managed_allocated; } #ifdef CONFIG_GENERIC_IRQ_DEBUGFS diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index b4c31a5c1147..79b4a58ba9c3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1204,7 +1204,6 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc, #define VIRQ_CAN_RESERVE 0x01 #define VIRQ_ACTIVATE 0x02 -#define VIRQ_NOMASK_QUIRK 0x04 static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags) { @@ -1213,8 +1212,6 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag if (!(vflags & VIRQ_CAN_RESERVE)) { irqd_clr_can_reserve(irqd); - if (vflags & VIRQ_NOMASK_QUIRK) - irqd_set_msi_nomask_quirk(irqd); /* * If the interrupt is managed but no CPU is available to @@ -1275,15 +1272,8 @@ static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain * Interrupt can use a reserved vector and will not occupy * a real device vector until the interrupt is requested. */ - if (msi_check_reservation_mode(domain, info, dev)) { + if (msi_check_reservation_mode(domain, info, dev)) vflags |= VIRQ_CAN_RESERVE; - /* - * MSI affinity setting requires a special quirk (X86) when - * reservation mode is active. 
- */ - if (info->flags & MSI_FLAG_NOMASK_QUIRK) - vflags |= VIRQ_NOMASK_QUIRK; - } xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) { if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED)) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 5353edfad8e1..b0639f21041f 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) struct file *file; rcu_read_lock(); - file = task_lookup_fd_rcu(task, idx); + file = task_lookup_fdget_rcu(task, idx); rcu_read_unlock(); + if (file) + fput(file); return file; } diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c index fa2c2f951c6b..e68d82099558 100644 --- a/kernel/locking/lock_events.c +++ b/kernel/locking/lock_events.c @@ -146,7 +146,7 @@ static int __init init_lockevent_counts(void) struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); int i; - if (!d_counts) + if (IS_ERR(d_counts)) goto out; /* @@ -159,14 +159,14 @@ static int __init init_lockevent_counts(void) for (i = 0; i < lockevent_num; i++) { if (skip_lockevent(lockevent_names[i])) continue; - if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, - (void *)(long)i, &fops_lockevent)) + if (IS_ERR(debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent))) goto fail_undo; } - if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + if (IS_ERR(debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, d_counts, (void *)(long)LOCKEVENT_reset_cnts, - &fops_lockevent)) + &fops_lockevent))) goto fail_undo; return 0; diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 15fdc7fa5c68..e2bfb1db589d 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -440,7 +440,7 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) static void seq_time(struct seq_file *m, s64 time) { - char num[15]; + char num[22]; snprint_time(num, sizeof(num), time); 
seq_printf(m, " %14s", num); diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 270c7f80ce84..69d3cd2cfc3b 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,21 +33,23 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); -torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, acq_writer_lim, 0, "Write_acquisition time limit (jiffies)."); +torture_param(int, call_rcu_chains, 0, "Self-propagate call_rcu() chains during test (0=disable)."); torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); +torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, rt_boost, 2, + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); +torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); -torture_param(int, rt_boost, 2, - "Do periodic rt-boost. 
0=Disable, 1=Only for rt_mutex, 2=For all lock types."); -torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); -torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); -torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); +torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ #define MAX_NESTED_LOCKS 8 @@ -56,6 +58,55 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)"); +static cpumask_var_t bind_readers; // Bind the readers to the specified set of CPUs. +static cpumask_var_t bind_writers; // Bind the writers to the specified set of CPUs. + +// Parse a cpumask kernel parameter. If there are more users later on, +// this might need to go to a more central location. +static int param_set_cpumask(const char *val, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + int ret; + char *s; + + if (!alloc_cpumask_var(cm_bind, GFP_KERNEL)) { + s = "Out of memory"; + ret = -ENOMEM; + goto out_err; + } + ret = cpulist_parse(val, *cm_bind); + if (!ret) + return ret; + s = "Bad CPU range"; +out_err: + pr_warn("%s: %s, all CPUs set\n", kp->name, s); + cpumask_setall(*cm_bind); + return ret; +} + +// Output a cpumask kernel parameter. 
+static int param_get_cpumask(char *buffer, const struct kernel_param *kp) +{ + cpumask_var_t *cm_bind = kp->arg; + + return sprintf(buffer, "%*pbl", cpumask_pr_args(*cm_bind)); +} + +static bool cpumask_nonempty(cpumask_var_t mask) +{ + return cpumask_available(mask) && !cpumask_empty(mask); +} + +static const struct kernel_param_ops lt_bind_ops = { + .set = param_set_cpumask, + .get = param_get_cpumask, +}; + +module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0644); +module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0644); + +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); + static struct task_struct *stats_task; static struct task_struct **writer_tasks; static struct task_struct **reader_tasks; @@ -69,6 +120,12 @@ struct lock_stress_stats { long n_lock_acquired; }; +struct call_rcu_chain { + struct rcu_head crc_rh; + bool crc_stop; +}; +struct call_rcu_chain *call_rcu_chain; + /* Forward reference. */ static void lock_torture_cleanup(void); @@ -116,12 +173,9 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -194,15 +248,14 @@ __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = long_hold ? 
long_hold : ULONG_MAX; unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) { + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) { j = jiffies; - mdelay(longdelay_ms); + mdelay(long_hold); pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); } if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) @@ -320,14 +373,12 @@ __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } @@ -348,14 +399,12 @@ __acquires(torture_rwlock) static void torture_rwlock_read_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 10; - const unsigned long longdelay_ms = 100; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealreaders_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold); else udelay(shortdelay_us); } @@ -453,12 +502,9 @@ __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; - /* We want a long delay occasionally to force massive contention. 
*/ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 5); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -626,15 +672,13 @@ __acquires(torture_rtmutex) static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* * We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); @@ -691,12 +735,9 @@ __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; - /* We want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 10); + if (long_hold && !(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * long_hold))) + mdelay(long_hold * 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -716,14 +757,11 @@ __acquires(torture_rwsem) static void torture_rwsem_read_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; - /* We want a long delay occasionally to force massive contention. 
*/ - if (!(torture_random(trsp) % - (cxt.nrealreaders_stress * 2000 * longdelay_ms))) - mdelay(longdelay_ms * 2); + if (long_hold && !(torture_random(trsp) % (cxt.nrealreaders_stress * 2000 * long_hold))) + mdelay(long_hold * 2); else - mdelay(longdelay_ms / 2); + mdelay(long_hold / 2); if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -803,11 +841,13 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = { */ static int lock_torture_writer(void *arg) { + unsigned long j; + unsigned long j1; + u32 lockset_mask; struct lock_stress_stats *lwsp = arg; - int tid = lwsp - cxt.lwsa; DEFINE_TORTURE_RANDOM(rand); - u32 lockset_mask; bool skip_main_lock; + int tid = lwsp - cxt.lwsa; VERBOSE_TOROUT_STRING("lock_torture_writer task started"); if (!rt_task(current)) @@ -834,17 +874,24 @@ static int lock_torture_writer(void *arg) cxt.cur_ops->nested_lock(tid, lockset_mask); if (!skip_main_lock) { + if (acq_writer_lim > 0) + j = jiffies; cxt.cur_ops->writelock(tid); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; lock_is_write_held = true; if (WARN_ON_ONCE(atomic_read(&lock_is_read_held))) lwsp->n_lock_fail++; /* rare, but... */ - + if (acq_writer_lim > 0) { + j1 = jiffies; + WARN_ONCE(time_after(j1, j + acq_writer_lim), + "%s: Lock acquisition took %lu jiffies.\n", + __func__, j1 - j); + } lwsp->n_lock_acquired++; - } - if (!skip_main_lock) { + cxt.cur_ops->write_delay(&rand); + lock_is_write_held = false; WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(tid); @@ -986,16 +1033,69 @@ static int lock_torture_stats(void *arg) return 0; } + static inline void lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, const char *tag) { + static cpumask_t cpumask_all; + cpumask_t *rcmp = cpumask_nonempty(bind_readers) ? bind_readers : &cpumask_all; + cpumask_t *wcmp = cpumask_nonempty(bind_writers) ? 
bind_writers : &cpumask_all; + + cpumask_setall(&cpumask_all); pr_alert("%s" TORTURE_FLAG - "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + "--- %s%s: acq_writer_lim=%d bind_readers=%*pbl bind_writers=%*pbl call_rcu_chains=%d long_hold=%d nested_locks=%d nreaders_stress=%d nwriters_stress=%d onoff_holdoff=%d onoff_interval=%d rt_boost=%d rt_boost_factor=%d shuffle_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d verbose=%d writer_fifo=%d\n", torture_type, tag, cxt.debug_lock ? " [debug]": "", - cxt.nrealwriters_stress, cxt.nrealreaders_stress, - nested_locks, stat_interval, verbose, shuffle_interval, - stutter, shutdown_secs, onoff_interval, onoff_holdoff); + acq_writer_lim, cpumask_pr_args(rcmp), cpumask_pr_args(wcmp), + call_rcu_chains, long_hold, nested_locks, cxt.nrealreaders_stress, + cxt.nrealwriters_stress, onoff_holdoff, onoff_interval, rt_boost, + rt_boost_factor, shuffle_interval, shutdown_secs, stat_interval, stutter, + verbose, writer_fifo); +} + +// If requested, maintain call_rcu() chains to keep a grace period always +// in flight. These increase the probability of getting an RCU CPU stall +// warning and associated diagnostics when a locking primitive stalls. + +static void call_rcu_chain_cb(struct rcu_head *rhp) +{ + struct call_rcu_chain *crcp = container_of(rhp, struct call_rcu_chain, crc_rh); + + if (!smp_load_acquire(&crcp->crc_stop)) { + (void)start_poll_synchronize_rcu(); // Start one grace period... + call_rcu(&crcp->crc_rh, call_rcu_chain_cb); // ... and later start another. + } +} + +// Start the requested number of call_rcu() chains. 
+static int call_rcu_chain_init(void) +{ + int i; + + if (call_rcu_chains <= 0) + return 0; + call_rcu_chain = kcalloc(call_rcu_chains, sizeof(*call_rcu_chain), GFP_KERNEL); + if (!call_rcu_chain) + return -ENOMEM; + for (i = 0; i < call_rcu_chains; i++) { + call_rcu_chain[i].crc_stop = false; + call_rcu(&call_rcu_chain[i].crc_rh, call_rcu_chain_cb); + } + return 0; +} + +// Stop all of the call_rcu() chains. +static void call_rcu_chain_cleanup(void) +{ + int i; + + if (!call_rcu_chain) + return; + for (i = 0; i < call_rcu_chains; i++) + smp_store_release(&call_rcu_chain[i].crc_stop, true); + rcu_barrier(); + kfree(call_rcu_chain); + call_rcu_chain = NULL; } static void lock_torture_cleanup(void) @@ -1048,6 +1148,8 @@ static void lock_torture_cleanup(void) kfree(cxt.lrsa); cxt.lrsa = NULL; + call_rcu_chain_cleanup(); + end: if (cxt.init_called) { if (cxt.cur_ops->exit) @@ -1177,6 +1279,10 @@ static int __init lock_torture_init(void) } } + firsterr = call_rcu_chain_init(); + if (torture_init_error(firsterr)) + goto unwind; + lock_torture_print_module_parms(cxt.cur_ops, "Start of test"); /* Prepare torture context. */ @@ -1250,6 +1356,8 @@ static int __init lock_torture_init(void) writer_fifo ? 
sched_set_fifo : NULL); if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(bind_writers)) + torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers); create_reader: if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress)) @@ -1259,6 +1367,8 @@ static int __init lock_torture_init(void) reader_tasks[j]); if (torture_init_error(firsterr)) goto unwind; + if (cpumask_nonempty(bind_readers)) + torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers); } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d973fe6041bf..2deeeca3e71b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1126,6 +1126,9 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible); #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #endif /* !CONFIG_PREEMPT_RT */ +EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin); +EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end); + /** * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 * @cnt: the atomic which we are to dec diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 21db0df0eb00..4a10e8c16fd2 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -218,6 +218,11 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, return try_cmpxchg_acquire(&lock->owner, &old, new); } +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + return rt_mutex_cmpxchg_acquire(lock, NULL, current); +} + static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) @@ -297,6 +302,20 @@ static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock, } +static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock); + +static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock) +{ + /* + * With debug enabled rt_mutex_cmpxchg trylock() will always 
fail. + * + * Avoid unconditionally taking the slow path by using + * rt_mutex_slow_trylock() which is covered by the debug code and can + * acquire a non-contended rtmutex. + */ + return rt_mutex_slowtrylock(lock); +} + static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, struct task_struct *old, struct task_struct *new) @@ -1613,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, raw_spin_unlock_irq(&lock->wait_lock); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) - schedule(); + rt_mutex_schedule(); raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); @@ -1642,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, WARN(1, "rtmutex deadlock detected\n"); while (1) { set_current_state(TASK_INTERRUPTIBLE); - schedule(); + rt_mutex_schedule(); } } @@ -1738,6 +1757,15 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, int ret; /* + * Do all pre-schedule work here, before we queue a waiter and invoke + * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would + * otherwise recurse back into task_blocks_on_rt_mutex() through + * rtlock_slowlock() and will then enqueue a second waiter for this + * same task and things get really confusing real fast. + */ + rt_mutex_pre_schedule(); + + /* * Technically we could use raw_spin_[un]lock_irq() here, but this can * be called in early boot if the cmpxchg() fast path is disabled * (debug, no architecture support). 
In this case we will acquire the @@ -1748,6 +1776,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + rt_mutex_post_schedule(); return ret; } @@ -1755,7 +1784,9 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock, static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock, unsigned int state) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + lockdep_assert(!current->pi_blocked_on); + + if (likely(rt_mutex_try_acquire(lock))) return 0; return rt_mutex_slowlock(lock, NULL, state); diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 25ec0239477c..34a59569db6b 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, struct rt_mutex_base *rtm = &rwb->rtmutex; int ret; + rwbase_pre_schedule(); raw_spin_lock_irq(&rtm->wait_lock); /* @@ -125,12 +126,15 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb, rwbase_rtmutex_unlock(rtm); trace_contention_end(rwb, ret); + rwbase_post_schedule(); return ret; } static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb, unsigned int state) { + lockdep_assert(!current->pi_blocked_on); + if (rwbase_read_trylock(rwb)) return 0; @@ -237,6 +241,8 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, /* Force readers into slow path */ atomic_sub(READER_BIAS, &rwb->readers); + rwbase_pre_schedule(); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); if (__rwbase_write_trylock(rwb)) goto out_unlock; @@ -248,6 +254,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, if (rwbase_signal_pending_state(state, current)) { rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); + rwbase_post_schedule(); trace_contention_end(rwb, -EINTR); return -EINTR; } @@ -266,6 
+273,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_post_schedule(); return 0; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 9eabd585ce7a..2340b6d90ec6 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1427,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #define rwbase_signal_pending_state(state, current) \ signal_pending_state(state, current) +#define rwbase_pre_schedule() \ + rt_mutex_pre_schedule() + #define rwbase_schedule() \ - schedule() + rt_mutex_schedule() + +#define rwbase_post_schedule() \ + rt_mutex_post_schedule() #include "rwbase_rt.c" diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 48a19ed8486d..38e292454fcc 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -37,6 +37,8 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) { + lockdep_assert(!current->pi_blocked_on); + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) rtlock_slowlock(rtm); } @@ -184,9 +186,13 @@ static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm) #define rwbase_signal_pending_state(state, current) (0) +#define rwbase_pre_schedule() + #define rwbase_schedule() \ schedule_rtlock() +#define rwbase_post_schedule() + #include "rwbase_rt.c" /* * The common functions which get wrapped into the rwlock API. 
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 93cca6e69860..78719e1ef1b1 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -9,7 +9,7 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/module.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/slab.h> #include <linux/ww_mutex.h> @@ -386,6 +386,19 @@ struct stress { int nlocks; }; +struct rnd_state rng; +DEFINE_SPINLOCK(rng_lock); + +static inline u32 prandom_u32_below(u32 ceil) +{ + u32 ret; + + spin_lock(&rng_lock); + ret = prandom_u32_state(&rng) % ceil; + spin_unlock(&rng_lock); + return ret; +} + static int *get_random_order(int count) { int *order; @@ -399,7 +412,7 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = get_random_u32_below(n + 1); + r = prandom_u32_below(n + 1); if (r != n) { tmp = order[n]; order[n] = order[r]; @@ -452,21 +465,21 @@ retry: ww_mutex_unlock(&locks[order[n]]); if (err == -EDEADLK) { - ww_mutex_lock_slow(&locks[order[contended]], &ctx); - goto retry; + if (!time_after(jiffies, stress->timeout)) { + ww_mutex_lock_slow(&locks[order[contended]], &ctx); + goto retry; + } } + ww_acquire_fini(&ctx); if (err) { pr_err_once("stress (%s) failed with %d\n", __func__, err); break; } - - ww_acquire_fini(&ctx); } while (!time_after(jiffies, stress->timeout)); kfree(order); - kfree(stress); } struct reorder_lock { @@ -531,7 +544,6 @@ out: list_for_each_entry_safe(ll, ln, &locks, link) kfree(ll); kfree(order); - kfree(stress); } static void stress_one_work(struct work_struct *work) @@ -552,8 +564,6 @@ static void stress_one_work(struct work_struct *work) break; } } while (!time_after(jiffies, stress->timeout)); - - kfree(stress); } #define STRESS_INORDER BIT(0) @@ -564,15 +574,24 @@ static void stress_one_work(struct work_struct *work) static int stress(int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; - int n; + struct 
stress *stress_array; + int n, count; locks = kmalloc_array(nlocks, sizeof(*locks), GFP_KERNEL); if (!locks) return -ENOMEM; + stress_array = kmalloc_array(nthreads, sizeof(*stress_array), + GFP_KERNEL); + if (!stress_array) { + kfree(locks); + return -ENOMEM; + } + for (n = 0; n < nlocks; n++) ww_mutex_init(&locks[n], &ww_class); + count = 0; for (n = 0; nthreads; n++) { struct stress *stress; void (*fn)(struct work_struct *work); @@ -596,9 +615,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) if (!fn) continue; - stress = kmalloc(sizeof(*stress), GFP_KERNEL); - if (!stress) - break; + stress = &stress_array[count++]; INIT_WORK(&stress->work, fn); stress->locks = locks; @@ -613,6 +630,7 @@ static int stress(int nlocks, int nthreads, unsigned int flags) for (n = 0; n < nlocks; n++) ww_mutex_destroy(&locks[n]); + kfree(stress_array); kfree(locks); return 0; @@ -625,6 +643,8 @@ static int __init test_ww_mutex_init(void) printk(KERN_INFO "Beginning ww mutex selftests\n"); + prandom_seed_state(&rng, get_random_u64()); + wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); if (!wq) return -ENOMEM; diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index d1473c624105..c7196de838ed 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -62,7 +62,7 @@ __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, } mutex_acquire_nest(&rtm->dep_map, 0, 0, nest_lock, ip); - if (likely(rt_mutex_cmpxchg_acquire(&rtm->rtmutex, NULL, current))) { + if (likely(rt_mutex_try_acquire(&rtm->rtmutex))) { if (ww_ctx) ww_mutex_set_context_fastpath(lock, ww_ctx); return 0; diff --git a/kernel/pid.c b/kernel/pid.c index fee14a4486a3..6500ef956f2f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -609,7 +609,7 @@ int pidfd_create(struct pid *pid, unsigned int flags) } /** - * pidfd_open() - Open new pid file descriptor. + * sys_pidfd_open() - Open new pid file descriptor. 
* * @pid: pid for which to retrieve a pidfd * @flags: flags to pass diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8d35b9f9aaa3..dee341ae4ace 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -684,7 +684,7 @@ static void power_down(void) cpu_relax(); } -static int load_image_and_restore(bool snapshot_test) +static int load_image_and_restore(void) { int error; unsigned int flags; @@ -694,12 +694,12 @@ static int load_image_and_restore(bool snapshot_test) lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) { - swsusp_close(snapshot_test); + swsusp_close(); goto Unlock; } error = swsusp_read(&flags); - swsusp_close(snapshot_test); + swsusp_close(); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -788,7 +788,7 @@ int hibernate(void) pm_pr_dbg("Checking hibernation image\n"); error = swsusp_check(false); if (!error) - error = load_image_and_restore(false); + error = load_image_and_restore(); } thaw_processes(); @@ -952,7 +952,7 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(true); + swsusp_close(); goto Unlock; } @@ -973,7 +973,7 @@ static int software_resume(void) goto Close_Finish; } - error = load_image_and_restore(true); + error = load_image_and_restore(); thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); @@ -987,7 +987,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(true); + swsusp_close(); goto Finish; } diff --git a/kernel/power/power.h b/kernel/power/power.h index a98f95e309a3..17fd9aaaf084 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -172,7 +172,7 @@ int swsusp_check(bool exclusive); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -void 
swsusp_close(bool exclusive); +void swsusp_close(void); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); #endif diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 87e9f7e2bdc0..0f12e0a97e43 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2647,7 +2647,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, memory_bm_free(bm, PG_UNSAFE_KEEP); /* Make a copy of zero_bm so it can be created in safe pages */ - error = memory_bm_create(&tmp, GFP_ATOMIC, PG_ANY); + error = memory_bm_create(&tmp, GFP_ATOMIC, PG_SAFE); if (error) goto Free; @@ -2660,7 +2660,7 @@ static int prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm, goto Free; duplicate_memory_bitmap(zero_bm, &tmp); - memory_bm_free(&tmp, PG_UNSAFE_KEEP); + memory_bm_free(&tmp, PG_UNSAFE_CLEAR); /* At this point zero_bm is in safe pages and it can be used for restoring. */ if (nr_highmem > 0) { diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 74edbce2320b..68a5c2f06957 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -222,7 +222,7 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -static struct block_device *hib_resume_bdev; +static struct bdev_handle *hib_resume_bdev_handle; struct hib_bio_batch { atomic_t count; @@ -276,7 +276,8 @@ static int hib_submit_io(blk_opf_t opf, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(hib_resume_bdev, 1, opf, GFP_NOIO | __GFP_HIGH); + bio = bio_alloc(hib_resume_bdev_handle->bdev, 1, opf, + GFP_NOIO | __GFP_HIGH); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { @@ -356,14 +357,14 @@ static int swsusp_swap_check(void) return res; root_swap = res; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device, BLK_OPEN_WRITE, NULL, NULL); - if (IS_ERR(hib_resume_bdev)) - return 
PTR_ERR(hib_resume_bdev); + if (IS_ERR(hib_resume_bdev_handle)) + return PTR_ERR(hib_resume_bdev_handle); - res = set_blocksize(hib_resume_bdev, PAGE_SIZE); + res = set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE); if (res < 0) - blkdev_put(hib_resume_bdev, NULL); + bdev_release(hib_resume_bdev_handle); return res; } @@ -443,7 +444,7 @@ static int get_swap_writer(struct swap_map_handle *handle) err_rel: release_swap_writer(handle); err_close: - swsusp_close(false); + swsusp_close(); return ret; } @@ -508,7 +509,7 @@ static int swap_writer_finish(struct swap_map_handle *handle, if (error) free_all_swap_pages(root_swap); release_swap_writer(handle); - swsusp_close(false); + swsusp_close(); return error; } @@ -1522,10 +1523,10 @@ int swsusp_check(bool exclusive) void *holder = exclusive ? &swsusp_holder : NULL; int error; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ, - holder, NULL); - if (!IS_ERR(hib_resume_bdev)) { - set_blocksize(hib_resume_bdev, PAGE_SIZE); + hib_resume_bdev_handle = bdev_open_by_dev(swsusp_resume_device, + BLK_OPEN_READ, holder, NULL); + if (!IS_ERR(hib_resume_bdev_handle)) { + set_blocksize(hib_resume_bdev_handle->bdev, PAGE_SIZE); clear_page(swsusp_header); error = hib_submit_io(REQ_OP_READ, swsusp_resume_block, swsusp_header, NULL); @@ -1550,11 +1551,11 @@ int swsusp_check(bool exclusive) put: if (error) - blkdev_put(hib_resume_bdev, holder); + bdev_release(hib_resume_bdev_handle); else pr_debug("Image signature found, resuming\n"); } else { - error = PTR_ERR(hib_resume_bdev); + error = PTR_ERR(hib_resume_bdev_handle); } if (error) @@ -1568,14 +1569,14 @@ put: * @exclusive: Close the resume device which is exclusively opened. */ -void swsusp_close(bool exclusive) +void swsusp_close(void) { - if (IS_ERR(hib_resume_bdev)) { + if (IS_ERR(hib_resume_bdev_handle)) { pr_debug("Image device not initialised\n"); return; } - blkdev_put(hib_resume_bdev, exclusive ? 
&swsusp_holder : NULL); + bdev_release(hib_resume_bdev_handle); } /** diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7e0b4dd02398..0b3af1529778 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3740,12 +3740,18 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre seq = prb_next_seq(prb); + /* Flush the consoles so that records up to @seq are printed. */ + console_lock(); + console_unlock(); + for (;;) { diff = 0; /* * Hold the console_lock to guarantee safe access to - * console->seq. + * console->seq. Releasing console_lock flushes more + * records in case @seq is still not printed on all + * usable consoles. */ console_lock(); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 98e13be411af..0d866eaa4cc8 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -10,6 +10,7 @@ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include <linux/slab.h> #include <trace/events/rcu.h> /* @@ -248,6 +249,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_callback(struct rcu_head *rhp) +{ + if (unlikely(!rhp->func)) + kmem_dump_obj(rhp); +} + extern int rcu_cpu_stall_suppress_at_boot; static inline bool rcu_stall_is_suppressed_at_boot(void) @@ -568,10 +575,6 @@ void do_trace_rcu_torture_read(const char *rcutorturename, static inline void rcu_gp_set_torture_wait(int duration) { } #endif -#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) -long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); -#endif - #ifdef CONFIG_TINY_SRCU static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, @@ -654,4 +657,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } bool rcu_cpu_beenfullyonline(int cpu); #endif +#ifdef CONFIG_RCU_STALL_COMMON +int rcu_stall_notifier_call_chain(unsigned long val, void *v); +#else // #ifdef 
CONFIG_RCU_STALL_COMMON +static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; } +#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index f71fac422c8f..1693ea22ef1b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -368,7 +368,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1]) + if (!rcu_segcblist_segempty(rsclp, i)) break; rcu_segcblist_inc_seglen(rsclp, i); WRITE_ONCE(*rsclp->tails[i], rhp); @@ -551,7 +551,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * as their ->gp_seq[] grace-period completion sequence number. */ for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && + if (!rcu_segcblist_segempty(rsclp, i) && ULONG_CMP_LT(rsclp->gp_seq[i], seq)) break; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ade42d6a9d9b..30fc9d34e329 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -21,6 +21,7 @@ #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/rcupdate_wait.h> +#include <linux/rcu_notifier.h> #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <uapi/linux/sched/types.h> @@ -810,7 +811,7 @@ static void synchronize_rcu_trivial(void) int cpu; for_each_online_cpu(cpu) { - rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + torture_sched_setaffinity(current->pid, cpumask_of(cpu)); WARN_ON_ONCE(raw_smp_processor_id() != cpu); } } @@ -1149,7 +1150,7 @@ static int rcu_torture_boost(void *arg) mutex_unlock(&boost_mutex); break; } - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } /* Go do the stutter. 
*/ @@ -1160,7 +1161,7 @@ checkwait: if (stutter_wait("rcu_torture_boost")) /* Clean up and exit. */ while (!kthread_should_stop()) { torture_shutdown_absorb("rcu_torture_boost"); - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); } torture_kthread_stopping("rcu_torture_boost"); return 0; @@ -1183,7 +1184,7 @@ rcu_torture_fqs(void *arg) fqs_resume_time = jiffies + fqs_stutter * HZ; while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(HZ / 20); } fqs_burst_remaining = fqs_duration; while (fqs_burst_remaining > 0 && @@ -2126,7 +2127,7 @@ static int rcu_nocb_toggle(void *arg) VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started"); while (!rcu_inkernel_boot_has_ended()) schedule_timeout_interruptible(HZ / 10); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); if (toggle_interval > ULONG_MAX) @@ -2428,6 +2429,16 @@ static int rcutorture_booster_init(unsigned int cpu) return 0; } +static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr) +{ + pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr); + return NOTIFY_OK; +} + +static struct notifier_block rcu_torture_stall_block = { + .notifier_call = rcu_torture_stall_nf, +}; + /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -2435,9 +2446,14 @@ static int rcutorture_booster_init(unsigned int cpu) static int rcu_torture_stall(void *args) { int idx; + int ret; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? 
"un" : ""); if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2481,6 +2497,11 @@ static int rcu_torture_stall(void *args) cur_ops->readunlock(idx); } pr_alert("%s end.\n", __func__); + if (!ret) { + ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret); + } torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -2899,7 +2920,7 @@ static int rcu_torture_fwd_prog(void *args) WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); } else { while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop()) - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(HZ / 20); oldseq = READ_ONCE(rcu_fwd_seq); } pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); @@ -3200,7 +3221,7 @@ static int rcu_torture_read_exit_child(void *trsp_in) set_user_nice(current, MAX_NICE); // Minimize time between reading and exiting. while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); (void)rcu_torture_one_read(trsp, -1); return 0; } @@ -3248,7 +3269,7 @@ static int rcu_torture_read_exit(void *unused) smp_mb(); // Store before wakeup. wake_up(&read_exit_wq); while (!torture_must_stop()) - schedule_timeout_uninterruptible(1); + schedule_timeout_uninterruptible(HZ / 20); torture_kthread_stopping("rcu_torture_read_exit"); return 0; } diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 91a0fd0d4d9a..2c2648a3ad30 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -655,12 +655,12 @@ retry: goto retry; } un_delay(udl, ndl); + b = READ_ONCE(rtsp->a); // Remember, seqlock read-side release can fail. 
if (!rts_release(rtsp, start)) { rcu_read_unlock(); goto retry; } - b = READ_ONCE(rtsp->a); WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b); b = rtsp->b; rcu_read_unlock(); @@ -1025,8 +1025,8 @@ static void ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } static void diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 336af24e0fe3..c38e5933a5d6 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp) while (lh) { rhp = lh; lh = lh->next; + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 20d7a238d675..560e99ec5333 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -223,7 +223,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) snp->grplo = cpu; snp->grphi = cpu; } - sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); + sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo); } smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); return true; @@ -255,29 +255,31 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); - if (!ssp->sda) { - if (!is_static) - kfree(ssp->srcu_sup); - return -ENOMEM; - } + if (!ssp->sda) + goto err_free_sup; init_srcu_struct_data(ssp); ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; 
ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { - if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!ssp->srcu_sup->sda_is_static) { - free_percpu(ssp->sda); - ssp->sda = NULL; - kfree(ssp->srcu_sup); - return -ENOMEM; - } - } else { - WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); - } + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) + goto err_free_sda; + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ return 0; + +err_free_sda: + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } +err_free_sup: + if (!is_static) { + kfree(ssp->srcu_sup); + ssp->srcu_sup = NULL; + } + return -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -782,8 +784,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); + WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. 
*/ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); @@ -833,7 +834,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp int cpu; for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - if (!(mask & (1 << (cpu - snp->grplo)))) + if (!(mask & (1UL << (cpu - snp->grplo)))) continue; srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } @@ -1242,10 +1243,37 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + /* + * The snapshot for acceleration must be taken _before_ the read of the + * current gp sequence used for advancing, otherwise advancing may fail + * and acceleration may then fail too. + * + * This could happen if: + * + * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the + * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). + * + * 2) The grace period for RCU_WAIT_TAIL is seen as started but not + * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1. + * + * 3) This value is passed to rcu_segcblist_advance() which can't move + * any segment forward and fails. + * + * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. + * But then the call to rcu_seq_snap() observes the grace period for the + * RCU_WAIT_TAIL segment as completed and the subsequent one for the + * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1) + * so it returns a snapshot of the next grace period, which is X + 12. + * + * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the + * freshly enqueued callback in RCU_NEXT_TAIL can't move to + * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace + * period (gp_num = X + 8). So acceleration fails. 
+ */ + s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; needgp = true; @@ -1692,6 +1720,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); + WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || @@ -1708,6 +1737,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -1720,8 +1750,6 @@ static void srcu_invoke_callbacks(struct work_struct *work) */ spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8d65f7d576a3..1fa631168594 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -432,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) { int cpu; + int dequeue_limit; unsigned long flags; bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq); long n; @@ -439,7 +440,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) long ncbsnz = 0; int needgpcb = 0; - for (cpu = 0; cpu < 
smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { + dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); + for (cpu = 0; cpu < dequeue_limit; cpu++) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -538,6 +540,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); len = rcl.len; for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); @@ -1084,7 +1087,7 @@ void rcu_barrier_tasks(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); -int rcu_tasks_lazy_ms = -1; +static int rcu_tasks_lazy_ms = -1; module_param(rcu_tasks_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_kthread(void) @@ -1979,20 +1982,22 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) static void rcu_tasks_initiate_self_tests(void) { - pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU + pr_info("Running RCU Tasks wait API self tests\n"); tests[0].runstart = jiffies; synchronize_rcu_tasks(); call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_RUDE_RCU + pr_info("Running RCU Tasks Rude wait API self tests\n"); tests[1].runstart = jiffies; synchronize_rcu_tasks_rude(); call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU + pr_info("Running RCU Tasks Trace wait API self tests\n"); tests[2].runstart = jiffies; synchronize_rcu_tasks_trace(); call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 42f7589e51e0..fec804b79080 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) trace_rcu_invoke_callback("", head); f = head->func; + debug_rcu_head_callback(head); WRITE_ONCE(head->func, 
(rcu_callback_t)0L); f(head); rcu_lock_release(&rcu_callback_map); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb1caefa8bd0..700524726079 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -31,6 +31,7 @@ #include <linux/bitops.h> #include <linux/export.h> #include <linux/completion.h> +#include <linux/kmemleak.h> #include <linux/moduleparam.h> #include <linux/panic.h> #include <linux/panic_notifier.h> @@ -1260,7 +1261,7 @@ EXPORT_SYMBOL_GPL(rcu_gp_slow_register); /* Unregister a counter, with NULL for not caring which. */ void rcu_gp_slow_unregister(atomic_t *rgssp) { - WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL); WRITE_ONCE(rcu_gp_slow_suppress, NULL); } @@ -1556,10 +1557,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp) */ static void rcu_gp_fqs(bool first_time) { + int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall); struct rcu_node *rnp = rcu_get_root(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1); + + WARN_ON_ONCE(nr_fqs > 3); + /* Only countdown nr_fqs for stall purposes if jiffies moves. */ + if (nr_fqs) { + if (nr_fqs == 1) { + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); + } + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs); + } + if (first_time) { /* Collect dyntick-idle snapshots. 
*/ force_qs_rnp(dyntick_save_progress_counter); @@ -2135,6 +2148,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_invoke_callback(rcu_state.name, rhp); f = rhp->func; + debug_rcu_head_callback(rhp); WRITE_ONCE(rhp->func, (rcu_callback_t)0L); f(rhp); @@ -2713,7 +2727,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) */ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, false); + __call_rcu_common(head, func, false); } EXPORT_SYMBOL_GPL(call_rcu_hurry); #endif @@ -2764,7 +2778,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); + __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); } EXPORT_SYMBOL_GPL(call_rcu); @@ -3388,6 +3402,14 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) success = true; } + /* + * The kvfree_rcu() caller considers the pointer freed at this point + * and likely removes any references to it. Since the actual slab + * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore + * this object (no scanning or false positives reporting). + */ + kmemleak_ignore(ptr); + // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) schedule_delayed_monitor_work(krcp); @@ -4083,6 +4105,82 @@ retry: } EXPORT_SYMBOL_GPL(rcu_barrier); +static unsigned long rcu_barrier_last_throttle; + +/** + * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second + * + * This can be thought of as guard rails around rcu_barrier() that + * permits unrestricted userspace use, at least assuming the hardware's + * try_cmpxchg() is robust. There will be at most one call per second to + * rcu_barrier() system-wide from use of this function, which means that + * callers might needlessly wait a second or three. 
+ * + * This is intended for use by test suites to avoid OOM by flushing RCU + * callbacks from the previous test before starting the next. See the + * rcutree.do_rcu_barrier module parameter for more information. + * + * Why not simply make rcu_barrier() more scalable? That might be + * the eventual endpoint, but let's keep it simple for the time being. + * Note that the module parameter infrastructure serializes calls to a + * given .set() function, but should concurrent .set() invocation ever be + * possible, we are ready! + */ +static void rcu_barrier_throttled(void) +{ + unsigned long j = jiffies; + unsigned long old = READ_ONCE(rcu_barrier_last_throttle); + unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); + + while (time_in_range(j, old, old + HZ / 16) || + !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) { + schedule_timeout_idle(HZ / 16); + if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { + smp_mb(); /* caller's subsequent code after above check. */ + return; + } + j = jiffies; + old = READ_ONCE(rcu_barrier_last_throttle); + } + rcu_barrier(); +} + +/* + * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier + * request arrives. We insist on a true value to allow for possible + * future expansion. + */ +static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp) +{ + bool b; + int ret; + + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) + return -EAGAIN; + ret = kstrtobool(val, &b); + if (!ret && b) { + atomic_inc((atomic_t *)kp->arg); + rcu_barrier_throttled(); + atomic_dec((atomic_t *)kp->arg); + } + return ret; +} + +/* + * Output the number of outstanding rcutree.do_rcu_barrier requests. 
+ */ +static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg)); +} + +static const struct kernel_param_ops do_rcu_barrier_ops = { + .set = param_set_do_rcu_barrier, + .get = param_get_do_rcu_barrier, +}; +static atomic_t do_rcu_barrier; +module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644); + /* * Compute the mask of online CPUs for the specified rcu_node structure. * This will not be stable unless the rcu_node structure's ->lock is @@ -4130,7 +4228,7 @@ bool rcu_lockdep_current_cpu_online(void) rdp = this_cpu_ptr(&rcu_data); /* * Strictly, we care here about the case where the current CPU is - * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask + * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask * not being up to date. So arch_spin_is_locked() might have a * false positive if it's held by some *other* CPU, but that's * OK because that just means a false *negative* on the warning. @@ -4152,25 +4250,6 @@ static bool rcu_init_invoked(void) } /* - * Near the end of the offline process. Trace the fact that this CPU - * is going offline. - */ -int rcutree_dying_cpu(unsigned int cpu) -{ - bool blkd; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), - blkd ? 
TPS("cpuofl-bgp") : TPS("cpuofl")); - return 0; -} - -/* * All CPUs for the specified rcu_node structure have gone offline, * and all tasks that were preempted within an RCU read-side critical * section while running on one of those CPUs have since exited their RCU @@ -4216,23 +4295,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } /* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup. - * There can only be one CPU hotplug operation at a time, so no need for - * explicit locking. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); - // Stop-machine done, so allow nohz_full to disable tick. - tick_dep_clear(TICK_DEP_BIT_RCU); - return 0; -} - -/* * Propagate ->qsinitmask bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller * must hold the corresponding leaf rcu_node ->lock with interrupts @@ -4385,29 +4447,6 @@ int rcutree_online_cpu(unsigned int cpu) } /* - * Near the beginning of the process. The CPU is still very much alive - * with pretty much all services enabled. - */ -int rcutree_offline_cpu(unsigned int cpu) -{ - unsigned long flags; - struct rcu_data *rdp; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask &= ~rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - rcutree_affinity_setting(cpu, cpu); - - // nohz_full CPUs need the tick for stop-machine to work quickly - tick_dep_set(TICK_DEP_BIT_RCU); - return 0; -} - -/* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. 
Note that this means that * incoming CPUs are not allowed to use RCU read-side critical sections @@ -4418,8 +4457,10 @@ int rcutree_offline_cpu(unsigned int cpu) * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. * This incoming CPU must not have enabled interrupts yet. + * + * This mirrors the effects of rcutree_report_cpu_dead(). */ -void rcu_cpu_starting(unsigned int cpu) +void rcutree_report_cpu_starting(unsigned int cpu) { unsigned long mask; struct rcu_data *rdp; @@ -4473,14 +4514,21 @@ void rcu_cpu_starting(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the outgoing CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * + * This mirrors the effect of rcutree_report_cpu_starting(). */ -void rcu_report_dead(unsigned int cpu) +void rcutree_report_cpu_dead(void) { - unsigned long flags, seq_flags; + unsigned long flags; unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + /* + * IRQS must be disabled from now on and until the CPU dies, or an interrupt + * may introduce a new READ-side while it is actually off the QS masks. + */ + lockdep_assert_irqs_disabled(); // Do any dangling deferred wakeups. do_nocb_deferred_wakeup(rdp); @@ -4488,7 +4536,6 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - local_irq_save(seq_flags); arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. 
*/ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4502,8 +4549,6 @@ void rcu_report_dead(unsigned int cpu) WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(seq_flags); - rdp->cpu_started = false; } @@ -4558,7 +4603,60 @@ void rcutree_migrate_callbacks(int cpu) cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); } -#endif + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. + */ +int rcutree_dead_cpu(unsigned int cpu) +{ + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); + return 0; +} + +/* + * Near the end of the offline process. Trace the fact that this CPU + * is going offline. + */ +int rcutree_dying_cpu(unsigned int cpu) +{ + bool blkd; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp = rdp->mynode; + + blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); + return 0; +} + +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. 
+ */ +int rcutree_offline_cpu(unsigned int cpu) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + rcutree_affinity_setting(cpu, cpu); + + // nohz_full CPUs need the tick for stop-machine to work quickly + tick_dep_set(TICK_DEP_BIT_RCU); + return 0; +} +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* * On non-huge systems, use expedited RCU grace periods to make suspend @@ -4990,7 +5088,7 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 192536916f9a..e9821a8422db 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -386,6 +386,10 @@ struct rcu_state { /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + int nr_fqs_jiffies_stall; /* Number of fqs loops after + * which read jiffies and set + * jiffies_stall. Stall + * warnings disabled if !0. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. 
*/ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8239b39d945b..6d7cea5d591f 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -621,10 +621,14 @@ static void synchronize_rcu_expedited_wait(void) } for (;;) { + unsigned long j; + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; if (rcu_stall_is_suppressed()) continue; + j = jiffies; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); @@ -647,7 +651,7 @@ static void synchronize_rcu_expedited_wait(void) } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rcu_state.expedited_sequence, + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6f06dc12904a..ac8e86babe44 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -8,6 +8,7 @@ */ #include <linux/kvm_para.h> +#include <linux/rcu_notifier.h> ////////////////////////////////////////////////////////////////////////////// // @@ -149,12 +150,17 @@ static void panic_on_rcu_stall(void) /** * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period * + * To perform the reset request from the caller, disable stall detection until + * 3 fqs loops have passed. This is required to ensure a fresh jiffies is + * loaded. It should be safe to do from the fqs loop as enough timer + * interrupts and context switches should have passed. + * * The caller must disable hard irqs. 
*/ void rcu_cpu_stall_reset(void) { - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3); + WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX); } ////////////////////////////////////////////////////////////////////////////// @@ -170,6 +176,7 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0); WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); @@ -534,16 +541,16 @@ static void rcu_check_gp_kthread_starvation(void) data_race(READ_ONCE(rcu_state.gp_state)), gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu); if (gpk) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); - if (cpu >= 0) { - if (cpu_is_offline(cpu)) { - pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); - } else { - pr_err("Stack dump where RCU GP kthread last ran:\n"); - dump_cpu_task(cpu); - } + if (cpu_is_offline(cpu)) { + pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); + } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) { + pr_err("Stack dump where RCU GP kthread last ran:\n"); + dump_cpu_task(cpu); } wake_up_process(gpk); } @@ -711,7 +718,7 @@ static void print_cpu_stall(unsigned long gps) static void check_cpu_stall(struct rcu_data *rdp) { - bool didstall = false; + bool self_detected; unsigned long gs1; unsigned long gs2; unsigned long gps; @@ -725,6 +732,16 @@ static void check_cpu_stall(struct rcu_data *rdp) !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); + + /* + * Check if it was requested (via 
rcu_cpu_stall_reset()) that the FQS + * loop has to set jiffies to ensure a non-stale jiffies value. This + * is required to have good jiffies value after coming out of long + * breaks of jiffies updates. Not doing so can cause false positives. + */ + if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0) + return; + j = jiffies; /* @@ -758,10 +775,10 @@ static void check_cpu_stall(struct rcu_data *rdp) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + ULONG_MAX / 2; + self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask; if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - /* * If a virtual machine is stopped by the host it can look to * the watchdog like an RCU stall. Check to see if the host @@ -770,39 +787,28 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(gps); - if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) - rcu_ftrace_dump(DUMP_ALL); - didstall = true; - - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* - * If a virtual machine is stopped by the host it can look to - * the watchdog like an RCU stall. Check to see if the host - * stopped the vm. - */ - if (kvm_check_and_clear_guest_paused()) - return; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); + if (self_detected) { + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(gps); + } else { + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(gs2, gps); + } - /* They had a few time units to dump stack, so complain. 
*/ - print_other_cpu_stall(gs2, gps); if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); - didstall = true; - } - if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) { - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - WRITE_ONCE(rcu_state.jiffies_stall, jn); + + if (READ_ONCE(rcu_state.jiffies_stall) == jn) { + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcu_state.jiffies_stall, jn); + } } } ////////////////////////////////////////////////////////////////////////////// // -// RCU forward-progress mechanisms, including of callback invocation. +// RCU forward-progress mechanisms, including for callback invocation. /* @@ -1054,3 +1060,58 @@ static int __init rcu_sysrq_init(void) return 0; } early_initcall(rcu_sysrq_init); + + +////////////////////////////////////////////////////////////////////////////// +// +// RCU CPU stall-warning notifiers + +static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list); + +/** + * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier + * @n: Entry to add. + * + * Adds an RCU CPU stall notifier to an atomic notifier chain. + * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or + * friends. The @data will be the duration of the stalled grace period, + * in jiffies, coerced to a void* pointer. + * + * Returns 0 on success, %-EEXIST on error. + */ +int rcu_stall_chain_notifier_register(struct notifier_block *n) +{ + return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n); +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register); + +/** + * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier + * @n: Entry to add. + * + * Removes an RCU CPU stall notifier from an atomic notifier chain. + * + * Returns zero on success, %-ENOENT on failure. 
+ */ +int rcu_stall_chain_notifier_unregister(struct notifier_block *n) +{ + return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n); +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister); + +/* + * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in the RCU CPU stall notifier chain in turn, which + * is an atomic call chain. See atomic_notifier_call_chain() for more + * information. + * + * This is for use within RCU, hence the omission of the extra asterisk + * to indicate a non-kerneldoc format header comment. + */ +int rcu_stall_notifier_call_chain(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v); +} diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 19bf6fa3ee6a..c534d6806d3d 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -25,6 +25,7 @@ #include <linux/interrupt.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> +#include <linux/torture.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/percpu.h> @@ -524,17 +525,17 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif -#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST) /* Get rcutorture access to sched_setaffinity(). 
*/ -long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { int ret; ret = sched_setaffinity(pid, in_mask); - WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); + WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret); return ret; } -EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); +EXPORT_SYMBOL_GPL(torture_sched_setaffinity); #endif #ifdef CONFIG_RCU_STALL_COMMON diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 99bdd96f454f..80a3df49ab47 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -34,7 +34,6 @@ #include <linux/nospec.h> #include <linux/proc_fs.h> #include <linux/psi.h> -#include <linux/psi.h> #include <linux/ptrace_api.h> #include <linux/sched_clock.h> #include <linux/security.h> diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2299a5cfbfb9..81885748871d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -85,7 +85,6 @@ #include "sched.h" #include "stats.h" -#include "autogroup.h" #include "autogroup.h" #include "pelt.h" @@ -114,6 +113,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -919,14 +919,13 @@ static bool set_nr_if_polling(struct task_struct *p) struct thread_info *ti = task_thread_info(p); typeof(ti->flags) val = READ_ONCE(ti->flags); - for (;;) { + do { if (!(val & _TIF_POLLING_NRFLAG)) return false; if (val & _TIF_NEED_RESCHED) return true; - if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)) - break; - } + } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); + return true; } @@ -1480,16 +1479,12 @@ static void 
__uclamp_update_util_min_rt_default(struct task_struct *p) static void uclamp_update_util_min_rt_default(struct task_struct *p) { - struct rq_flags rf; - struct rq *rq; - if (!rt_task(p)) return; /* Protect updates to p->uclamp_* */ - rq = task_rq_lock(p, &rf); + guard(task_rq_lock)(p); __uclamp_update_util_min_rt_default(p); - task_rq_unlock(rq, p, &rf); } static inline struct uclamp_se @@ -1785,9 +1780,8 @@ static void uclamp_update_root_tg(void) uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); - rcu_read_lock(); + guard(rcu)(); cpu_util_update_eff(&root_task_group.css); - rcu_read_unlock(); } #else static void uclamp_update_root_tg(void) { } @@ -1814,10 +1808,9 @@ static void uclamp_sync_util_min_rt_default(void) smp_mb__after_spinlock(); read_unlock(&tasklist_lock); - rcu_read_lock(); + guard(rcu)(); for_each_process_thread(g, p) uclamp_update_util_min_rt_default(p); - rcu_read_unlock(); } static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, @@ -2218,10 +2211,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->check_preempt_curr(rq, p, flags); + rq->curr->sched_class->wakeup_preempt(rq, p, flags); else if (sched_class_above(p->sched_class, rq->curr->sched_class)) resched_curr(rq); @@ -2239,31 +2232,21 @@ int __task_state_match(struct task_struct *p, unsigned int state) if (READ_ONCE(p->__state) & state) return 1; -#ifdef CONFIG_PREEMPT_RT if (READ_ONCE(p->saved_state) & state) return -1; -#endif + return 0; } static __always_inline int task_state_match(struct task_struct *p, unsigned int state) { -#ifdef CONFIG_PREEMPT_RT - int match; - /* - * Serialize against current_save_and_set_rtlock_wait_state() and - * 
current_restore_rtlock_saved_state(). + * Serialize against current_save_and_set_rtlock_wait_state(), + * current_restore_rtlock_saved_state(), and __refrigerator(). */ - raw_spin_lock_irq(&p->pi_lock); - match = __task_state_match(p, state); - raw_spin_unlock_irq(&p->pi_lock); - - return match; -#else + guard(raw_spinlock_irq)(&p->pi_lock); return __task_state_match(p, state); -#endif } /* @@ -2417,10 +2400,9 @@ void migrate_disable(void) return; } - preempt_disable(); + guard(preempt)(); this_rq()->nr_pinned++; p->migration_disabled = 1; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @@ -2444,7 +2426,7 @@ void migrate_enable(void) * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ - preempt_disable(); + guard(preempt)(); if (p->cpus_ptr != &p->cpus_mask) __set_cpus_allowed_ptr(p, &ac); /* @@ -2455,7 +2437,6 @@ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -2527,7 +2508,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); return rq; } @@ -2664,9 +2645,11 @@ static int migration_cpu_stop(void *data) * it. 
*/ WARN_ON_ONCE(!pending->stop_pending); + preempt_disable(); task_rq_unlock(rq, p, &rf); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); + preempt_enable(); return 0; } out: @@ -2986,12 +2969,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag complete = true; } + preempt_disable(); task_rq_unlock(rq, p, rf); - if (push_task) { stop_one_cpu_nowait(rq->cpu, push_cpu_stop, p, &rq->push_work); } + preempt_enable(); if (complete) complete_all(&pending->done); @@ -3057,12 +3041,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (flags & SCA_MIGRATE_ENABLE) p->migration_flags &= ~MDF_PUSH; + preempt_disable(); task_rq_unlock(rq, p, rf); - if (!stop_pending) { stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, &pending->arg, &pending->stop_work); } + preempt_enable(); if (flags & SCA_MIGRATE_ENABLE) return 0; @@ -3409,7 +3394,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - check_preempt_curr(dst_rq, p, 0); + wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); rq_unpin_lock(src_rq, &srf); @@ -3516,13 +3501,11 @@ out: */ void kick_process(struct task_struct *p) { - int cpu; + guard(preempt)(); + int cpu = task_cpu(p); - preempt_disable(); - cpu = task_cpu(p); if ((cpu != smp_processor_id()) && task_curr(p)) smp_send_reschedule(cpu); - preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); @@ -3785,7 +3768,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } activate_task(rq, p, en_flags); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); ttwu_do_wakeup(p); @@ -3809,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle / 2; - rq->idle_stamp = 0; } #endif @@ -3856,7 
+3836,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) * it should preempt the task that is current now. */ update_rq_clock(rq); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); ret = 1; @@ -3956,6 +3936,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share cache resources, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. + */ +bool cpus_share_resources(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); +} + static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* @@ -4036,13 +4028,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * The caller holds p::pi_lock if p != current or has preemption * disabled when p == current. * - * The rules of PREEMPT_RT saved_state: + * The rules of saved_state: * * The related locking code always holds p::pi_lock when updating * p::saved_state, which means the code is fully serialized in both cases. * - * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other - * bits set. This allows to distinguish all wakeup scenarios. + * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. + * No other bits set. This allows to distinguish all wakeup scenarios. + * + * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This + * allows us to prevent early wakeup of tasks before they can be run on + * asymmetric ISA architectures (eg ARMv9). 
*/ static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) @@ -4056,13 +4052,13 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) *success = !!(match = __task_state_match(p, state)); -#ifdef CONFIG_PREEMPT_RT /* * Saved state preserves the task state across blocking on - * an RT lock. If the state matches, set p::saved_state to - * TASK_RUNNING, but do not wake the task because it waits - * for a lock wakeup. Also indicate success because from - * the regular waker's point of view this has succeeded. + * an RT lock or TASK_FREEZABLE tasks. If the state matches, + * set p::saved_state to TASK_RUNNING, but do not wake the task + * because it waits for a lock wakeup or __thaw_task(). Also + * indicate success because from the regular waker's point of + * view this has succeeded. * * After acquiring the lock the task will restore p::__state * from p::saved_state which ensures that the regular @@ -4072,7 +4068,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) */ if (match < 0) p->saved_state = TASK_RUNNING; -#endif + return match > 0; } @@ -4254,7 +4250,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). * - * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). + * A similar smp_rmb() lives in __task_needs_rq_lock(). */ smp_rmb(); if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) @@ -4871,7 +4867,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); - check_preempt_curr(rq, p, WF_FORK); + wakeup_preempt(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -5374,8 +5370,6 @@ context_switch(struct rq *rq, struct task_struct *prev, /* switch_mm_cid() requires the memory barriers above. 
*/ switch_mm_cid(rq, prev, next); - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ @@ -5916,8 +5910,7 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } @@ -6368,8 +6361,9 @@ static void sched_core_balance(struct rq *rq) struct sched_domain *sd; int cpu = cpu_of(rq); - preempt_disable(); - rcu_read_lock(); + guard(preempt)(); + guard(rcu)(); + raw_spin_rq_unlock_irq(rq); for_each_domain(cpu, sd) { if (need_resched()) @@ -6379,8 +6373,6 @@ static void sched_core_balance(struct rq *rq) break; } raw_spin_rq_lock_irq(rq); - rcu_read_unlock(); - preempt_enable(); } static DEFINE_PER_CPU(struct balance_callback, core_balance_head); @@ -6615,6 +6607,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; update_rq_clock(rq); + rq->clock_update_flags = RQCF_UPDATED; switch_count = &prev->nivcsw; @@ -6694,8 +6687,6 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); @@ -6720,22 +6711,24 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { + static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG); unsigned int task_flags; - if (task_is_running(tsk)) - return; + /* + * Establish LD_WAIT_CONFIG context to ensure none of the code called + * will use a blocking primitive -- which would lead to recursion. 
+ */ + lock_map_acquire_try(&sched_map); task_flags = tsk->flags; /* * If a worker goes to sleep, notify and ask workqueue whether it * wants to wake up a task to maintain concurrency. */ - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { - if (task_flags & PF_WQ_WORKER) - wq_worker_sleeping(tsk); - else - io_wq_worker_sleeping(tsk); - } + if (task_flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else if (task_flags & PF_IO_WORKER) + io_wq_worker_sleeping(tsk); /* * spinlock and rwlock must not flush block requests. This will @@ -6749,6 +6742,8 @@ static inline void sched_submit_work(struct task_struct *tsk) * make sure to submit it to avoid deadlocks. */ blk_flush_plug(tsk->plug, true); + + lock_map_release(&sched_map); } static void sched_update_worker(struct task_struct *tsk) @@ -6761,16 +6756,26 @@ static void sched_update_worker(struct task_struct *tsk) } } -asmlinkage __visible void __sched schedule(void) +static __always_inline void __schedule_loop(unsigned int sched_mode) { - struct task_struct *tsk = current; - - sched_submit_work(tsk); do { preempt_disable(); - __schedule(SM_NONE); + __schedule(sched_mode); sched_preempt_enable_no_resched(); } while (need_resched()); +} + +asmlinkage __visible void __sched schedule(void) +{ + struct task_struct *tsk = current; + +#ifdef CONFIG_RT_MUTEXES + lockdep_assert(!tsk->sched_rt_mutex); +#endif + + if (!task_is_running(tsk)) + sched_submit_work(tsk); + __schedule_loop(SM_NONE); sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -6834,11 +6839,7 @@ void __sched schedule_preempt_disabled(void) #ifdef CONFIG_PREEMPT_RT void __sched notrace schedule_rtlock(void) { - do { - preempt_disable(); - __schedule(SM_RTLOCK_WAIT); - sched_preempt_enable_no_resched(); - } while (need_resched()); + __schedule_loop(SM_RTLOCK_WAIT); } NOKPROBE_SYMBOL(schedule_rtlock); #endif @@ -7034,6 +7035,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio) #ifdef CONFIG_RT_MUTEXES +/* + * Would be more useful with 
typeof()/auto_type but they don't mix with + * bit-fields. Since it's a local thing, use int. Keep the generic sounding + * name such that if someone were to implement this function we get to compare + * notes. + */ +#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; }) + +void rt_mutex_pre_schedule(void) +{ + lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1)); + sched_submit_work(current); +} + +void rt_mutex_schedule(void) +{ + lockdep_assert(current->sched_rt_mutex); + __schedule_loop(SM_NONE); +} + +void rt_mutex_post_schedule(void) +{ + sched_update_worker(current); + lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0)); +} + static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) { if (pi_task) @@ -7187,9 +7214,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio; - struct rq_flags rf; struct rq *rq; + int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; @@ -7197,7 +7223,9 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &rf); + CLASS(task_rq_lock, rq_guard)(p); + rq = rq_guard.rq; + update_rq_clock(rq); /* @@ -7208,8 +7236,9 @@ void set_user_nice(struct task_struct *p, long nice) */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; + return; } + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -7232,9 +7261,6 @@ void set_user_nice(struct task_struct *p, long nice) * lowered its priority, then reschedule its CPU: */ p->sched_class->prio_changed(rq, p, old_prio); - -out_unlock: - task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -7507,6 +7533,21 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? 
find_task_by_vpid(pid) : current; } +static struct task_struct *find_get_task(pid_t pid) +{ + struct task_struct *p; + guard(rcu)(); + + p = find_process_by_pid(pid); + if (likely(p)) + get_task_struct(p); + + return p; +} + +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), + find_get_task(pid), pid_t pid) + /* * sched_setparam() passes in -1 for its policy, to let the functions * it calls know not to change it. @@ -7544,14 +7585,11 @@ static void __setscheduler_params(struct task_struct *p, static bool check_same_owner(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred; - bool match; + guard(rcu)(); - rcu_read_lock(); pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; + return (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); } /* @@ -7963,27 +8001,17 @@ static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - struct task_struct *p; - int retval; if (!param || pid < 0) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); - - if (likely(p)) { - retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); - } + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - return retval; + return sched_setscheduler(p, policy, &lparam); } /* @@ -8079,7 +8107,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags) { struct sched_attr attr; - struct task_struct *p; int retval; if (!uattr || pid < 0 || flags) @@ -8094,21 +8121,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) attr.sched_policy = SETPARAM_POLICY; - rcu_read_lock(); - retval = 
-ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - if (likely(p)) { - if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); - retval = sched_setattr(p, &attr); - put_task_struct(p); - } + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); - return retval; + return sched_setattr(p, &attr); } /** @@ -8126,16 +8146,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); + guard(rcu)(); p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy - | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (!retval) { + retval = p->policy; + if (p->sched_reset_on_fork) + retval |= SCHED_RESET_ON_FORK; } - rcu_read_unlock(); return retval; } @@ -8156,30 +8177,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - if (task_has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - rcu_read_unlock(); + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; + } /* * This one might sleep, we cannot do it with a spinlock held ... */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; + return copy_to_user(param, &lp, sizeof(*param)) ? 
-EFAULT : 0; } /* @@ -8239,46 +8253,38 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - kattr.sched_policy = p->policy; - if (p->sched_reset_on_fork) - kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); - kattr.sched_flags &= SCHED_FLAG_ALL; + kattr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK - /* - * This could race with another potential updater, but this is fine - * because it'll correctly read the old or the new value. We don't need - * to guarantee who wins the race as long as it doesn't return garbage. - */ - kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif - - rcu_read_unlock(); + } return sched_attr_copy_to_user(uattr, &kattr, usize); - -out_unlock: - rcu_read_unlock(); - return retval; } #ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { - int ret = 0; - /* * If the task isn't a deadline task or admission control is * disabled then we don't care about affinity changes. 
@@ -8292,11 +8298,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) * tasks allowed to run on all the CPUs in the task's * root_domain. */ - rcu_read_lock(); + guard(rcu)(); if (!cpumask_subset(task_rq(p)->rd->span, mask)) - ret = -EBUSY; - rcu_read_unlock(); - return ret; + return -EBUSY; + + return 0; } #endif @@ -8366,39 +8372,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { struct affinity_context ac; struct cpumask *user_mask; - struct task_struct *p; int retval; - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } + if (p->flags & PF_NO_SETAFFINITY) + return -EINVAL; if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - retval = -EPERM; - goto out_put_task; - } - rcu_read_unlock(); + guard(rcu)(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) + return -EPERM; } retval = security_task_setscheduler(p); if (retval) - goto out_put_task; + return retval; /* * With non-SMP configs, user_cpus_ptr/user_mask isn't used and @@ -8408,8 +8399,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (user_mask) { cpumask_copy(user_mask, in_mask); } else if (IS_ENABLED(CONFIG_SMP)) { - retval = -ENOMEM; - goto out_put_task; + return -ENOMEM; } ac = (struct affinity_context){ @@ -8421,8 +8411,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) retval = __sched_setaffinity(p, &ac); kfree(ac.user_mask); -out_put_task: - put_task_struct(p); return retval; } @@ -8464,28 +8452,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; - unsigned long flags; int 
retval; - rcu_read_lock(); - - retval = -ESRCH; + guard(rcu)(); p = find_process_by_pid(pid); if (!p) - goto out_unlock; + return -ESRCH; retval = security_task_getscheduler(p); if (retval) - goto out_unlock; + return retval; - raw_spin_lock_irqsave(&p->pi_lock, flags); + guard(raw_spinlock_irqsave)(&p->pi_lock); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - -out_unlock: - rcu_read_unlock(); - return retval; + return 0; } /** @@ -8932,55 +8913,46 @@ int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; - unsigned long flags; int yielded = 0; - local_irq_save(flags); - rq = this_rq(); + scoped_guard (irqsave) { + rq = this_rq(); again: - p_rq = task_rq(p); - /* - * If we're the only runnable task on the rq and target rq also - * has only one task, there's absolutely no point in yielding. - */ - if (rq->nr_running == 1 && p_rq->nr_running == 1) { - yielded = -ESRCH; - goto out_irq; - } + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) + return -ESRCH; - double_rq_lock(rq, p_rq); - if (task_rq(p) != p_rq) { - double_rq_unlock(rq, p_rq); - goto again; - } + guard(double_rq_lock)(rq, p_rq); + if (task_rq(p) != p_rq) + goto again; - if (!curr->sched_class->yield_to_task) - goto out_unlock; + if (!curr->sched_class->yield_to_task) + return 0; - if (curr->sched_class != p->sched_class) - goto out_unlock; + if (curr->sched_class != p->sched_class) + return 0; - if (task_on_cpu(p_rq, p) || !task_is_running(p)) - goto out_unlock; + if (task_on_cpu(p_rq, p) || !task_is_running(p)) + return 0; - yielded = curr->sched_class->yield_to_task(rq, p); - if (yielded) { - schedstat_inc(rq->yld_count); - /* - * Make p's CPU reschedule; pick_next_entity takes care of - * fairness. 
- */ - if (preempt && rq != p_rq) - resched_curr(p_rq); + yielded = curr->sched_class->yield_to_task(rq, p); + if (yielded) { + schedstat_inc(rq->yld_count); + /* + * Make p's CPU reschedule; pick_next_entity + * takes care of fairness. + */ + if (preempt && rq != p_rq) + resched_curr(p_rq); + } } -out_unlock: - double_rq_unlock(rq, p_rq); -out_irq: - local_irq_restore(flags); - - if (yielded > 0) + if (yielded) schedule(); return yielded; @@ -9083,38 +9055,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { - struct task_struct *p; - unsigned int time_slice; - struct rq_flags rf; - struct rq *rq; + unsigned int time_slice = 0; int retval; if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; + scoped_guard (rcu) { + struct task_struct *p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - rq = task_rq_lock(p, &rf); - time_slice = 0; - if (p->sched_class->get_rr_interval) - time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &rf); + scoped_guard (task_rq_lock, p) { + struct rq *rq = scope.rq; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); + } + } - rcu_read_unlock(); jiffies_to_timespec64(time_slice, t); return 0; - -out_unlock: - rcu_read_unlock(); - return retval; } /** @@ -9173,9 +9137,9 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", - free, task_pid_nr(p), ppid, - read_task_thread_flags(p)); + pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n", + free, task_pid_nr(p), task_tgid_nr(p), + ppid, read_task_thread_flags(p)); 
print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -9269,7 +9233,7 @@ void __init init_idle(struct task_struct *idle, int cpu) * PF_KTHREAD should already be set at this point; regardless, make it * look like a proper per-CPU kthread. */ - idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; + idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); #ifdef CONFIG_SMP @@ -9505,9 +9469,11 @@ static void balance_push(struct rq *rq) * Temporarily drop rq->lock such that we can wake-up the stop task. * Both preemption and IRQs are still disabled. */ + preempt_disable(); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, this_cpu_ptr(&push_work)); + preempt_enable(); /* * At this point need_resched() is true and we'll take the loop in * schedule(). The next pick is obviously going to be the stop task @@ -10013,7 +9979,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; @@ -10022,8 +9988,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -10498,17 +10462,18 @@ void sched_move_task(struct task_struct *tsk) int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct task_group *group; - struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &rf); + CLASS(task_rq_lock, rq_guard)(tsk); + rq = rq_guard.rq; + /* * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous * group changes. 
*/ group = sched_get_task_group(tsk); if (group == tsk->sched_task_group) - goto unlock; + return; update_rq_clock(rq); @@ -10533,9 +10498,6 @@ void sched_move_task(struct task_struct *tsk) */ resched_curr(rq); } - -unlock: - task_rq_unlock(rq, tsk, &rf); } static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -10572,11 +10534,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); cpu_util_update_eff(css); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); #endif return 0; @@ -10727,8 +10687,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, static_branch_enable(&sched_uclamp_used); - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); tg = css_tg(of_css(of)); if (tg->uclamp_req[clamp_id].value != req.util) @@ -10743,9 +10703,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, /* Update effective clamps to track the most restrictive value */ cpu_util_update_eff(of_css(of)); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); - return nbytes; } @@ -10771,10 +10728,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf, u64 percent; u32 rem; - rcu_read_lock(); - tg = css_tg(seq_css(sf)); - util_clamp = tg->uclamp_req[clamp_id].value; - rcu_read_unlock(); + scoped_guard (rcu) { + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + } if (util_clamp == SCHED_CAPACITY_SCALE) { seq_puts(sf, "max\n"); @@ -10865,11 +10822,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). 
*/ - cpus_read_lock(); - mutex_lock(&cfs_constraints_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); if (ret) - goto out_unlock; + return ret; runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; @@ -10879,39 +10837,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, */ if (runtime_enabled && !runtime_was_enabled) cfs_bandwidth_usage_inc(); - raw_spin_lock_irq(&cfs_b->lock); - cfs_b->period = ns_to_ktime(period); - cfs_b->quota = quota; - cfs_b->burst = burst; - __refill_cfs_bandwidth_runtime(cfs_b); + scoped_guard (raw_spinlock_irq, &cfs_b->lock) { + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + cfs_b->burst = burst; - /* Restart the period timer (if active) to handle new period expiry: */ - if (runtime_enabled) - start_cfs_bandwidth(cfs_b); + __refill_cfs_bandwidth_runtime(cfs_b); - raw_spin_unlock_irq(&cfs_b->lock); + /* + * Restart the period timer (if active) to handle new + * period expiry: + */ + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); + } for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; - struct rq_flags rf; - rq_lock_irq(rq, &rf); + guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - rq_unlock_irq(rq, &rf); } + if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); -out_unlock: - mutex_unlock(&cfs_constraints_mutex); - cpus_read_unlock(); - return ret; + return 0; } static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) @@ -11096,7 +11053,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) { - int ret; struct cfs_schedulable_data data = { .tg = tg, .period = period, @@ -11108,11 +11064,8 @@ static int 
__cfs_schedulable(struct task_group *tg, u64 period, u64 quota) do_div(data.quota, NSEC_PER_USEC); } - rcu_read_lock(); - ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); } static int cpu_cfs_stat_show(struct seq_file *sf, void *v) @@ -11717,14 +11670,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, * are not the last task to be migrated from this cpu for this mm, so * there is no need to move src_cid to the destination cpu. */ - rcu_read_lock(); + guard(rcu)(); src_task = rcu_dereference(src_rq->curr); if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); t->last_mm_cid = -1; return -1; } - rcu_read_unlock(); return src_cid; } @@ -11768,18 +11719,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, * the lazy-put flag, this task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; + scoped_guard (rcu) { + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + /* + * We observed an active task for this mm, there is therefore + * no point in moving this cid to the destination cpu. + */ + t->last_mm_cid = -1; + return -1; + } } - rcu_read_unlock(); /* * The src_cid is unused, so it can be unset. 
@@ -11852,7 +11802,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ { struct rq *rq = cpu_rq(cpu); struct task_struct *t; - unsigned long flags; int cid, lazy_cid; cid = READ_ONCE(pcpu_cid->cid); @@ -11887,23 +11836,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ * the lazy-put flag, that task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) { - rcu_read_unlock(); - return; + scoped_guard (rcu) { + t = rcu_dereference(rq->curr); + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) + return; } - rcu_read_unlock(); /* * The cid is unused, so it can be unset. * Disable interrupts to keep the window of cid ownership without rq * lock small. */ - local_irq_save(flags); - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); - local_irq_restore(flags); + scoped_guard (irqsave) { + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + __mm_cid_put(mm, cid); + } } static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) @@ -11925,14 +11872,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) * snapshot associated with this cid if an active task using the mm is * observed on this rq. 
*/ - rcu_read_lock(); - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - rcu_read_unlock(); - return; + scoped_guard (rcu) { + curr = rcu_dereference(rq->curr); + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { + WRITE_ONCE(pcpu_cid->time, rq_clock); + return; + } } - rcu_read_unlock(); if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) return; @@ -12026,7 +11972,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12034,7 +11979,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -12044,13 +11989,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_before_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12058,7 +12001,7 @@ void sched_mm_cid_before_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -12068,13 +12011,11 @@ void sched_mm_cid_before_execve(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_after_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12082,16 +12023,16 @@ void sched_mm_cid_after_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); - preempt_enable_no_resched(); /* holding spinlock */ - 
WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); - rq_unlock_irqrestore(rq, &rf); + scoped_guard (rq_lock_irqsave, rq) { + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 1); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + } rseq_set_notify_resume(t); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 57c92d751bcd..95baa12a1029 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -131,7 +131,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (!dl_task_fits_capacity(p, cpu)) { cpumask_clear_cpu(cpu, later_mask); - cap = capacity_orig_of(cpu); + cap = arch_scale_cpu_capacity(cpu); if (cap > max_cap || (cpu == task_cpu(p) && cap == max_cap)) { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 4492608b7d7f..458d359f5991 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -350,7 +350,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, * Except when the rq is capped by uclamp_max. 
*/ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && - sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) { + sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && + !sg_policy->need_freq_update) { next_f = sg_policy->next_freq; /* Restore cached freq as next_freq has changed */ diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index a286e726eb4b..42c40cfdf836 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -101,6 +101,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); + cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 58b542bf2893..b28114478b82 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -132,7 +132,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) int i; for_each_cpu_and(i, mask, cpu_active_mask) - cap += capacity_orig_of(i); + cap += arch_scale_cpu_capacity(i); return cap; } @@ -144,7 +144,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) static inline unsigned long dl_bw_capacity(int i) { if (!sched_asym_cpucap_active() && - capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) { return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; } else { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -509,7 +509,6 @@ void init_dl_rq(struct dl_rq *dl_rq) /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; - dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else @@ -553,39 +552,6 @@ static inline void dl_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); } -static void update_dl_migration(struct dl_rq *dl_rq) -{ - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { - if 
(!dl_rq->overloaded) { - dl_set_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 1; - } - } else if (dl_rq->overloaded) { - dl_clear_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 0; - } -} - -static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory++; - - update_dl_migration(dl_rq); -} - -static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory--; - - update_dl_migration(dl_rq); -} - #define __node_2_pdl(node) \ rb_entry((node), struct task_struct, pushable_dl_tasks) @@ -594,6 +560,11 @@ static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); } +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. 
@@ -609,6 +580,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) __pushable_less); if (leftmost) rq->dl.earliest_dl.next = p->dl.deadline; + + if (!rq->dl.overloaded) { + dl_set_overload(rq); + rq->dl.overloaded = 1; + } } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -625,11 +601,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; RB_CLEAR_NODE(&p->pushable_dl_tasks); -} -static inline int has_pushable_dl_tasks(struct rq *rq) -{ - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); + if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) { + dl_clear_overload(rq); + rq->dl.overloaded = 0; + } } static int push_dl_task(struct rq *rq); @@ -763,7 +739,7 @@ static inline void deadline_queue_pull_task(struct rq *rq) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, struct rq *rq) @@ -1175,7 +1151,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -1504,7 +1480,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); - inc_dl_migration(dl_se, dl_rq); } static inline @@ -1518,7 +1493,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); - dec_dl_migration(dl_se, dl_rq); } static inline bool __dl_less(struct 
rb_node *a, const struct rb_node *b) @@ -1939,7 +1913,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { @@ -2291,9 +2265,6 @@ static int push_dl_task(struct rq *rq) struct rq *later_rq; int ret = 0; - if (!rq->dl.overloaded) - return 0; - next_task = pick_next_pushable_dl_task(rq); if (!next_task) return 0; @@ -2449,9 +2420,11 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); + preempt_enable(); raw_spin_rq_lock(this_rq); } } @@ -2652,7 +2625,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) deadline_queue_push_tasks(rq); #endif if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); } else { @@ -2721,7 +2694,7 @@ DEFINE_SCHED_CLASS(dl) = { .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, - .check_preempt_curr = check_preempt_curr_dl, + .wakeup_preempt = wakeup_preempt_dl, .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4c3d0d9f3db6..4580a450700e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -8,7 +8,7 @@ */ /* - * This allows printing both to /proc/sched_debug and + * This allows printing both to /sys/kernel/debug/sched/debug and * to the console */ #define SEQ_printf(m, x...) 
\ @@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) PU(rt_nr_running); -#ifdef CONFIG_SMP - PU(rt_nr_migratory); -#endif P(rt_throttled); PN(rt_time); PN(rt_runtime); @@ -748,7 +745,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) PU(dl_nr_running); #ifdef CONFIG_SMP - PU(dl_nr_migratory); dl_bw = &cpu_rq(cpu)->rd->dl_bw; #else dl_bw = &dl_rq->dl_bw; @@ -864,7 +860,6 @@ static void sched_debug_header(struct seq_file *m) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_base_slice); - P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cb225921bbca..8767988242ee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,8 +51,6 @@ #include <asm/switch_to.h> -#include <linux/sched/cond_resched.h> - #include "sched.h" #include "stats.h" #include "autogroup.h" @@ -78,12 +76,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; unsigned int sysctl_sched_base_slice = 750000ULL; static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; -/* - * After fork, child runs first. If set to 0 (default) then - * parent will (try to) run first. 
- */ -unsigned int sysctl_sched_child_runs_first __read_mostly; - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; int sched_thermal_decay_shift; @@ -145,13 +137,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", @@ -664,6 +649,10 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; } +/* + * Specifically: avg_runtime() + 0 must result in entity_eligible() := true + * For this to be so, the result of this function must have a left bias. + */ u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; @@ -677,8 +666,12 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) load += weight; } - if (load) + if (load) { + /* sign flips effective floor / ceil */ + if (avg < 0) + avg -= (load - 1); avg = div_s64(avg, load); + } return cfs_rq->min_vruntime + avg; } @@ -864,14 +857,16 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) * * Which allows an EDF like search on (sub)trees. */ -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL; + struct sched_entity *best_left = NULL; if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; + best = curr; /* * Once selected, run a task until it either becomes non-eligible or @@ -892,33 +887,75 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) } /* - * If this entity has an earlier deadline than the previous - * best, take this one. 
If it also has the earliest deadline - * of its subtree, we're done. + * Now we heap search eligible trees for the best (min_)deadline */ - if (!best || deadline_gt(deadline, best, se)) { + if (!best || deadline_gt(deadline, best, se)) best = se; - if (best->deadline == best->min_deadline) - break; - } /* - * If the earlest deadline in this subtree is in the fully - * eligible left half of our space, go there. + * Every se in a left branch is eligible, keep track of the + * branch with the best min_deadline */ + if (node->rb_left) { + struct sched_entity *left = __node_2_se(node->rb_left); + + if (!best_left || deadline_gt(min_deadline, best_left, left)) + best_left = left; + + /* + * min_deadline is in the left branch. rb_left and all + * descendants are eligible, so immediately switch to the second + * loop. + */ + if (left->min_deadline == se->min_deadline) + break; + } + + /* min_deadline is at this node, no need to look right */ + if (se->deadline == se->min_deadline) + break; + + /* else min_deadline is in the right branch. */ + node = node->rb_right; + } + + /* + * We ran into an eligible node which is itself the best. 
+ * (Or nr_running == 0 and both are NULL) + */ + if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0) + return best; + + /* + * Now best_left and all of its children are eligible, and we are just + * looking for deadline == min_deadline + */ + node = &best_left->run_node; + while (node) { + struct sched_entity *se = __node_2_se(node); + + /* min_deadline is the current node */ + if (se->deadline == se->min_deadline) + return se; + + /* min_deadline is in the left branch */ if (node->rb_left && __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { node = node->rb_left; continue; } + /* else min_deadline is in the right branch */ node = node->rb_right; } + return NULL; +} - if (!best || (curr && deadline_gt(deadline, best, curr))) - best = curr; +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +{ + struct sched_entity *se = __pick_eevdf(cfs_rq); - if (unlikely(!best)) { + if (!se) { struct sched_entity *left = __pick_first_entity(cfs_rq); if (left) { pr_err("EEVDF scheduling fail, picking leftmost\n"); @@ -926,7 +963,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) } } - return best; + return se; } #ifdef CONFIG_SCHED_DEBUG @@ -2847,19 +2884,7 @@ static void task_numa_placement(struct task_struct *p) } /* Cannot migrate task to CPU-less node */ - if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) { - int near_nid = max_nid; - int distance, near_distance = INT_MAX; - - for_each_node_state(nid, N_CPU) { - distance = node_distance(max_nid, nid); - if (distance < near_distance) { - near_nid = nid; - near_distance = distance; - } - } - max_nid = near_nid; - } + max_nid = numa_nearest_node(max_nid, N_CPU); if (ng) { numa_group_count_active_nodes(ng); @@ -3130,7 +3155,7 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } -static bool vma_is_accessed(struct vm_area_struct *vma) +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long 
pids; /* @@ -3142,8 +3167,20 @@ static bool vma_is_accessed(struct vm_area_struct *vma) if (READ_ONCE(current->mm->numa_scan_seq) < 2) return true; - pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1]; - return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids); + pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) + return true; + + /* + * Complete a scan that has already started regardless of PID access, or + * some VMAs may never be scanned in multi-threaded applications: + */ + if (mm->numa_scan_offset > vma->vm_start) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); + return true; + } + + return false; } #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) @@ -3163,6 +3200,8 @@ static void task_numa_work(struct callback_head *work) unsigned long nr_pte_updates = 0; long pages, virtpages; struct vma_iterator vmi; + bool vma_pids_skipped; + bool vma_pids_forced = false; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -3205,7 +3244,6 @@ static void task_numa_work(struct callback_head *work) */ p->node_stamp += 2 * TICK_NSEC; - start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ virtpages = pages * 8; /* Scan up to this much virtual space */ @@ -3215,6 +3253,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; + + /* + * VMAs are skipped if the current PID has not trapped a fault within + * the VMA recently. Allow scanning to be forced if there is no + * suitable VMA remaining. 
+ */ + vma_pids_skipped = false; + +retry_pids: + start = mm->numa_scan_offset; vma_iter_init(&vmi, mm, start); vma = vma_next(&vmi); if (!vma) { @@ -3227,6 +3275,7 @@ static void task_numa_work(struct callback_head *work) do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); continue; } @@ -3237,15 +3286,19 @@ static void task_numa_work(struct callback_head *work) * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); continue; + } /* * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!vma_is_accessible(vma)) + if (!vma_is_accessible(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); continue; + } /* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { @@ -3258,8 +3311,15 @@ static void task_numa_work(struct callback_head *work) msecs_to_jiffies(sysctl_numa_balancing_scan_delay); /* Reset happens after 4 times scan delay of scan start */ - vma->numab_state->next_pid_reset = vma->numab_state->next_scan + + vma->numab_state->pids_active_reset = vma->numab_state->next_scan + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + + /* + * Ensure prev_scan_seq does not match numa_scan_seq, + * to prevent VMAs being skipped prematurely on the + * first scan: + */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; } /* @@ -3267,23 +3327,35 @@ static void task_numa_work(struct callback_head *work) * delay the scan for new VMAs. 
*/ if (mm->numa_scan_seq && time_before(jiffies, - vma->numab_state->next_scan)) + vma->numab_state->next_scan)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); continue; + } + + /* RESET access PIDs regularly for old VMAs. */ + if (mm->numa_scan_seq && + time_after(jiffies, vma->numab_state->pids_active_reset)) { + vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); + vma->numab_state->pids_active[1] = 0; + } - /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) + /* Do not rescan VMAs twice within the same sequence. */ + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { + mm->numa_scan_offset = vma->vm_end; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); continue; + } /* - * RESET access PIDs regularly for old VMAs. Resetting after checking - * vma for recent access to avoid clearing PID info before access.. + * Do not scan the VMA if task has not accessed it, unless no other + * VMA candidate exists. */ - if (mm->numa_scan_seq && - time_after(jiffies, vma->numab_state->next_pid_reset)) { - vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset + - msecs_to_jiffies(VMA_PID_RESET_PERIOD); - vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]); - vma->numab_state->access_pids[1] = 0; + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { + vma_pids_skipped = true; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); + continue; } do { @@ -3310,8 +3382,28 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); + + /* VMA scan is complete, do not scan until next sequence. */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; + + /* + * Only force scan within one VMA at a time, to limit the + * cost of scanning a potentially uninteresting VMA. 
+ */ + if (vma_pids_forced) + break; } for_each_vma(vmi, vma); + /* + * If no VMAs are remaining and VMAs were skipped due to the PID + * not accessing the VMA previously, then force a scan to ensure + * forward progress: + */ + if (!vma && !vma_pids_forced && vma_pids_skipped) { + vma_pids_forced = true; + goto retry_pids; + } + out: /* * It is possible to reach the end of the VMA list but the last few @@ -3605,6 +3697,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, */ deadline = div_s64(deadline * old_weight, weight); se->deadline = se->vruntime + deadline; + if (se != cfs_rq->curr) + min_deadline_cb_propagate(&se->run_node, NULL); } #ifdef CONFIG_SMP @@ -3888,7 +3982,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { - long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + long delta; + u64 now; /* * No need to update load_avg for root_task_group as it is not used. @@ -3896,9 +3991,19 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) if (cfs_rq->tg == &root_task_group) return; + /* + * For migration heavy workloads, access to tg->load_avg can be + * unbound. Limit the update rate to at most once per ms. 
+ */ + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) + return; + + delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; + cfs_rq->last_update_tg_load_avg = now; } } @@ -4572,22 +4677,6 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } -#ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) -{ - return clamp(task_util_est(p), uclamp_min, uclamp_max); -} -#else -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) -{ - return task_util_est(p); -} -#endif - static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -4691,7 +4780,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) + if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; /* @@ -4739,14 +4828,14 @@ static inline int util_fits_cpu(unsigned long util, return fits; /* - * We must use capacity_orig_of() for comparing against uclamp_min and + * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and * uclamp_max. We only care about capacity pressure (by using * capacity_of()) for comparing against the real util. * * If a task is boosted to 1024 for example, we don't want a tiny * pressure to skew the check whether it fits a CPU or not. 
* - * Similarly if a task is capped to capacity_orig_of(little_cpu), it + * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * * Only exception is for thermal pressure since it has a direct impact @@ -4758,7 +4847,7 @@ static inline int util_fits_cpu(unsigned long util, * For uclamp_max, we can tolerate a drop in performance level as the * goal is to cap the task. So it's okay if it's getting less. */ - capacity_orig = capacity_orig_of(cpu); + capacity_orig = arch_scale_cpu_capacity(cpu); capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* @@ -4878,7 +4967,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { - return true; + return !cfs_rq->nr_running; } #define UPDATE_TG 0x0 @@ -4919,10 +5008,12 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 vslice = calc_delta_fair(se->slice, se); - u64 vruntime = avg_vruntime(cfs_rq); + u64 vslice, vruntime = avg_vruntime(cfs_rq); s64 lag = 0; + se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); + /* * Due to how V is constructed as the weighted average of entities, * adding tasks with positive lag, or removing tasks with negative lag @@ -5211,7 +5302,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +pick_next_entity(struct cfs_rq *cfs_rq) { /* * Enabling NEXT_BUDDY will affect latency but not fairness. 
@@ -5755,13 +5846,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { - struct cfs_rq *local_unthrottle = NULL; int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; bool throttled = false; - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *tmp; struct rq_flags rf; struct rq *rq; + LIST_HEAD(local_unthrottle); rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -5777,11 +5868,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) if (!cfs_rq_throttled(cfs_rq)) goto next; -#ifdef CONFIG_SMP /* Already queued for async unthrottle */ if (!list_empty(&cfs_rq->throttled_csd_list)) goto next; -#endif /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); @@ -5798,11 +5887,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) { - if (cpu_of(rq) != this_cpu || - SCHED_WARN_ON(local_unthrottle)) + if (cpu_of(rq) != this_cpu) { unthrottle_cfs_rq_async(cfs_rq); - else - local_unthrottle = cfs_rq; + } else { + /* + * We currently only expect to be unthrottling + * a single cfs_rq locally. 
+ */ + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + list_add_tail(&cfs_rq->throttled_csd_list, + &local_unthrottle); + } } else { throttled = true; } @@ -5810,15 +5905,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) next: rq_unlock_irqrestore(rq, &rf); } - rcu_read_unlock(); - if (local_unthrottle) { - rq = cpu_rq(this_cpu); + list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, + throttled_csd_list) { + struct rq *rq = rq_of(cfs_rq); + rq_lock_irqsave(rq, &rf); - if (cfs_rq_throttled(local_unthrottle)) - unthrottle_cfs_rq(local_unthrottle); + + list_del_init(&cfs_rq->throttled_csd_list); + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + rq_unlock_irqrestore(rq, &rf); } + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + + rcu_read_unlock(); return throttled; } @@ -6148,9 +6251,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); -#ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); -#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -7108,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; struct sched_domain_shared *sd_share; - struct rq *this_rq = this_rq(); - int this = smp_processor_id(); - struct sched_domain *this_sd = NULL; - u64 time = 0; cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP) && !has_idle_core) { - u64 avg_cost, avg_idle, span_avg; - unsigned long now = jiffies; - - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - - /* - * If we're busy, the assumption that the last idle period - * predicts the future is flawed; age away the remaining - * predicted idle time. 
- */ - if (unlikely(this_rq->wake_stamp < now)) { - while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { - this_rq->wake_stamp++; - this_rq->wake_avg_idle >>= 1; - } - } - - avg_idle = this_rq->wake_avg_idle; - avg_cost = this_sd->avg_scan_cost + 1; - - span_avg = sd->span_weight * avg_idle; - if (span_avg > 4*avg_cost) - nr = div_u64(span_avg, avg_cost); - else - nr = 4; - - time = cpu_clock(this); - } - if (sched_feat(SIS_UTIL)) { sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); if (sd_share) { @@ -7158,6 +7223,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_group_span(sg)); + } + } + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -7165,7 +7254,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return i; } else { - if (!--nr) + if (--nr <= 0) return -1; idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) @@ -7176,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { - time = cpu_clock(this) - time; - - /* - * Account for the scan cost of wakeups against the average - * idle time. 
- */ - this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); - - update_avg(&this_sd->avg_scan_cost, time); - } - return idle_cpu; } @@ -7227,7 +7304,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) * Look for the CPU with best capacity. */ else if (fits < 0) - cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); + cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); /* * First, select CPU which fits better (-1 being better than 0). @@ -7267,7 +7344,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) bool has_idle_core = false; struct sched_domain *sd; unsigned long task_util, util_min, util_max; - int i, recent_used_cpu; + int i, recent_used_cpu, prev_aff = -1; /* * On asymmetric system, update task utilization because we will check @@ -7294,8 +7371,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) - return prev; + asym_fits_cpu(task_util, util_min, util_max, prev)) { + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(prev, target)) + return prev; + + prev_aff = prev; + } /* * Allow a per-cpu kthread to stack with the wakee if the @@ -7322,7 +7405,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { - return recent_used_cpu; + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(recent_used_cpu, target)) + return recent_used_cpu; + + } else { + recent_used_cpu = -1; } /* @@ -7363,6 +7452,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < 
nr_cpumask_bits) return i; + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). + */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } @@ -7469,7 +7569,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) util = max(util, util_est); } - return min(util, capacity_orig_of(cpu)); + return min(util, arch_scale_cpu_capacity(cpu)); } unsigned long cpu_util_cfs(int cpu) @@ -7621,11 +7721,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd, { unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); unsigned long busy_time = eenv->pd_busy_time; + unsigned long energy; if (dst_cpu >= 0) busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); - return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + + trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time); + + return energy; } /* @@ -7700,7 +7805,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = prev_cpu; sync_entity_load_avg(&p->se); - if (!uclamp_task_util(p, p_util_min, p_util_max)) + if (!task_util_est(p) && p_util_min == 0) goto unlock; eenv_task_busy_time(&eenv, p, prev_cpu); @@ -7708,11 +7813,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long util_min = p_util_min, util_max = p_util_max; unsigned long cpu_cap, cpu_thermal_cap, util; - unsigned long cur_delta, max_spare_cap = 0; + long prev_spare_cap = -1, max_spare_cap = -1; unsigned long rq_util_min, rq_util_max; - unsigned long prev_spare_cap = 0; + unsigned long 
cur_delta, base_energy; int max_spare_cap_cpu = -1; - unsigned long base_energy; int fits, max_fits = -1; cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -7775,7 +7879,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) prev_spare_cap = cpu_cap; prev_fits = fits; } else if ((fits > max_fits) || - ((fits == max_fits) && (cpu_cap > max_spare_cap))) { + ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -7787,7 +7891,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } } - if (max_spare_cap_cpu < 0 && prev_spare_cap == 0) + if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) continue; eenv_pd_busy_time(&eenv, cpus, p); @@ -7795,7 +7899,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) base_energy = compute_energy(&eenv, pd, cpus, p, -1); /* Evaluate the energy impact of using prev_cpu. */ - if (prev_spare_cap > 0) { + if (prev_spare_cap > -1) { prev_delta = compute_energy(&eenv, pd, cpus, p, prev_cpu); /* CPU utilization has changed */ @@ -7996,7 +8100,7 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; @@ -8009,7 +8113,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ /* * This is possible from callers such as attach_tasks(), in which we - * unconditionally check_preempt_curr() after an enqueue (which may have + * unconditionally wakeup_preempt() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. 
*/ @@ -8101,7 +8205,7 @@ again: goto again; } - se = pick_next_entity(cfs_rq, curr); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8164,7 +8268,7 @@ again: } } - se = pick_next_entity(cfs_rq, curr); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8203,7 +8307,7 @@ simple: put_prev_task(rq, prev); do { - se = pick_next_entity(cfs_rq, NULL); + se = pick_next_entity(cfs_rq); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8916,7 +9020,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } /* @@ -9256,8 +9360,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); - if (!capacity) capacity = 1; @@ -9333,7 +9435,7 @@ static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { return ((rq->cpu_capacity * sd->imbalance_pct) < - (rq->cpu_capacity_orig * 100)); + (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } /* @@ -9344,7 +9446,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) { return rq->misfit_task_load && - (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || check_cpu_capacity(rq, sd)); } @@ -9496,7 +9598,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) * can only do it if @group is an SMT group and has exactly on busy CPU. Larger * imbalances in the number of CPUS are dealt with in find_busiest_group(). 
* - * If we are balancing load within an SMT core, or at DIE domain level, always + * If we are balancing load within an SMT core, or at PKG domain level, always * proceed. * * Return: true if @env::dst_cpu can do with asym_packing load balance. False @@ -11195,13 +11297,15 @@ more_balance: busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_rq_unlock_irqrestore(busiest, flags); + preempt_disable(); + raw_spin_rq_unlock_irqrestore(busiest, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); } + preempt_enable(); } } else { sd->nr_balance_failed = 0; @@ -11509,36 +11613,39 @@ static inline int on_null_domain(struct rq *rq) #ifdef CONFIG_NO_HZ_COMMON /* - * idle load balancing details - * - When one of the busy CPUs notice that there may be an idle rebalancing + * NOHZ idle load balancing (ILB) details: + * + * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set + * + * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set * anywhere yet. */ - static inline int find_new_ilb(void) { - int ilb; const struct cpumask *hk_mask; + int ilb_cpu; hk_mask = housekeeping_cpumask(HK_TYPE_MISC); - for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { + for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { - if (ilb == smp_processor_id()) + if (ilb_cpu == smp_processor_id()) continue; - if (idle_cpu(ilb)) - return ilb; + if (idle_cpu(ilb_cpu)) + return ilb_cpu; } - return nr_cpu_ids; + return -1; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU + * SMP function call (IPI). 
+ * + * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -11552,8 +11659,7 @@ static void kick_ilb(unsigned int flags) nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); - - if (ilb_cpu >= nr_cpu_ids) + if (ilb_cpu < 0) return; /* @@ -11566,7 +11672,7 @@ static void kick_ilb(unsigned int flags) /* * This way we generate an IPI on the target CPU which - * is idle. And the softirq performing nohz idle load balance + * is idle, and the softirq performing NOHZ idle load balancing * will be run before returning from the IPI. */ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); @@ -11595,7 +11701,7 @@ static void nohz_balancer_kick(struct rq *rq) /* * None are in tickless mode and hence no need for NOHZ idle load - * balancing. + * balancing: */ if (likely(!atomic_read(&nohz.nr_cpus))) return; @@ -11617,9 +11723,8 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(rq->sd); if (sd) { /* - * If there's a CFS task and the current CPU has reduced - * capacity; kick the ILB to see if there's a better CPU to run - * on. + * If there's a runnable CFS task and the current CPU has reduced + * capacity, kick the ILB to see if there's a better CPU to run on: */ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; @@ -11671,11 +11776,11 @@ static void nohz_balancer_kick(struct rq *rq) if (sds) { /* * If there is an imbalance between LLC domains (IOW we could - * increase the overall cache use), we need some less-loaded LLC - * domain to pull some load. Likewise, we may need to spread + * increase the overall cache utilization), we need a less-loaded LLC + * domain to pull some load from. Likewise, we may need to spread * load within the current LLC domain (e.g. packed SMT cores but * other CPUs are idle). 
We can't really know from here how busy - * the others are - so just get a nohz balance going if it looks + * the others are - so just get a NOHZ balance going if it looks * like this LLC domain has tasks we could move. */ nr_busy = atomic_read(&sds->nr_busy_cpus); @@ -11945,8 +12050,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) } /* - * Check if we need to run the ILB for updating blocked load before entering - * idle state. + * Check if we need to directly run the ILB for updating blocked load before + * entering idle state. Here we run ILB directly without issuing IPIs. + * + * Note that when this function is called, the tick may not yet be stopped on + * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and + * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates + * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle + * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is + * called from this function on (this) CPU that's not yet in the mask. That's + * OK because the goal of nohz_run_idle_balance() is to run ILB only for + * updating the blocked load of already idle CPUs without waking up one of + * those idle CPUs and outside the preempt disable / irq off phase of the local + * cpu about to enter idle, because it can take a long time. 
*/ void nohz_run_idle_balance(int cpu) { @@ -12391,7 +12507,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (p->prio > oldprio) resched_curr(rq); } else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -12493,7 +12609,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) if (task_current(rq, p)) resched_curr(rq); else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } } @@ -12852,7 +12968,7 @@ DEFINE_SCHED_CLASS(fair) = { .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .check_preempt_curr = check_preempt_wakeup, + .wakeup_preempt = check_preempt_wakeup_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f770168230ae..a3ddf84de430 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
*/ -SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) /* diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 342f58a329f5..565f8374ddbb 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -373,6 +373,7 @@ EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { + current->flags |= PF_IDLE; arch_cpu_idle_prepare(); cpuhp_online_idle(state); while (1) @@ -400,7 +401,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) { resched_curr(rq); } @@ -481,7 +482,7 @@ DEFINE_SCHED_CLASS(idle) = { /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, - .check_preempt_curr = check_preempt_curr_idle, + .wakeup_preempt = wakeup_preempt_idle, .pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 0f310768260c..63b6cf898220 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Per Entity Load Tracking + * Per Entity Load Tracking (PELT) * * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1d0f634725a6..7b4aa5809c0f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -434,14 +434,13 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value) return growth; } -static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, +static void update_triggers(struct psi_group *group, u64 now, enum psi_aggregators aggregator) { struct psi_trigger *t; u64 *total = group->total[aggregator]; struct list_head *triggers; u64 *aggregator_total; - *update_total = false; if (aggregator == PSI_AVGS) { triggers 
= &group->avg_triggers; @@ -471,14 +470,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, * events without dropping any). */ if (new_stall) { - /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. - */ - *update_total = true; - /* Calculate growth since last update */ growth = window_update(&t->win, now, total[t->state]); if (!t->pending_event) { @@ -503,8 +494,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, /* Reset threshold breach flag once event got generated */ t->pending_event = false; } - - return now + group->rtpoll_min_period; } static u64 update_averages(struct psi_group *group, u64 now) @@ -565,7 +554,6 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; u32 changed_states; - bool update_total; u64 now; dwork = to_delayed_work(work); @@ -584,7 +572,7 @@ static void psi_avgs_work(struct work_struct *work) * go - see calc_avgs() and missed_periods. */ if (now >= group->avg_next_update) { - update_triggers(group, now, &update_total, PSI_AVGS); + update_triggers(group, now, PSI_AVGS); group->avg_next_update = update_averages(group, now); } @@ -608,7 +596,7 @@ static void init_rtpoll_triggers(struct psi_group *group, u64 now) group->rtpoll_next_update = now + group->rtpoll_min_period; } -/* Schedule polling if it's not already scheduled or forced. */ +/* Schedule rtpolling if it's not already scheduled or forced. 
*/ static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay, bool force) { @@ -640,7 +628,6 @@ static void psi_rtpoll_work(struct psi_group *group) { bool force_reschedule = false; u32 changed_states; - bool update_total; u64 now; mutex_lock(&group->rtpoll_trigger_lock); @@ -649,37 +636,37 @@ static void psi_rtpoll_work(struct psi_group *group) if (now > group->rtpoll_until) { /* - * We are either about to start or might stop polling if no - * state change was recorded. Resetting poll_scheduled leaves + * We are either about to start or might stop rtpolling if no + * state change was recorded. Resetting rtpoll_scheduled leaves * a small window for psi_group_change to sneak in and schedule - * an immediate poll_work before we get to rescheduling. One - * potential extra wakeup at the end of the polling window - * should be negligible and polling_next_update still keeps + * an immediate rtpoll_work before we get to rescheduling. One + * potential extra wakeup at the end of the rtpolling window + * should be negligible and rtpoll_next_update still keeps * updates correctly on schedule. */ atomic_set(&group->rtpoll_scheduled, 0); /* - * A task change can race with the poll worker that is supposed to + * A task change can race with the rtpoll worker that is supposed to * report on it. 
To avoid missing events, ensure ordering between - * poll_scheduled and the task state accesses, such that if the poll - * worker misses the state update, the task change is guaranteed to - * reschedule the poll worker: + * rtpoll_scheduled and the task state accesses, such that if the + * rtpoll worker misses the state update, the task change is + * guaranteed to reschedule the rtpoll worker: * - * poll worker: - * atomic_set(poll_scheduled, 0) + * rtpoll worker: + * atomic_set(rtpoll_scheduled, 0) * smp_mb() * LOAD states * * task change: * STORE states - * if atomic_xchg(poll_scheduled, 1) == 0: - * schedule poll worker + * if atomic_xchg(rtpoll_scheduled, 1) == 0: + * schedule rtpoll worker * * The atomic_xchg() implies a full barrier. */ smp_mb(); } else { - /* Polling window is not over, keep rescheduling */ + /* The rtpolling window is not over, keep rescheduling */ force_reschedule = true; } @@ -687,7 +674,7 @@ static void psi_rtpoll_work(struct psi_group *group) collect_percpu_times(group, PSI_POLL, &changed_states); if (changed_states & group->rtpoll_states) { - /* Initialize trigger windows when entering polling mode */ + /* Initialize trigger windows when entering rtpolling mode */ if (now > group->rtpoll_until) init_rtpoll_triggers(group, now); @@ -706,10 +693,12 @@ static void psi_rtpoll_work(struct psi_group *group) } if (now >= group->rtpoll_next_update) { - group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL); - if (update_total) + if (changed_states & group->rtpoll_states) { + update_triggers(group, now, PSI_POLL); memcpy(group->rtpoll_total, group->total[PSI_POLL], sizeof(group->rtpoll_total)); + } + group->rtpoll_next_update = now + group->rtpoll_min_period; } psi_schedule_rtpoll_work(group, @@ -1009,6 +998,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) struct psi_group_cpu *groupc; u64 now; + if (static_branch_likely(&psi_disabled)) + return; + if (!task->pid) return; diff --git 
a/kernel/sched/rt.c b/kernel/sched/rt.c index 0597ba0f85ff..6aaf0a3d6081 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -16,7 +16,7 @@ struct rt_bandwidth def_rt_bandwidth; * period over which we measure -rt task CPU usage in us. * default: 1s */ -unsigned int sysctl_sched_rt_period = 1000000; +int sysctl_sched_rt_period = 1000000; /* * part of the period that we allow rt tasks to run in us. @@ -34,9 +34,11 @@ static struct ctl_table sched_rt_sysctls[] = { { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sched_rt_runtime_us", @@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_NEG_ONE, + .extra2 = (void *)&sysctl_sched_rt_period, }, { .procname = "sched_rr_timeslice_ms", @@ -143,7 +147,6 @@ void init_rt_rq(struct rt_rq *rt_rq) #if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; - rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); #endif /* CONFIG_SMP */ @@ -358,53 +361,6 @@ static inline void rt_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } -static void update_rt_migration(struct rt_rq *rt_rq) -{ - if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { - if (!rt_rq->overloaded) { - rt_set_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 1; - } - } else if (rt_rq->overloaded) { - rt_clear_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 0; - } -} - -static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) - 
rt_rq->rt_nr_migratory++; - - update_rt_migration(rt_rq); -} - -static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory--; - - update_rt_migration(rt_rq); -} - static inline int has_pushable_tasks(struct rq *rq) { return !plist_head_empty(&rq->rt.pushable_tasks); @@ -438,6 +394,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) /* Update the highest prio pushable task */ if (p->prio < rq->rt.highest_prio.next) rq->rt.highest_prio.next = p->prio; + + if (!rq->rt.overloaded) { + rt_set_overload(rq); + rq->rt.overloaded = 1; + } } static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) @@ -451,6 +412,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) rq->rt.highest_prio.next = p->prio; } else { rq->rt.highest_prio.next = MAX_RT_PRIO-1; + + if (rq->rt.overloaded) { + rt_clear_overload(rq); + rq->rt.overloaded = 0; + } } } @@ -464,16 +430,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) { } -static inline -void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - -static inline -void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - static inline void rt_queue_push_tasks(struct rq *rq) { } @@ -515,7 +471,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) min_cap = uclamp_eff_value(p, UCLAMP_MIN); max_cap = uclamp_eff_value(p, UCLAMP_MAX); - cpu_cap = capacity_orig_of(cpu); + cpu_cap = arch_scale_cpu_capacity(cpu); return cpu_cap >= min(min_cap, max_cap); } @@ -953,7 +909,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) /* * When we're idle and a woken (rt) task is - * throttled check_preempt_curr() will set + * throttled 
wakeup_preempt() will set * skip_update and the time between the wakeup * and this unthrottle will get accounted as * 'runtime'. @@ -1281,7 +1237,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); - inc_rt_migration(rt_se, rt_rq); inc_rt_group(rt_se, rt_rq); } @@ -1294,7 +1249,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); - dec_rt_migration(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq); } @@ -1715,7 +1669,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { resched_curr(rq); @@ -2109,9 +2063,11 @@ retry: */ push_task = get_push_task(rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, push_cpu_stop, push_task, &rq->push_work); + preempt_enable(); raw_spin_rq_lock(rq); } @@ -2448,9 +2404,11 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); + preempt_enable(); raw_spin_rq_lock(this_rq); } } @@ -2702,7 +2660,7 @@ DEFINE_SCHED_CLASS(rt) = { .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, - .check_preempt_curr = check_preempt_curr_rt, + .wakeup_preempt = wakeup_preempt_rt, .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, @@ -2985,9 +2943,6 @@ static int sched_rt_global_constraints(void) #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) { - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - if ((sysctl_sched_rt_runtime != 
RUNTIME_INF) && ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || ((u64)sysctl_sched_rt_runtime * @@ -3018,7 +2973,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_validate(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 04846272409c..2e5a95486a42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -74,15 +74,6 @@ #include "../workqueue_internal.h" -#ifdef CONFIG_CGROUP_SCHED -#include <linux/cgroup.h> -#include <linux/psi.h> -#endif - -#ifdef CONFIG_SCHED_DEBUG -# include <linux/static_key.h> -#endif - #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> # include <asm/paravirt_api_clock.h> @@ -109,14 +100,12 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; -extern unsigned int sysctl_sched_child_runs_first; - extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); extern void call_trace_sched_update_nr_running(struct rq *rq, int count); -extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; extern int sched_rr_timeslice; @@ -594,6 +583,7 @@ struct cfs_rq { } removed; #ifdef CONFIG_FAIR_GROUP_SCHED + u64 last_update_tg_load_avg; unsigned long tg_load_avg_contrib; long propagate; long prop_runnable_sum; @@ -644,9 +634,7 @@ struct cfs_rq { int throttled; int throttle_count; struct list_head throttled_list; -#ifdef CONFIG_SMP struct list_head throttled_csd_list; -#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -675,8 +663,6 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - unsigned int rt_nr_migratory; - unsigned int 
rt_nr_total; int overloaded; struct plist_head pushable_tasks; @@ -721,7 +707,6 @@ struct dl_rq { u64 next; } earliest_dl; - unsigned int dl_nr_migratory; int overloaded; /* @@ -963,10 +948,6 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1048,7 +1029,6 @@ struct rq { struct sched_domain __rcu *sd; unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; struct balance_callback *balance_callback; @@ -1079,9 +1059,6 @@ struct rq { u64 idle_stamp; u64 avg_idle; - unsigned long wake_stamp; - u64 wake_avg_idle; - /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; @@ -1658,6 +1635,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } +DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, + _T->rq = task_rq_lock(_T->lock, &_T->rf), + task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) @@ -1868,11 +1850,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; static __always_inline bool sched_asym_cpucap_active(void) { @@ -2239,7 +2223,7 @@ struct sched_class { void 
(*yield_task) (struct rq *rq); bool (*yield_to_task)(struct rq *rq, struct task_struct *p); - void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); struct task_struct *(*pick_next_task)(struct rq *rq); @@ -2513,7 +2497,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT #define SCHED_NR_MIGRATE_BREAK 8 @@ -2977,11 +2961,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP -static inline unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} - /** * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency @@ -3219,6 +3198,8 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } +extern struct cpufreq_governor schedutil_gov; + #else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 85590599b4d6..6cf7304e6449 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SMP */ static void -check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) { /* we're never preempted */ } @@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = { .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, - .check_preempt_curr = check_preempt_curr_stop, + .wakeup_preempt = wakeup_preempt_stop, .pick_next_task = pick_next_task_stop, .put_prev_task = put_prev_task_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05a5bc678c08..10d1391e7416 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -212,6 +212,69 @@ static unsigned int sysctl_sched_energy_aware = 1; static DEFINE_MUTEX(sched_energy_mutex); static bool sched_energy_update; +static bool sched_is_eas_possible(const struct cpumask *cpu_mask) +{ + bool any_asym_capacity = false; + struct cpufreq_policy *policy; + struct cpufreq_governor *gov; + int i; + + /* EAS is enabled for asymmetric CPU capacity topologies. 
*/ + for_each_cpu(i, cpu_mask) { + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) { + any_asym_capacity = true; + break; + } + } + if (!any_asym_capacity) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* EAS definitely does *not* handle SMT */ + if (sched_smt_active()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, SMT is not supported\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + if (!arch_scale_freq_invariant()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* Do not attempt EAS if schedutil is not being used. */ + for_each_cpu(i, cpu_mask) { + policy = cpufreq_cpu_get(i); + if (!policy) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", + cpumask_pr_args(cpu_mask), i); + } + return false; + } + gov = policy->governor; + cpufreq_cpu_put(policy); + if (gov != &schedutil_gov) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + } + + return true; +} + void rebuild_sched_domains_energy(void) { mutex_lock(&sched_energy_mutex); @@ -230,6 +293,15 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write, if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!sched_is_eas_possible(cpu_active_mask)) { + if (write) { + return -EOPNOTSUPP; + } else { + *lenp = 0; + return 0; + } + } + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); @@ -348,103 +420,33 @@ static void sched_energy_set(bool has_eas) * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. * 3. no SMT is detected. - * 4. 
the EM complexity is low enough to keep scheduling overheads low; - * 5. schedutil is driving the frequency of all CPUs of the rd; - * 6. frequency invariance support is present; - * - * The complexity of the Energy Model is defined as: - * - * C = nr_pd * (nr_cpus + nr_ps) - * - * with parameters defined as: - * - nr_pd: the number of performance domains - * - nr_cpus: the number of CPUs - * - nr_ps: the sum of the number of performance states of all performance - * domains (for example, on a system with 2 performance domains, - * with 10 performance states each, nr_ps = 2 * 10 = 20). - * - * It is generally not a good idea to use such a model in the wake-up path on - * very complex platforms because of the associated scheduling overheads. The - * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 performance states each, for example. + * 4. schedutil is driving the frequency of all CPUs of the rd; + * 5. frequency invariance support is present; */ -#define EM_MAX_COMPLEXITY 2048 - -extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); + int i; struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; if (!sysctl_sched_energy_aware) goto free; - /* EAS is enabled for asymmetric CPU capacity topologies. 
*/ - if (!per_cpu(sd_asym_cpucapacity, cpu)) { - if (sched_debug()) { - pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", - cpumask_pr_args(cpu_map)); - } + if (!sched_is_eas_possible(cpu_map)) goto free; - } - - /* EAS definitely does *not* handle SMT */ - if (sched_smt_active()) { - pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", - cpumask_pr_args(cpu_map)); - goto free; - } - - if (!arch_scale_freq_invariant()) { - if (sched_debug()) { - pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", - cpumask_pr_args(cpu_map)); - } - goto free; - } for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) continue; - /* Do not attempt EAS if schedutil is not being used. */ - policy = cpufreq_cpu_get(i); - if (!policy) - goto free; - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (rd->pd) - pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_map)); - goto free; - } - /* Create the new pd and add it to the local list. */ tmp = pd_init(i); if (!tmp) goto free; tmp->next = pd; pd = tmp; - - /* - * Count performance domains and performance states for the - * complexity check. - */ - nr_pd++; - nr_ps += em_pd_nr_perf_states(pd->em_pd); - } - - /* Bail out if the Energy Model complexity is too high. 
*/ - if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { - WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", - cpumask_pr_args(cpu_map)); - goto free; } perf_domain_debug(cpu_map, pd); @@ -666,11 +668,14 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -691,6 +696,17 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + /* + * This assignment must come after the sd_llc_id assignment, as + * we want this id to equal the cluster id on cluster machines + * but the LLC id on non-cluster machines. + */ + per_cpu(sd_share_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -1117,7 +1133,7 @@ fail: * * - Simultaneous multithreading (SMT) * - Multi-Core Cache (MC) - * - Package (DIE) + * - Package (PKG) * * Where the last one more or less denotes everything up to a NUMA node.
* @@ -1139,13 +1155,13 @@ fail: * * CPU 0 1 2 3 4 5 6 7 * - * DIE [ ] + * PKG [ ] * MC [ ] [ ] * SMT [ ] [ ] [ ] [ ] * * - or - * - * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 * @@ -1548,6 +1564,7 @@ static struct cpumask ***sched_domains_numa_masks; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ + SD_CLUSTER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING) @@ -1679,7 +1696,7 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; @@ -2112,22 +2129,31 @@ static int hop_cmp(const void *a, const void *b) return -1; } -/* - * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu - * closest to @cpu from @cpumask. - * cpumask: cpumask to find a cpu from - * cpu: Nth cpu to find - * - * returns: cpu, or nr_cpu_ids when nothing found. +/** + * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU + * from @cpus to @cpu, taking into account distance + * from a given @node. + * @cpus: cpumask to find a cpu from + * @cpu: CPU to start searching + * @node: NUMA node to order CPUs by distance + * + * Return: cpu, or nr_cpu_ids when nothing found. 
*/ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { - struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu }; + struct __cmp_key k = { .cpus = cpus, .cpu = cpu }; struct cpumask ***hop_masks; int hop, ret = nr_cpu_ids; + if (node == NUMA_NO_NODE) + return cpumask_nth_and(cpu, cpus, cpu_online_mask); + rcu_read_lock(); + /* CPU-less node entries are uninitialized in sched_domains_numa_masks */ + node = numa_nearest_node(node, N_CPU); + k.node = node; + k.masks = rcu_dereference(sched_domains_numa_masks); if (!k.masks) goto unlock; @@ -2362,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2479,20 +2506,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + unsigned long capacity; + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); + capacity = arch_scale_cpu_capacity(i); /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, capacity); cpu_attach_domain(sd, d.rd, i); + + if (lowest_flag_domain(i, SD_CLUSTER)) + has_cluster = true; } rcu_read_unlock(); if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); @@ -2592,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) 
static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (static_branch_unlikely(&sched_cluster_active)) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); diff --git a/kernel/signal.c b/kernel/signal.c index 09019017d669..f2a5578326ad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2329,15 +2329,38 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, do_notify_parent_cldstop(current, false, why); /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. + * The previous do_notify_parent_cldstop() invocation woke the ptracer. + * On a PREEMPTION kernel this can result in a preemption requirement + * which will be fulfilled after read_unlock() and the ptracer will be + * put on the CPU. + * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for + * this task to wait in schedule(). If this task gets preempted then it + * remains enqueued on the runqueue. The ptracer will observe this and + * then sleep for a delay of one HZ tick. In the meantime this task + * gets scheduled, enters schedule() and will wait for the ptracer. * - * XXX: implement read_unlock_no_resched(). + * This preemption point is not bad from a correctness point of + * view but extends the runtime by one HZ tick time due to the + * ptracer's sleep. The preempt-disable section ensures that there + * will be no preemption between unlock and schedule() and so + * improves performance since the ptracer will observe that + * the tracee is scheduled out once it gets on the CPU. + * + * On PREEMPT_RT locking tasklist_lock does not disable preemption. + * Therefore the task can be preempted after do_notify_parent_cldstop() + * before unlocking tasklist_lock so there is no benefit in doing this.
+ * + * In fact disabling preemption is harmful on PREEMPT_RT because + * the spinlock_t in cgroup_enter_frozen() must not be acquired + * with preemption disabled due to the 'sleeping' spinlock + * substitution of RT. */ - preempt_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); - preempt_enable_no_resched(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable_no_resched(); schedule(); cgroup_leave_frozen(true); diff --git a/kernel/smp.c b/kernel/smp.c index 8455a53465af..f085ebcdf9e7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -127,7 +127,7 @@ send_call_function_ipi_mask(struct cpumask *mask) } static __always_inline void -csd_do_func(smp_call_func_t func, void *info, struct __call_single_data *csd) +csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd) { trace_csd_function_entry(func, csd); func(info); @@ -170,11 +170,13 @@ static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0444); +static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ +module_param(panic_on_ipistall, int, 0444); static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ -static void __csd_lock_record(struct __call_single_data *csd) +static void __csd_lock_record(call_single_data_t *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ @@ -189,13 +191,13 @@ static void __csd_lock_record(struct __call_single_data *csd) /* Or before unlock, as the case may be. 
*/ } -static __always_inline void csd_lock_record(struct __call_single_data *csd) +static __always_inline void csd_lock_record(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } -static int csd_lock_wait_getcpu(struct __call_single_data *csd) +static int csd_lock_wait_getcpu(call_single_data_t *csd) { unsigned int csd_type; @@ -210,7 +212,7 @@ static int csd_lock_wait_getcpu(struct __call_single_data *csd) * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) { int cpu = -1; int cpux; @@ -230,6 +232,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * } ts2 = sched_clock(); + /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns || csd_lock_timeout_ns == 0)) return false; @@ -243,9 +246,17 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ + /* How long since this CSD lock was stuck. */ + ts_delta = ts2 - ts0; pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", - firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts_delta, cpu, csd->func, csd->info); + /* + * If the CSD lock is still stuck after 5 minutes, it is unlikely + * to become unstuck. Use a signed comparison to avoid triggering + * on underflows when the TSC is out of sync between sockets. 
+ */ + BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC)); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), @@ -276,7 +287,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void __csd_lock_wait(struct __call_single_data *csd) +static void __csd_lock_wait(call_single_data_t *csd) { int bug_id = 0; u64 ts0, ts1; @@ -290,7 +301,7 @@ static void __csd_lock_wait(struct __call_single_data *csd) smp_acquire__after_ctrl_dep(); } -static __always_inline void csd_lock_wait(struct __call_single_data *csd) +static __always_inline void csd_lock_wait(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); @@ -300,17 +311,17 @@ static __always_inline void csd_lock_wait(struct __call_single_data *csd) smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #else -static void csd_lock_record(struct __call_single_data *csd) +static void csd_lock_record(call_single_data_t *csd) { } -static __always_inline void csd_lock_wait(struct __call_single_data *csd) +static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif -static __always_inline void csd_lock(struct __call_single_data *csd) +static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; @@ -323,7 +334,7 @@ static __always_inline void csd_lock(struct __call_single_data *csd) smp_wmb(); } -static __always_inline void csd_unlock(struct __call_single_data *csd) +static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); @@ -376,7 +387,7 @@ void 
__smp_call_single_queue(int cpu, struct llist_node *node) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, struct __call_single_data *csd) +static int generic_exec_single(int cpu, call_single_data_t *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; @@ -667,7 +678,7 @@ EXPORT_SYMBOL(smp_call_function_single); * * Return: %0 on success or negative errno value on error */ -int smp_call_function_single_async(int cpu, struct __call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9ed5ce989415..4f65824879ab 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, put_task_stack(tsk); return c.len; } +EXPORT_SYMBOL_GPL(stack_trace_save_tsk); /** * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array @@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, save_stack_trace_tsk(task, &trace); return trace.nr_entries; } +EXPORT_SYMBOL_GPL(stack_trace_save_tsk); /** * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e137c1385c56..9db51ea373b0 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,9 @@ COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); COND_SYSCALL_COMPAT(get_robust_list); COND_SYSCALL(futex_waitv); +COND_SYSCALL(futex_wake); +COND_SYSCALL(futex_wait); +COND_SYSCALL(futex_requeue); COND_SYSCALL(kexec_load); COND_SYSCALL_COMPAT(kexec_load); COND_SYSCALL(init_module); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 354a2d294f52..2b6585751891 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1983,7 +1983,7 @@ static struct ctl_table kern_table[] = { .data = 
&sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = perf_proc_update_handler, + .proc_handler = perf_event_max_sample_rate_handler, .extra1 = SYSCTL_ONE, }, { diff --git a/kernel/task_work.c b/kernel/task_work.c index 065e1ef8fc8d..95a7e1b7f1da 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -78,6 +78,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call + * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8d9f13d847f0..4657cb8e8b1f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -290,6 +290,17 @@ static int alarmtimer_suspend(struct device *dev) rtc_timer_cancel(rtc, &rtctimer); rtc_read_time(rtc, &tm); now = rtc_tm_to_ktime(tm); + + /* + * If the RTC alarm timer only supports a limited time offset, set the + * alarm time to the maximum supported value. + * The system may wake up earlier (possibly much earlier) than expected + * when the alarmtimer runs. This is the best the kernel can do if + * the alarmtimer exceeds the time that the rtc device can be programmed + * for. 
+ */ + min = rtc_bound_alarmtime(rtc, min); + now = ktime_add(now, min); /* Set alarm, if in the past reject suspend briefly to handle */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 87015e9deacc..be77b021e5d6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -4,7 +4,7 @@ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * - * No idle tick implementation for low and high resolution timers + * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ @@ -45,7 +45,7 @@ struct tick_sched *tick_get_tick_sched(int cpu) #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* - * The time, when the last jiffy update happened. Write access must hold + * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ @@ -60,13 +60,13 @@ static void tick_do_update_jiffies64(ktime_t now) ktime_t delta, nextp; /* - * 64bit can do a quick check without holding jiffies lock and + * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * - * 32bit cannot do that because the store of tick_next_period - * consists of two 32bit stores and the first store could move it - * to a random point in the future. + * 32-bit cannot do that because the store of 'tick_next_period' + * consists of two 32-bit stores, and the first store could be + * moved by the CPU to a random point in the future. 
*/ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) @@ -75,7 +75,7 @@ static void tick_do_update_jiffies64(ktime_t now) unsigned int seq; /* - * Avoid contention on jiffies_lock and protect the quick + * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. */ do { @@ -90,7 +90,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* - * Reevaluate with the lock held. Another CPU might have done the + * Re-evaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { @@ -114,25 +114,23 @@ static void tick_do_update_jiffies64(ktime_t now) TICK_NSEC); } - /* Advance jiffies to complete the jiffies_seq protected job */ + /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; - /* - * Keep the tick_next_period variable up to date. - */ + /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick - * check above and ensures that the update to jiffies_64 is - * not reordered vs. the store to tick_next_period, neither + * check above, and ensures that the update to 'jiffies_64' is + * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* - * A plain store is good enough on 32bit as the quick check + * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; @@ -140,7 +138,7 @@ static void tick_do_update_jiffies64(ktime_t now) /* * Release the sequence count. 
calc_global_load() below is not - * protected by it, but jiffies_lock needs to be held to prevent + * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); @@ -160,7 +158,8 @@ static ktime_t tick_init_jiffy_update(void) raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); - /* Did we start the jiffies update yet ? */ + + /* Have we started the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; @@ -175,8 +174,10 @@ static ktime_t tick_init_jiffy_update(void) last_jiffies_update = tick_next_period; } period = last_jiffies_update; + write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); + return period; } @@ -192,10 +193,10 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by - * jiffies_lock. + * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the - * tick_do_timer_cpu never relinquishes. + * 'tick_do_timer_cpu' CPU never relinquishes. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL @@ -205,12 +206,12 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) } #endif - /* Check, if the jiffies need an update */ + /* Check if jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); /* - * If jiffies update stalled for too long (timekeeper in stop_machine() + * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { @@ -234,10 +235,10 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long - * time. 
This happens on complete idle SMP systems while + * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do - * when we go busy again does not account too much ticks. + * when we go busy again does not account too many ticks. */ if (ts->tick_stopped) { touch_softlockup_watchdog_sched(); @@ -362,7 +363,7 @@ static void tick_nohz_kick_task(struct task_struct *tsk) /* * If the task is not running, run_posix_cpu_timers() - * has nothing to elapse, IPI can then be spared. + * has nothing to elapse, and an IPI can then be optimized out. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq @@ -425,7 +426,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep, /* * Set a global tick dependency. Used by perf events that rely on freq and - * by unstable clock. + * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { @@ -439,7 +440,7 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit) /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to - * manage events throttling. + * manage event-throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { @@ -455,7 +456,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { - /* Remote irq work not NMI-safe */ + /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } @@ -473,7 +474,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* - * Set a per-task tick dependency. RCU need this. Also posix CPU timers + * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. 
*/ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) @@ -546,7 +547,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* - * The tick_do_timer_cpu CPU handles housekeeping duty (unbound + * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ @@ -568,12 +569,12 @@ void __init tick_nohz_init(void) return; /* - * Full dynticks uses irq work to drive the tick rescheduling on safe - * locking contexts. But then we need irq work to raise its own - * interrupts to avoid circular dependency on the tick + * Full dynticks uses IRQ work to drive the tick rescheduling on safe + * locking contexts. But then we need IRQ work to raise its own + * interrupts to avoid circular dependency on the tick. */ if (!arch_irq_work_has_interrupt()) { - pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; @@ -643,7 +644,7 @@ bool tick_nohz_tick_stopped_cpu(int cpu) * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the - * CPU, which has the update task assigned is in a long sleep. + * CPU, which has the update task assigned, is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { @@ -726,7 +727,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, * counters if NULL. * * Return the cumulative idle time (since boot) for a given - * CPU, in microseconds. Note this is partially broken due to + * CPU, in microseconds. 
Note that this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. @@ -787,7 +788,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) } /* - * Reset to make sure next tick stop doesn't get fooled by past + * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; @@ -816,11 +817,11 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. - * Aside of that check whether the local timer softirq is - * pending. If so its a bad idea to call get_next_timer_interrupt() + * Aside of that, check whether the local timer softirq is + * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a - * minimal delta which brings us back to this place + * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || @@ -861,7 +862,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) /* * If this CPU is the one which had the do_timer() duty last, we limit - * the sleep time to the timekeeping max_deferment value. + * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); @@ -895,8 +896,8 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we - * don't drop this here the jiffies might be stale and - * do_timer() never invoked. 
Keep track of the fact that it + * don't drop this here, the jiffies might be stale and + * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ if (cpu == tick_do_timer_cpu) { @@ -906,7 +907,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) ts->do_timer_last = 0; } - /* Skip reprogram of event if its not changed */ + /* Skip reprogram of event if it's not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) @@ -919,11 +920,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) } /* - * nohz_stop_sched_tick can be called several times before - * the nohz_restart_sched_tick is called. This happens when + * nohz_stop_sched_tick() can be called several times before + * nohz_restart_sched_tick() is called. This happens when * interrupts arrive which do not cause a reschedule. In the * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick. + * the scheduler tick in nohz_restart_sched_tick(). */ if (!ts->tick_stopped) { calc_load_nohz_start(); @@ -985,9 +986,8 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); - /* - * Cancel the scheduled timer and restore the tick - */ + + /* Cancel the scheduled timer and restore the tick: */ ts->tick_stopped = 0; tick_nohz_restart(ts, now); } @@ -1019,11 +1019,11 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't - * reach here due to the need_resched() early check in can_stop_idle_tick(). + * reach this code due to the need_resched() early check in can_stop_idle_tick(). 
* * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, - * triggering the below since wakep_softirqd() is ignored. + * triggering the code below, since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) @@ -1044,7 +1044,7 @@ static bool report_idle_softirq(void) if (ratelimit >= 10) return false; - /* On RT, softirqs handling may be waiting on some lock */ + /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; @@ -1061,8 +1061,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by * the CPU which runs the tick timer next. If we don't drop - * this here the jiffies might be stale and do_timer() never - * invoked. + * this here, the jiffies might be stale and do_timer() never + * gets invoked. */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) @@ -1175,12 +1175,23 @@ void tick_nohz_idle_enter(void) } /** - * tick_nohz_irq_exit - update next tick event from interrupt exit + * tick_nohz_irq_exit - Notify the tick about IRQ exit + * + * A timer may have been added/modified/deleted either by the current IRQ, + * or by another place using this IRQ as a notification. This IRQ may have + * also updated the RCU callback list. These events may require a + * re-evaluation of the next tick. Depending on the context: + * + * 1) If the CPU is idle and no resched is pending, just proceed with idle + * time accounting. The next tick will be re-evaluated on the next idle + * loop iteration. + * + * 2) If the CPU is nohz_full: * - * When an interrupt fires while we are idle and it doesn't cause - * a reschedule, it may still add, modify or delete a timer, enqueue - * an RCU callback, etc... - * So we need to re-calculate and reprogram the next tick event. 
+ * 2.1) If there is any tick dependency, restart the tick if stopped. + * + * 2.2) If there is no tick dependency, (re-)evaluate the next tick and + * stop/update it accordingly. */ void tick_nohz_irq_exit(void) { @@ -1208,7 +1219,7 @@ bool tick_nohz_idle_got_tick(void) /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer - * or the tick, whatever that expires first. Note that, if the tick has been + * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled @@ -1252,7 +1263,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) return *delta_next; /* - * If the next highres timer to expire is earlier than next_event, the + * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, @@ -1296,9 +1307,9 @@ static void tick_nohz_account_idle_time(struct tick_sched *ts, if (vtime_accounting_enabled_this_cpu()) return; /* - * We stopped the tick in idle. Update process times would miss the - * time we slept as update_process_times does only a 1 tick - * accounting. Enforce that this is accounted to idle ! + * We stopped the tick in idle. update_process_times() would miss the + * time we slept, as it does only a 1 tick accounting. + * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* @@ -1330,11 +1341,20 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) } /** - * tick_nohz_idle_exit - restart the idle tick from the idle task + * tick_nohz_idle_exit - Update the tick upon idle task exit + * + * When the idle task exits, update the tick depending on the + * following situations: + * + * 1) If the CPU is not in nohz_full mode (most cases), then + * restart the tick. 
+ * + * 2) If the CPU is in nohz_full mode (corner case): + * 2.1) If the tick can be kept stopped (no tick dependencies) + * then re-evaluate the next tick and try to keep it stopped + * as long as possible. + * 2.2) If the tick has dependencies, restart the tick. * - * Restart the idle tick when the CPU is woken up from idle - * This also exit the RCU extended quiescent state. The CPU - * can use RCU again after this function is called. */ void tick_nohz_idle_exit(void) { @@ -1364,9 +1384,15 @@ void tick_nohz_idle_exit(void) } /* - * The nohz low res interrupt handler + * In low-resolution mode, the tick handler must be implemented directly + * at the clockevent level. hrtimer can't be used instead, because its + * infrastructure actually relies on the tick itself as a backend in + * low-resolution mode (see hrtimer_run_queues()). + * + * This low-resolution handler still makes use of some hrtimer APIs meanwhile + * for convenience with expiration calculation and forwarding. */ -static void tick_nohz_handler(struct clock_event_device *dev) +static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); @@ -1377,18 +1403,16 @@ static void tick_nohz_handler(struct clock_event_device *dev) tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); - if (unlikely(ts->tick_stopped)) { - /* - * The clockevent device is not reprogrammed, so change the - * clock event device to ONESHOT_STOPPED to avoid spurious - * interrupts on devices which might not be truly one shot. - */ - tick_program_event(KTIME_MAX, 1); - return; + /* + * In dynticks mode, tick reprogram is deferred: + * - to the idle task if in dynticks-idle + * - to IRQ exit if in full-dynticks. 
+ */ + if (likely(!ts->tick_stopped)) { + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) @@ -1402,7 +1426,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) } /** - * tick_nohz_switch_to_nohz - switch to nohz mode + * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { @@ -1412,12 +1436,12 @@ static void tick_nohz_switch_to_nohz(void) if (!tick_nohz_enabled) return; - if (tick_switch_to_oneshot(tick_nohz_handler)) + if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* - * Recycle the hrtimer in ts, so we can share the - * hrtimer_forward with the highres code. + * Recycle the hrtimer in 'ts', so we can share the + * hrtimer_forward_now() function with the highres code. */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); /* Get the next period */ @@ -1440,7 +1464,7 @@ static inline void tick_nohz_irq_enter(void) if (ts->idle_active) tick_nohz_stop_idle(ts, now); /* - * If all CPUs are idle. We may need to update a stale jiffies value. + * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). 
So we must make sure we have a @@ -1459,7 +1483,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * Called from irq_enter to notify about the possible interruption of idle() + * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { @@ -1475,7 +1499,7 @@ void tick_irq_enter(void) * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ -static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +static enum hrtimer_restart tick_nohz_highres_handler(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); @@ -1485,15 +1509,19 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) tick_sched_do_timer(ts, now); /* - * Do not call, when we are not in irq context and have - * no valid regs pointer + * Do not call when we are not in IRQ context and have + * no valid 'regs' pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; - /* No need to reprogram if we are in idle or full dynticks mode */ + /* + * In dynticks mode, tick reprogram is deferred: + * - to the idle task if in dynticks-idle + * - to IRQ exit if in full-dynticks. + */ if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; @@ -1520,16 +1548,14 @@ void tick_setup_sched_timer(void) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); - /* - * Emulate tick processing via per-CPU hrtimers: - */ + /* Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - ts->sched_timer.function = tick_sched_timer; + ts->sched_timer.function = tick_nohz_highres_handler; /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - /* Offset the tick to avert jiffies_lock contention. */ + /* Offset the tick to avert 'jiffies_lock' contention. 
*/ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); @@ -1579,10 +1605,10 @@ void tick_oneshot_notify(void) } /* - * Check, if a change happened, which makes oneshot possible. + * Check if a change happened, which makes oneshot possible. * - * Called cyclic from the hrtimer softirq (driven by the timer - * softirq) allow_nohz signals, that we can switch into low-res nohz + * Called cyclically from the hrtimer softirq (driven by the timer + * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ diff --git a/kernel/torture.c b/kernel/torture.c index b28b05bbef02..c72ab2d251f4 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -87,14 +87,15 @@ EXPORT_SYMBOL_GPL(verbose_torout_sleep); * nanosecond random fuzz. This function and its friends desynchronize * testing from the timer wheel. */ -int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp) +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, const enum hrtimer_mode mode, + struct torture_random_state *trsp) { ktime_t hto = baset_ns; if (trsp) hto += torture_random(trsp) % fuzzt_ns; set_current_state(TASK_IDLE); - return schedule_hrtimeout(&hto, HRTIMER_MODE_REL); + return schedule_hrtimeout(&hto, mode); } EXPORT_SYMBOL_GPL(torture_hrtimeout_ns); @@ -106,7 +107,7 @@ int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state { ktime_t baset_ns = baset_us * NSEC_PER_USEC; - return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_us); @@ -123,7 +124,7 @@ int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state fuzzt_ns = (u32)~0U; else fuzzt_ns = fuzzt_us * NSEC_PER_USEC; - return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); + return 
torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_ms); @@ -136,7 +137,7 @@ int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp) { ktime_t baset_ns = jiffies_to_nsecs(baset_j); - return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp); + return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies); @@ -153,7 +154,7 @@ int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state * fuzzt_ns = (u32)~0U; else fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC; - return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, HRTIMER_MODE_REL, trsp); } EXPORT_SYMBOL_GPL(torture_hrtimeout_s); @@ -520,9 +521,8 @@ static void torture_shuffle_task_unregister_all(void) * A special case is when shuffle_idle_cpu = -1, in which case we allow * the tasks to run on all CPUs. */ -static void torture_shuffle_tasks(void) +static void torture_shuffle_tasks(struct torture_random_state *trp) { - DEFINE_TORTURE_RANDOM(rand); struct shuffle_task *stp; cpumask_setall(shuffle_tmp_mask); @@ -543,7 +543,7 @@ static void torture_shuffle_tasks(void) mutex_lock(&shuffle_task_mutex); list_for_each_entry(stp, &shuffle_task_list, st_l) { - if (!random_shuffle || torture_random(&rand) & 0x1) + if (!random_shuffle || torture_random(trp) & 0x1) set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); } mutex_unlock(&shuffle_task_mutex); @@ -562,7 +562,7 @@ static int torture_shuffle(void *arg) VERBOSE_TOROUT_STRING("torture_shuffle task started"); do { torture_hrtimeout_jiffies(shuffle_interval, &rand); - torture_shuffle_tasks(); + torture_shuffle_tasks(&rand); torture_shutdown_absorb("torture_shuffle"); } while (!torture_must_stop()); torture_kthread_stopping("torture_shuffle"); @@ -673,7 +673,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) if (ssecs > 0) { shutdown_time = 
ktime_add(ktime_get(), ktime_set(ssecs, 0)); return torture_create_kthread(torture_shutdown, NULL, - shutdown_task); + shutdown_task); } return 0; } @@ -720,7 +720,7 @@ static void torture_shutdown_cleanup(void) * suddenly applied to or removed from the system. */ static struct task_struct *stutter_task; -static int stutter_pause_test; +static ktime_t stutter_till_abs_time; static int stutter; static int stutter_gap; @@ -730,30 +730,16 @@ static int stutter_gap; */ bool stutter_wait(const char *title) { - unsigned int i = 0; bool ret = false; - int spt; + ktime_t till_ns; cond_resched_tasks_rcu_qs(); - spt = READ_ONCE(stutter_pause_test); - for (; spt; spt = READ_ONCE(stutter_pause_test)) { - if (!ret && !rt_task(current)) { - sched_set_normal(current, MAX_NICE); - ret = true; - } - if (spt == 1) { - torture_hrtimeout_jiffies(1, NULL); - } else if (spt == 2) { - while (READ_ONCE(stutter_pause_test)) { - if (!(i++ & 0xffff)) - torture_hrtimeout_us(10, 0, NULL); - cond_resched(); - } - } else { - torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL); - } - torture_shutdown_absorb(title); + till_ns = READ_ONCE(stutter_till_abs_time); + if (till_ns && ktime_before(ktime_get(), till_ns)) { + torture_hrtimeout_ns(till_ns, 0, HRTIMER_MODE_ABS, NULL); + ret = true; } + torture_shutdown_absorb(title); return ret; } EXPORT_SYMBOL_GPL(stutter_wait); @@ -764,23 +750,16 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { - DEFINE_TORTURE_RANDOM(rand); - int wtime; + ktime_t till_ns; VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { - wtime = stutter; - if (stutter > 2) { - WRITE_ONCE(stutter_pause_test, 1); - wtime = stutter - 3; - torture_hrtimeout_jiffies(wtime, &rand); - wtime = 2; - } - WRITE_ONCE(stutter_pause_test, 2); - torture_hrtimeout_jiffies(wtime, NULL); + till_ns = ktime_add_ns(ktime_get(), + jiffies_to_nsecs(stutter)); + WRITE_ONCE(stutter_till_abs_time, till_ns); + 
torture_hrtimeout_jiffies(stutter - 1, NULL); } - WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) torture_hrtimeout_jiffies(stutter_gap, NULL); torture_shutdown_absorb("torture_stutter"); @@ -812,6 +791,13 @@ static void torture_stutter_cleanup(void) stutter_task = NULL; } +static void +torture_print_module_parms(void) +{ + pr_alert("torture module --- %s: disable_onoff_at_boot=%d ftrace_dump_at_shutdown=%d verbose_sleep_frequency=%d verbose_sleep_duration=%d random_shuffle=%d\n", + torture_type, disable_onoff_at_boot, ftrace_dump_at_shutdown, verbose_sleep_frequency, verbose_sleep_duration, random_shuffle); +} + /* * Initialize torture module. Please note that this is -not- invoked via * the usual module_init() mechanism, but rather by an explicit call from @@ -834,6 +820,7 @@ bool torture_init_begin(char *ttype, int v) torture_type = ttype; verbose = v; fullstop = FULLSTOP_DONTSTOP; + torture_print_module_parms(); return true; } EXPORT_SYMBOL_GPL(torture_init_begin); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index a7264b2c17ad..868008f56fec 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2853,6 +2853,17 @@ static int get_modules_for_addrs(struct module ***mods, unsigned long *addrs, u3 return arr.mods_cnt; } +static int addrs_check_error_injection_list(unsigned long *addrs, u32 cnt) +{ + u32 i; + + for (i = 0; i < cnt; i++) { + if (!within_error_injection_list(addrs[i])) + return -EINVAL; + } + return 0; +} + int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_kprobe_multi_link *link = NULL; @@ -2930,6 +2941,11 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr goto error; } + if (prog->kprobe_override && addrs_check_error_injection_list(addrs, cnt)) { + err = -EINVAL; + goto error; + } + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { err = -ENOMEM; @@ -3207,8 +3223,10 @@ int bpf_uprobe_multi_link_attach(const union 
bpf_attr *attr, struct bpf_prog *pr rcu_read_lock(); task = get_pid_task(find_vpid(pid), PIDTYPE_PID); rcu_read_unlock(); - if (!task) + if (!task) { + err = -ESRCH; goto error_path_put; + } } err = -ENOMEM; diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 3b21f4063258..881f90f0cbcf 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -189,7 +189,7 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) { int i, size; - if (num < 0) + if (num <= 0) return -EINVAL; if (!fp->exit_handler) { @@ -202,8 +202,8 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) size = fp->nr_maxactive; else size = num * num_possible_cpus() * 2; - if (size < 0) - return -E2BIG; + if (size <= 0) + return -EINVAL; fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); if (!fp->rethook) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a1651edc48d5..515cafdb18d9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -354,6 +354,11 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } +static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->page->commit); +} + static void free_buffer_page(struct buffer_page *bpage) { free_page((unsigned long)bpage->page); @@ -1132,6 +1137,9 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu, if (full) { poll_wait(filp, &work->full_waiters, poll_table); work->full_waiters_pending = true; + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full) + cpu_buffer->shortest_full = full; } else { poll_wait(filp, &work->waiters, poll_table); work->waiters_pending = true; @@ -2003,7 +2011,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) * Increment overrun to account for the lost events. 
*/ local_add(page_entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); } @@ -2367,11 +2375,6 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read); } -static __always_inline unsigned rb_page_commit(struct buffer_page *bpage) -{ - return local_read(&bpage->page->commit); -} - static struct ring_buffer_event * rb_iter_head_event(struct ring_buffer_iter *iter) { @@ -2517,7 +2520,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); - local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); + local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes); local_inc(&cpu_buffer->pages_lost); /* @@ -2660,9 +2663,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); - /* account for padding bytes */ - local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); - /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2676,7 +2676,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * write counter enough to allow another writer to slip * in on this page. * We put in a discarded commit instead, to make sure - * that this space is not used again. + * that this space is not used again, and this space will + * not be accounted into 'entries_bytes'. * * If we are less than the minimum size, we don't need to * worry about it. 
@@ -2701,6 +2702,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* time delta must be non zero */ event->time_delta = 1; + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + /* Make sure the padding is visible before the tail_page->write update */ smp_wmb(); @@ -4215,7 +4219,7 @@ u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); /** - * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to read from. */ @@ -4723,6 +4727,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) length = rb_event_length(event); cpu_buffer->reader_page->read += length; + cpu_buffer->read_bytes += length; } static void rb_advance_iter(struct ring_buffer_iter *iter) @@ -5816,7 +5821,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); - cpu_buffer->read_bytes += BUF_PAGE_SIZE; + cpu_buffer->read_bytes += rb_page_commit(reader); /* swap the pages */ rb_init_page(bpage); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 91951d038ba4..f49d6ddb6342 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2770,6 +2770,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) update_event_fields(call, map[i]); } } + cond_resched(); } up_write(&trace_event_sem); } diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index 6f046650e527..b87f41187c6a 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -127,8 +127,13 @@ struct user_event_enabler { /* Bit 7 is for freeing status of enablement */ #define ENABLE_VAL_FREEING_BIT 7 -/* Only duplicate the bit value */ -#define 
ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK +/* Bit 8 is for marking 32-bit on 64-bit */ +#define ENABLE_VAL_32_ON_64_BIT 8 + +#define ENABLE_VAL_COMPAT_MASK (1 << ENABLE_VAL_32_ON_64_BIT) + +/* Only duplicate the bit and compat values */ +#define ENABLE_VAL_DUP_MASK (ENABLE_VAL_BIT_MASK | ENABLE_VAL_COMPAT_MASK) #define ENABLE_BITOPS(e) (&(e)->values) @@ -174,6 +179,30 @@ struct user_event_validator { int flags; }; +static inline void align_addr_bit(unsigned long *addr, int *bit, + unsigned long *flags) +{ + if (IS_ALIGNED(*addr, sizeof(long))) { +#ifdef __BIG_ENDIAN + /* 32 bit on BE 64 bit requires a 32 bit offset when aligned. */ + if (test_bit(ENABLE_VAL_32_ON_64_BIT, flags)) + *bit += 32; +#endif + return; + } + + *addr = ALIGN_DOWN(*addr, sizeof(long)); + + /* + * We only support 32 and 64 bit values. The only time we need + * to align is a 32 bit value on a 64 bit kernel, which on LE + * is always 32 bits, and on BE requires no change when unaligned. + */ +#ifdef __LITTLE_ENDIAN + *bit += 32; +#endif +} + typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, void *tpdata, bool *faulted); @@ -482,6 +511,7 @@ static int user_event_enabler_write(struct user_event_mm *mm, unsigned long *ptr; struct page *page; void *kaddr; + int bit = ENABLE_BIT(enabler); int ret; lockdep_assert_held(&event_mutex); @@ -497,6 +527,8 @@ static int user_event_enabler_write(struct user_event_mm *mm, test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)))) return -EBUSY; + align_addr_bit(&uaddr, &bit, ENABLE_BITOPS(enabler)); + ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT, &page, NULL); @@ -515,9 +547,9 @@ static int user_event_enabler_write(struct user_event_mm *mm, /* Update bit atomically, user tracers must be atomic as well */ if (enabler->event && enabler->event->status) - set_bit(ENABLE_BIT(enabler), ptr); + set_bit(bit, ptr); else - clear_bit(ENABLE_BIT(enabler), ptr); + clear_bit(bit, ptr); kunmap_local(kaddr); 
unpin_user_pages_dirty_lock(&page, 1, true); @@ -849,6 +881,12 @@ static struct user_event_enabler enabler->event = user; enabler->addr = uaddr; enabler->values = reg->enable_bit; + +#if BITS_PER_LONG >= 64 + if (reg->enable_size == 4) + set_bit(ENABLE_VAL_32_ON_64_BIT, ENABLE_BITOPS(enabler)); +#endif + retry: /* Prevents state changes from racing with new enablers */ mutex_lock(&event_mutex); @@ -2377,7 +2415,8 @@ static long user_unreg_get(struct user_unreg __user *ureg, } static int user_event_mm_clear_bit(struct user_event_mm *user_mm, - unsigned long uaddr, unsigned char bit) + unsigned long uaddr, unsigned char bit, + unsigned long flags) { struct user_event_enabler enabler; int result; @@ -2385,7 +2424,7 @@ static int user_event_mm_clear_bit(struct user_event_mm *user_mm, memset(&enabler, 0, sizeof(enabler)); enabler.addr = uaddr; - enabler.values = bit; + enabler.values = bit | flags; retry: /* Prevents state changes from racing with new enablers */ mutex_lock(&event_mutex); @@ -2415,6 +2454,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) struct user_event_mm *mm = current->user_event_mm; struct user_event_enabler *enabler, *next; struct user_unreg reg; + unsigned long flags; long ret; ret = user_unreg_get(ureg, ®); @@ -2425,6 +2465,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) if (!mm) return -ENOENT; + flags = 0; ret = -ENOENT; /* @@ -2441,6 +2482,9 @@ static long user_events_ioctl_unreg(unsigned long uarg) ENABLE_BIT(enabler) == reg.disable_bit) { set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler)); + /* We must keep compat flags for the clear */ + flags |= enabler->values & ENABLE_VAL_COMPAT_MASK; + if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler))) user_event_enabler_destroy(enabler, true); @@ -2454,7 +2498,7 @@ static long user_events_ioctl_unreg(unsigned long uarg) /* Ensure bit is now cleared for user, regardless of event status */ if (!ret) ret = user_event_mm_clear_bit(mm, reg.disable_addr, - 
reg.disable_bit); + reg.disable_bit, flags); return ret; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3d7a180a8427..e834f149695b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -705,6 +705,41 @@ static struct notifier_block trace_kprobe_module_nb = { .priority = 1 /* Invoked after kprobe module callback */ }; +static int count_symbols(void *data, unsigned long unused) +{ + unsigned int *count = data; + + (*count)++; + + return 0; +} + +struct sym_count_ctx { + unsigned int count; + const char *name; +}; + +static int count_mod_symbols(void *data, const char *name, unsigned long unused) +{ + struct sym_count_ctx *ctx = data; + + if (strcmp(name, ctx->name) == 0) + ctx->count++; + + return 0; +} + +static unsigned int number_of_same_symbols(char *func_name) +{ + struct sym_count_ctx ctx = { .count = 0, .name = func_name }; + + kallsyms_on_each_match_symbol(count_symbols, func_name, &ctx.count); + + module_kallsyms_on_each_symbol(NULL, count_mod_symbols, &ctx); + + return ctx.count; +} + static int __trace_kprobe_create(int argc, const char *argv[]) { /* @@ -836,6 +871,31 @@ static int __trace_kprobe_create(int argc, const char *argv[]) } } + if (symbol && !strchr(symbol, ':')) { + unsigned int count; + + count = number_of_same_symbols(symbol); + if (count > 1) { + /* + * Users should use ADDR to remove the ambiguity of + * using KSYM only. + */ + trace_probe_log_err(0, NON_UNIQ_SYMBOL); + ret = -EADDRNOTAVAIL; + + goto error; + } else if (count == 0) { + /* + * We can return ENOENT earlier than when register the + * kprobe. + */ + trace_probe_log_err(0, BAD_PROBE_ADDR); + ret = -ENOENT; + + goto error; + } + } + trace_probe_log_set_index(0); if (event) { ret = traceprobe_parse_event_name(&event, &group, gbuf, @@ -963,7 +1023,7 @@ EXPORT_SYMBOL_GPL(kprobe_event_cmd_init); * @name: The name of the kprobe event * @loc: The location of the kprobe event * @kretprobe: Is this a return probe? 
- * @args: Variable number of arg (pairs), one pair for each field + * @...: Variable number of arg (pairs), one pair for each field * * NOTE: Users normally won't want to call this function directly, but * rather use the kprobe_event_gen_cmd_start() wrapper, which automatically @@ -1036,7 +1096,7 @@ EXPORT_SYMBOL_GPL(__kprobe_event_gen_cmd_start); /** * __kprobe_event_add_fields - Add probe fields to a kprobe command from arg list * @cmd: A pointer to the dynevent_cmd struct representing the new event - * @args: Variable number of arg (pairs), one pair for each field + * @...: Variable number of arg (pairs), one pair for each field * * NOTE: Users normally won't want to call this function directly, but * rather use the kprobe_event_add_fields() wrapper, which @@ -1695,6 +1755,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) } #ifdef CONFIG_PERF_EVENTS + /* create a trace_kprobe, but don't add it to global lists */ struct trace_event_call * create_local_trace_kprobe(char *func, void *addr, unsigned long offs, @@ -1705,6 +1766,24 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs, int ret; char *event; + if (func) { + unsigned int count; + + count = number_of_same_symbols(func); + if (count > 1) + /* + * Users should use addr to remove the ambiguity of + * using func only. + */ + return ERR_PTR(-EADDRNOTAVAIL); + else if (count == 0) + /* + * We can return ENOENT earlier than when register the + * kprobe. + */ + return ERR_PTR(-ENOENT); + } + /* * local trace_kprobes are not added to dyn_event, so they are never * searched in find_trace_kprobe(). 
Therefore, there is no concern of diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index db575094c498..d8b302d01083 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -404,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, vmstart = vma->vm_start; } if (file) { - ret = trace_seq_path(s, &file->f_path); + ret = trace_seq_path(s, file_user_path(file)); if (ret) trace_seq_printf(s, "[+0x%lx]", ip - vmstart); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 02b432ae7513..850d9ecb6765 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -450,6 +450,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(BAD_MAXACT, "Invalid maxactive number"), \ C(MAXACT_TOO_BIG, "Maxactive is too big"), \ C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ + C(NON_UNIQ_SYMBOL, "The symbol is not unique"), \ C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ C(NO_TRACEPOINT, "Tracepoint is not found"), \ C(BAD_ADDR_SUFFIX, "Invalid probed address suffix"), \ diff --git a/kernel/up.c b/kernel/up.c index a38b8b095251..df50828cc2f0 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, struct __call_single_data *csd) +int smp_call_function_single_async(int cpu, call_single_data_t *csd) { unsigned long flags; diff --git a/kernel/user.c b/kernel/user.c index d667debeafd6..03cedc366dc9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -18,8 +18,18 @@ #include <linux/interrupt.h> #include <linux/export.h> #include <linux/user_namespace.h> +#include <linux/binfmts.h> #include <linux/proc_ns.h> +#if IS_ENABLED(CONFIG_BINFMT_MISC) +struct binfmt_misc init_binfmt_misc = { + .entries = LIST_HEAD_INIT(init_binfmt_misc.entries), + .enabled = true, + 
.entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock), +}; +EXPORT_SYMBOL_GPL(init_binfmt_misc); +#endif + /* * userns count is 1 for root user, 1 for init_uts_ns, * and 1 for... ? @@ -67,6 +77,9 @@ struct user_namespace init_user_ns = { .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif +#if IS_ENABLED(CONFIG_BINFMT_MISC) + .binfmt_misc = &init_binfmt_misc, +#endif }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 1d8e47bed3f1..d52a894ecf57 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -213,6 +213,9 @@ static void free_user_ns(struct work_struct *work) kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } +#if IS_ENABLED(CONFIG_BINFMT_MISC) + kfree(ns->binfmt_misc); +#endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c85825e17df8..0f682da96e1c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2166,7 +2166,7 @@ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; - char id_buf[16]; + char id_buf[23]; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); @@ -4600,12 +4600,22 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) } cpus_read_unlock(); + /* for unbound pwq, flush the pwq_release_worker ensures that the + * pwq_release_workfn() completes before calling kfree(wq). 
+ */ + if (ret) + kthread_flush_worker(pwq_release_worker); + return ret; enomem: if (wq->cpu_pwq) { - for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(wq->cpu_pwq, cpu)); + for_each_possible_cpu(cpu) { + struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); + + if (pwq) + kmem_cache_free(pwq_cache, pwq); + } free_percpu(wq->cpu_pwq); wq->cpu_pwq = NULL; } @@ -5612,50 +5622,54 @@ static void work_for_cpu_fn(struct work_struct *work) } /** - * work_on_cpu - run a function in thread context on a particular cpu + * work_on_cpu_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg + * @key: The lock class key for lock debugging purposes * * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ -long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +long work_on_cpu_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); destroy_work_on_stack(&wfc.work); return wfc.ret; } -EXPORT_SYMBOL_GPL(work_on_cpu); +EXPORT_SYMBOL_GPL(work_on_cpu_key); /** - * work_on_cpu_safe - run a function in thread context on a particular cpu + * work_on_cpu_safe_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function argument + * @key: The lock class key for lock debugging purposes * * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold * any locks which would prevent @fn from completing. * * Return: The value @fn returns. 
*/ -long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +long work_on_cpu_safe_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key) { long ret = -ENODEV; cpus_read_lock(); if (cpu_online(cpu)) - ret = work_on_cpu(cpu, fn, arg); + ret = work_on_cpu_key(cpu, fn, arg, key); cpus_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(work_on_cpu_safe); +EXPORT_SYMBOL_GPL(work_on_cpu_safe_key); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER @@ -5782,9 +5796,13 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_UNBOUND)) continue; + /* creating multiple pwqs breaks ordering guarantee */ - if (wq->flags & __WQ_ORDERED) - continue; + if (!list_empty(&wq->pwqs)) { + if (wq->flags & __WQ_ORDERED_EXPLICIT) + continue; + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); if (IS_ERR(ctx)) { @@ -6535,9 +6553,6 @@ void __init workqueue_init_early(void) BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); - wq_update_pod_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_pod_attrs_buf); - pt->nr_pods = 1; cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); pt->pod_node[0] = NUMA_NO_NODE; @@ -6605,13 +6620,13 @@ static void __init wq_cpu_intensive_thresh_init(void) unsigned long thresh; unsigned long bogo; + pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + BUG_ON(IS_ERR(pwq_release_worker)); + /* if the user set it to a specific value, keep it */ if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; - pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); - BUG_ON(IS_ERR(pwq_release_worker)); - /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what |