diff options
Diffstat (limited to 'kernel')
60 files changed, 3344 insertions, 2020 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 42d17f730780..5b30f8baaf02 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1668,7 +1668,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (err < 0) goto free_prog; - prog->aux->load_time = ktime_get_boot_ns(); + prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); if (err) goto free_prog; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index bf9dbffd46b1..cdbeff87fa99 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ @@ -5666,7 +5666,6 @@ int __init cgroup_init(void) int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); diff --git a/kernel/cpu.c b/kernel/cpu.c index 077fde6fb953..e84c0873559e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -522,7 +522,7 @@ static int bringup_wait_for_ap(unsigned int cpu) /* * SMT soft disabling on X86 requires to bring the CPU out of the * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The - * CPU marked itself as booted_once in cpu_notify_starting() so the + * CPU marked itself as booted_once in notify_cpu_starting() so the * cpu_smt_allowed() check will now return false if this is not the * primary sibling. */ @@ -1221,6 +1221,13 @@ int freeze_secondary_cpus(int primary) for_each_online_cpu(cpu) { if (cpu == primary) continue; + + if (pm_wakeup_pending()) { + pr_info("Wakeup pending. 
Abort CPU freeze\n"); + error = -EBUSY; + break; + } + trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); @@ -1964,6 +1971,9 @@ static ssize_t write_cpuhp_fail(struct device *dev, if (ret) return ret; + if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE) + return -EINVAL; + /* * Cannot fail STARTING/DYING callbacks. */ @@ -2339,6 +2349,9 @@ static int __init mitigations_parse_cmdline(char *arg) cpu_mitigations = CPU_MITIGATIONS_AUTO; else if (!strcmp(arg, "auto,nosmt")) cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + else + pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", + arg); return 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index abbd4b3b96c2..29e5f7880a4b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5005,6 +5005,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (perf_event_check_period(event, value)) return -EINVAL; + if (!event->attr.freq && (value & (1ULL << 63))) + return -EINVAL; + event_function_call(event, __perf_event_period, &value); return 0; @@ -5923,7 +5926,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (current->mm) { + } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs, regs_user_copy); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -10033,6 +10036,12 @@ void perf_pmu_unregister(struct pmu *pmu) } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static inline bool has_extended_regs(struct perf_event *event) +{ + return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || + (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); +} + static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; @@ -10064,12 +10073,16 @@ static int perf_try_init_event(struct pmu 
*pmu, struct perf_event *event) perf_event_ctx_unlock(event->group_leader, ctx); if (!ret) { + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) + ret = -EOPNOTSUPP; + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && - event_has_any_exclude_flag(event)) { - if (event->destroy) - event->destroy(event); + event_has_any_exclude_flag(event)) ret = -EINVAL; - } + + if (ret && event->destroy) + event->destroy(event); } if (ret) @@ -10680,11 +10693,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) break; case CLOCK_BOOTTIME: - event->clock = &ktime_get_boot_ns; + event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: - event->clock = &ktime_get_tai_ns; + event->clock = &ktime_get_clocktai_ns; break; default: diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 78f61bfc6b79..97c367f0a9aa 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -46,7 +46,7 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) -static struct percpu_rw_semaphore dup_mmap_sem; +DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 @@ -2302,7 +2302,5 @@ void __init uprobes_init(void) for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); - BUG_ON(register_die_notifier(&uprobe_exception_nb)); } diff --git a/kernel/fork.c b/kernel/fork.c index 6be686283e55..847dd147b068 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -248,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); - return page ? 
page_address(page) : NULL; + if (likely(page)) { + tsk->stack = page_address(page); + return tsk->stack; + } + return NULL; #endif } @@ -1714,31 +1718,6 @@ const struct file_operations pidfd_fops = { #endif }; -/** - * pidfd_create() - Create a new pid file descriptor. - * - * @pid: struct pid that the pidfd will reference - * - * This creates a new pid file descriptor with the O_CLOEXEC flag set. - * - * Note, that this function can only be called after the fd table has - * been unshared to avoid leaking the pidfd to the new process. - * - * Return: On success, a cloexec pidfd is returned. - * On error, a negative errno number will be returned. - */ -static int pidfd_create(struct pid *pid) -{ - int fd; - - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - O_RDWR | O_CLOEXEC); - if (fd < 0) - put_pid(pid); - - return fd; -} - static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); @@ -1776,6 +1755,7 @@ static __latent_entropy struct task_struct *copy_process( int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; + struct file *pidfile = NULL; /* * Don't allow sharing the root directory with processes in a different @@ -1824,8 +1804,6 @@ static __latent_entropy struct task_struct *copy_process( } if (clone_flags & CLONE_PIDFD) { - int reserved; - /* * - CLONE_PARENT_SETTID is useless for pidfds and also * parent_tidptr is used to return pidfds. @@ -1836,16 +1814,6 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) return ERR_PTR(-EINVAL); - - /* - * Verify that parent_tidptr is sane so we can potentially - * reuse it later. 
- */ - if (get_user(reserved, parent_tidptr)) - return ERR_PTR(-EFAULT); - - if (reserved != 0) - return ERR_PTR(-EINVAL); } /* @@ -1986,9 +1954,6 @@ static __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP - p->lockdep_depth = 0; /* no locks held yet */ - p->curr_chain_key = 0; - p->lockdep_recursion = 0; lockdep_init_task(p); #endif @@ -2060,11 +2025,21 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - retval = pidfd_create(pid); + retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; + + pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfile)) { + put_unused_fd(pidfd); + retval = PTR_ERR(pidfile); + goto bad_fork_free_pid; + } + get_pid(pid); /* held by pidfile now */ + retval = put_user(pidfd, parent_tidptr); if (retval) goto bad_fork_put_pidfd; @@ -2141,7 +2116,7 @@ static __latent_entropy struct task_struct *copy_process( */ p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boot_ns(); + p->real_start_time = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. 
@@ -2182,6 +2157,9 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + /* past the last point of failure */ + if (pidfile) + fd_install(pidfd, pidfile); init_task_pid_links(p); if (likely(p->pid)) { @@ -2248,8 +2226,10 @@ bad_fork_cancel_cgroup: bad_fork_cgroup_threadgroup_change_end: cgroup_threadgroup_change_end(current); bad_fork_put_pidfd: - if (clone_flags & CLONE_PIDFD) - ksys_close(pidfd); + if (clone_flags & CLONE_PIDFD) { + fput(pidfile); + put_unused_fd(pidfd); + } bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); diff --git a/kernel/futex.c b/kernel/futex.c index 4b5b468c58b6..6d50728ef2e7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -471,6 +471,37 @@ enum futex_access { }; /** + * futex_setup_timer - set up the sleeping hrtimer. + * @time: ptr to the given timeout value + * @timeout: the hrtimer_sleeper structure to be set up + * @flags: futex flags + * @range_ns: optional range in ns + * + * Return: Initialized hrtimer_sleeper structure or NULL if no timeout + * value given + */ +static inline struct hrtimer_sleeper * +futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, + int flags, u64 range_ns) +{ + if (!time) + return NULL; + + hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + hrtimer_init_sleeper(timeout, current); + + /* + * If range_ns is 0, calling hrtimer_set_expires_range_ns() is + * effectively the same as calling hrtimer_set_expires(). 
+ */ + hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); + + return timeout; +} + +/** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED @@ -2679,7 +2710,7 @@ out: static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct restart_block *restart; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; @@ -2689,17 +2720,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, return -EINVAL; q.bitset = bitset; - if (abs_time) { - to = &timeout; - - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } - + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); retry: /* * Prepare to wait on uaddr. 
On success, holds hb lock and increments @@ -2779,7 +2801,7 @@ static long futex_wait_restart(struct restart_block *restart) static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -2792,13 +2814,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, if (refill_pi_state_cache()) return -ENOMEM; - if (time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires(&to->timer, *time); - } + to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0); retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); @@ -3195,7 +3211,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -3212,15 +3228,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (!bitset) return -EINVAL; - if (abs_time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 
- CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); /* * The waiter is allocated on our stack, manipulated by the requeue diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index ff6e352e3a6c..b4f53717d143 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -2,6 +2,9 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_IRQ_TIMINGS) += timings.o +ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y) + CFLAGS_timings.o += -DDEBUG +endif obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f18cd5aa33e8..4352b08ae48d 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -94,8 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int __irq_build_affinity_masks(const struct irq_affinity *affd, - unsigned int startvec, +static int __irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, cpumask_var_t *node_to_cpumask, @@ -171,8 +170,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, * 1) spread present CPU on these vectors * 2) spread other possible CPUs on these vectors */ -static int irq_build_affinity_masks(const struct irq_affinity *affd, - unsigned int startvec, unsigned int numvecs, +static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, unsigned int firstvec, struct irq_affinity_desc *masks) { @@ -197,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ - nr_present = __irq_build_affinity_masks(affd, 
curvec, numvecs, + nr_present = __irq_build_affinity_masks(curvec, numvecs, firstvec, node_to_cpumask, cpu_present_mask, nmsk, masks); @@ -212,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, else curvec = firstvec + nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - nr_others = __irq_build_affinity_masks(affd, curvec, numvecs, + nr_others = __irq_build_affinity_masks(curvec, numvecs, firstvec, node_to_cpumask, npresmsk, nmsk, masks); put_online_cpus(); @@ -295,7 +293,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) unsigned int this_vecs = affd->set_size[i]; int ret; - ret = irq_build_affinity_masks(affd, curvec, this_vecs, + ret = irq_build_affinity_masks(curvec, this_vecs, curvec, masks); if (ret) { kfree(masks); diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 16cbf6beb276..ae60cae24e9a 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -90,7 +90,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. 
*/ if (!(desc->istate & IRQS_WAITING)) { desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); } else if (i < 32) mask |= 1 << i; @@ -127,7 +127,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -169,7 +169,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 29d6c7d070b4..b76703b2c0af 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -314,6 +314,12 @@ void irq_shutdown(struct irq_desc *desc) } irq_state_clr_started(desc); } +} + + +void irq_shutdown_and_deactivate(struct irq_desc *desc) +{ + irq_shutdown(desc); /* * This must be called even if the interrupt was never started up, * because the activation can happen before the interrupt is @@ -748,6 +754,8 @@ void handle_fasteoi_nmi(struct irq_desc *desc) unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; + __kstat_incr_irqs_this_cpu(desc); + trace_irq_handler_entry(irq, action); /* * NMIs cannot be shared, there is only one action. 
@@ -962,6 +970,8 @@ void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; + __kstat_incr_irqs_this_cpu(desc); + trace_irq_handler_entry(irq, action); res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); trace_irq_handler_exit(irq, action, res); diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 5b1072e394b2..6c7ca2e983a5 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -116,7 +116,7 @@ static bool migrate_one_irq(struct irq_desc *desc) */ if (irqd_affinity_is_managed(d)) { irqd_set_managed_shutdown(d); - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); return false; } affinity = cpu_online_mask; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 70c3053bc1f6..3924fbe829d4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -82,6 +82,7 @@ extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); +extern void irq_shutdown_and_deactivate(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); @@ -96,6 +97,10 @@ static inline void irq_mark_irq(unsigned int irq) { } extern void irq_mark_irq(unsigned int irq); #endif +extern int __irq_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, + bool *state); + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); @@ -354,6 +359,16 @@ static inline int irq_timing_decode(u64 value, u64 *timestamp) return value & U16_MAX; } +static __always_inline void irq_timings_push(u64 ts, int irq) +{ + struct irq_timings *timings = this_cpu_ptr(&irq_timings); + + timings->values[timings->count & 
IRQ_TIMINGS_MASK] = + irq_timing_encode(ts, irq); + + timings->count++; +} + /* * The function record_irq_time is only called in one place in the * interrupts handler. We want this function always inline so the code @@ -367,15 +382,8 @@ static __always_inline void record_irq_time(struct irq_desc *desc) if (!static_branch_likely(&irq_timing_enabled)) return; - if (desc->istate & IRQS_TIMINGS) { - struct irq_timings *timings = this_cpu_ptr(&irq_timings); - - timings->values[timings->count & IRQ_TIMINGS_MASK] = - irq_timing_encode(local_clock(), - irq_desc_get_irq(desc)); - - timings->count++; - } + if (desc->istate & IRQS_TIMINGS) + irq_timings_push(local_clock(), irq_desc_get_irq(desc)); } #else static inline void irq_remove_timings(struct irq_desc *desc) {} diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index c52b737ab8e3..9484e88dabc2 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -680,6 +680,8 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, * @hwirq: The HW irq number to convert to a logical one * @regs: Register file coming from the low-level handling code * + * This function must be called from an NMI context. + * * Returns: 0 on success, or -EINVAL if conversion has failed */ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, @@ -689,7 +691,10 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, unsigned int irq; int ret = 0; - nmi_enter(); + /* + * NMI context needs to be setup earlier in order to deal with tracing. 
+ */ + WARN_ON(!in_nmi()); irq = irq_find_mapping(domain, hwirq); @@ -702,7 +707,6 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, else ret = -EINVAL; - nmi_exit(); set_irq_regs(old_regs); return ret; } @@ -946,6 +950,11 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } +static bool irq_is_nmi(struct irq_desc *desc) +{ + return desc->istate & IRQS_NMI; +} + /** * kstat_irqs - Get the statistics for an interrupt * @irq: The interrupt number @@ -963,7 +972,8 @@ unsigned int kstat_irqs(unsigned int irq) if (!desc || !desc->kstat_irqs) return 0; if (!irq_settings_is_per_cpu_devid(desc) && - !irq_settings_is_per_cpu(desc)) + !irq_settings_is_per_cpu(desc) && + !irq_is_nmi(desc)) return desc->tot_count; for_each_possible_cpu(cpu) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index a453e229f99c..3078d0e48bba 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); * @ops: domain callbacks * @host_data: Controller private data pointer * - * Allocates and initialize and irq_domain structure. + * Allocates and initializes an irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. 
*/ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, @@ -139,7 +139,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); - if (WARN_ON(!domain)) + if (!domain) return NULL; if (fwnode && is_fwnode_irqchip(fwnode)) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 78f3ddeb7fe4..e8f7f179bf77 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> +#include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/rt.h> @@ -34,8 +35,9 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif -static void __synchronize_hardirq(struct irq_desc *desc) +static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { + struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { @@ -51,6 +53,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); inprogress = irqd_irq_inprogress(&desc->irq_data); + + /* + * If requested and supported, check at the chip whether it + * is in flight at the hardware level, i.e. already pending + * in a CPU and waiting for service and acknowledge. + */ + if (!inprogress && sync_chip) { + /* + * Ignore the return code. inprogress is only updated + * when the chip supports it. + */ + __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, + &inprogress); + } raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ @@ -73,13 +89,18 @@ static void __synchronize_hardirq(struct irq_desc *desc) * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. 
+ * + * It does not check whether there is an interrupt in flight at the + * hardware level, but not serviced yet, as this might deadlock when + * called with interrupts disabled and the target CPU of the interrupt + * is the current CPU. */ bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { - __synchronize_hardirq(desc); + __synchronize_hardirq(desc, false); return !atomic_read(&desc->threads_active); } @@ -95,14 +116,19 @@ EXPORT_SYMBOL(synchronize_hardirq); * to complete before returning. If you use this function while * holding a resource the IRQ handler may need you will deadlock. * - * This function may be called - with care - from IRQ context. + * Can only be called from preemptible code as it might sleep when + * an interrupt thread is associated to @irq. + * + * It optionally makes sure (when the irq chip supports that method) + * that the interrupt is not pending in any CPU and waiting for + * service. */ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { - __synchronize_hardirq(desc); + __synchronize_hardirq(desc, true); /* * We made sure that no hardirq handler is * running. Now verify that no threaded handlers are @@ -1699,6 +1725,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_settings_clr_disable_unlazy(desc); + /* Only shutdown. Deactivate after synchronize_hardirq() */ irq_shutdown(desc); } @@ -1727,8 +1754,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) unregister_handler_proc(irq, action); - /* Make sure it's not being used on another CPU: */ - synchronize_hardirq(irq); + /* + * Make sure it's not being used on another CPU and if the chip + * supports it also make sure that there is no (not yet serviced) + * interrupt in flight at the hardware level. 
+ */ + __synchronize_hardirq(desc, true); #ifdef CONFIG_DEBUG_SHIRQ /* @@ -1768,6 +1799,14 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); + /* + * There is no interrupt on the fly anymore. Deactivate it + * completely. + */ + raw_spin_lock_irqsave(&desc->lock, flags); + irq_domain_deactivate_irq(&desc->irq_data); + raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_release_resources(desc); chip_bus_sync_unlock(desc); irq_remove_timings(desc); @@ -1855,7 +1894,7 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) } irq_settings_clr_disable_unlazy(desc); - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); irq_release_resources(desc); @@ -2578,6 +2617,28 @@ out: irq_put_desc_unlock(desc, flags); } +int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, + bool *state) +{ + struct irq_chip *chip; + int err = -EINVAL; + + do { + chip = irq_data_get_irq_chip(data); + if (chip->irq_get_irqchip_state) + break; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + data = data->parent_data; +#else + data = NULL; +#endif + } while (data); + + if (data) + err = chip->irq_get_irqchip_state(data, which, state); + return err; +} + /** * irq_get_irqchip_state - returns the irqchip state of a interrupt. 
* @irq: Interrupt line that is forwarded to a VM @@ -2596,7 +2657,6 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, { struct irq_desc *desc; struct irq_data *data; - struct irq_chip *chip; unsigned long flags; int err = -EINVAL; @@ -2606,19 +2666,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, data = irq_desc_get_irq_data(desc); - do { - chip = irq_data_get_irq_chip(data); - if (chip->irq_get_irqchip_state) - break; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - data = data->parent_data; -#else - data = NULL; -#endif - } while (data); - - if (data) - err = chip->irq_get_irqchip_state(data, which, state); + err = __irq_get_irqchip_state(data, which, state); irq_put_desc_busunlock(desc, flags); return err; diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 90c735da15d0..e960d7ce7bcc 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> +#define pr_fmt(fmt) "irq_timings: " fmt #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/static_key.h> +#include <linux/init.h> #include <linux/interrupt.h> #include <linux/idr.h> #include <linux/irq.h> @@ -261,12 +263,29 @@ void irq_timings_disable(void) #define EMA_ALPHA_VAL 64 #define EMA_ALPHA_SHIFT 7 -#define PREDICTION_PERIOD_MIN 2 +#define PREDICTION_PERIOD_MIN 3 #define PREDICTION_PERIOD_MAX 5 #define PREDICTION_FACTOR 4 #define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ #define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ +/* + * Number of elements in the circular buffer: If it happens it was + * flushed before, then the number of elements could be smaller than + * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is + * used as we wrapped. The index begins from zero when we did not + * wrap. 
That could be done in a nicer way with the proper circular + * array structure type but with the cost of extra computation in the + * interrupt handler hot path. We choose efficiency. + */ +#define for_each_irqts(i, irqts) \ + for (i = irqts->count < IRQ_TIMINGS_SIZE ? \ + 0 : irqts->count & IRQ_TIMINGS_MASK, \ + irqts->count = min(IRQ_TIMINGS_SIZE, \ + irqts->count); \ + irqts->count > 0; irqts->count--, \ + i = (i + 1) & IRQ_TIMINGS_MASK) + struct irqt_stat { u64 last_ts; u64 ema_time[PREDICTION_BUFFER_SIZE]; @@ -297,7 +316,16 @@ static u64 irq_timings_ema_new(u64 value, u64 ema_old) static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) { - int i; + int period; + + /* + * Move the beginning pointer to the end minus the max period x 3. + * We are at the point we can begin searching the pattern + */ + buffer = &buffer[len - (period_max * 3)]; + + /* Adjust the length to the maximum allowed period x 3 */ + len = period_max * 3; /* * The buffer contains the suite of intervals, in a ilog2 @@ -306,21 +334,45 @@ static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) * period beginning at the end of the buffer. We do that for * each suffix. */ - for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { + for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) { - int *begin = &buffer[len - (i * 3)]; - int *ptr = begin; + /* + * The first comparison always succeed because the + * suffix is deduced from the first n-period bytes of + * the buffer and we compare the initial suffix with + * itself, so we can skip the first iteration. + */ + int idx = period; + size_t size = period; /* * We look if the suite with period 'i' repeat * itself. If it is truncated at the end, as it * repeats we can use the period to find out the next - * element. + * element with the modulo. 
*/ - while (!memcmp(ptr, begin, i * sizeof(*ptr))) { - ptr += i; - if (ptr >= &buffer[len]) - return begin[((i * 3) % i)]; + while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) { + + /* + * Move the index in a period basis + */ + idx += size; + + /* + * If this condition is reached, all previous + * memcmp were successful, so the period is + * found. + */ + if (idx == len) + return buffer[len % period]; + + /* + * If the remaining elements to compare are + * smaller than the period, readjust the size + * of the comparison for the last iteration. + */ + if (len - idx < period) + size = len - idx; } } @@ -380,11 +432,43 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) return irqs->last_ts + irqs->ema_time[index]; } +static __always_inline int irq_timings_interval_index(u64 interval) +{ + /* + * The PREDICTION_FACTOR increase the interval size for the + * array of exponential average. + */ + u64 interval_us = (interval >> 10) / PREDICTION_FACTOR; + + return likely(interval_us) ? ilog2(interval_us) : 0; +} + +static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs, + u64 interval) +{ + int index; + + /* + * Get the index in the ema table for this interrupt. + */ + index = irq_timings_interval_index(interval); + + /* + * Store the index as an element of the pattern in another + * circular array. + */ + irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; + + irqs->ema_time[index] = irq_timings_ema_new(interval, + irqs->ema_time[index]); + + irqs->count++; +} + static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) { u64 old_ts = irqs->last_ts; u64 interval; - int index; /* * The timestamps are absolute time values, we need to compute @@ -415,24 +499,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) return; } - /* - * Get the index in the ema table for this interrupt. 
The - * PREDICTION_FACTOR increase the interval size for the array - * of exponential average. - */ - index = likely(interval) ? - ilog2((interval >> 10) / PREDICTION_FACTOR) : 0; - - /* - * Store the index as an element of the pattern in another - * circular array. - */ - irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; - - irqs->ema_time[index] = irq_timings_ema_new(interval, - irqs->ema_time[index]); - - irqs->count++; + __irq_timings_store(irq, irqs, interval); } /** @@ -493,11 +560,7 @@ u64 irq_timings_next_event(u64 now) * model while decrementing the counter because we consume the * data from our circular buffer. */ - - i = (irqts->count & IRQ_TIMINGS_MASK) - 1; - irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); - - for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { + for_each_irqts(i, irqts) { irq = irq_timing_decode(irqts->values[i], &ts); s = idr_find(&irqt_stats, irq); if (s) @@ -564,3 +627,325 @@ int irq_timings_alloc(int irq) return 0; } + +#ifdef CONFIG_TEST_IRQ_TIMINGS +struct timings_intervals { + u64 *intervals; + size_t count; +}; + +/* + * Intervals are given in nanosecond base + */ +static u64 intervals0[] __initdata = { + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, +}; + +static u64 intervals1[] __initdata = { + 223947000, 1240000, 1384000, 1386000, 1386000, + 217416000, 1236000, 1384000, 1386000, 1387000, + 214719000, 1241000, 1386000, 1387000, 1384000, + 213696000, 1234000, 1384000, 1386000, 1388000, + 219904000, 1240000, 1385000, 1389000, 1385000, + 212240000, 1240000, 1386000, 1386000, 1386000, + 214415000, 1236000, 1384000, 1386000, 1387000, + 214276000, 1234000, +}; + +static u64 intervals2[] __initdata = { + 4000, 3000, 5000, 100000, + 3000, 3000, 5000, 117000, + 4000, 
4000, 5000, 112000, + 4000, 3000, 4000, 110000, + 3000, 5000, 3000, 117000, + 4000, 4000, 5000, 112000, + 4000, 3000, 4000, 110000, + 3000, 4000, 5000, 112000, + 4000, +}; + +static u64 intervals3[] __initdata = { + 1385000, 212240000, 1240000, + 1386000, 214415000, 1236000, + 1384000, 214276000, 1234000, + 1386000, 214415000, 1236000, + 1385000, 212240000, 1240000, + 1386000, 214415000, 1236000, + 1384000, 214276000, 1234000, + 1386000, 214415000, 1236000, + 1385000, 212240000, 1240000, +}; + +static u64 intervals4[] __initdata = { + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, +}; + +static struct timings_intervals tis[] __initdata = { + { intervals0, ARRAY_SIZE(intervals0) }, + { intervals1, ARRAY_SIZE(intervals1) }, + { intervals2, ARRAY_SIZE(intervals2) }, + { intervals3, ARRAY_SIZE(intervals3) }, + { intervals4, ARRAY_SIZE(intervals4) }, +}; + +static int __init irq_timings_test_next_index(struct timings_intervals *ti) +{ + int _buffer[IRQ_TIMINGS_SIZE]; + int buffer[IRQ_TIMINGS_SIZE]; + int index, start, i, count, period_max; + + count = ti->count - 1; + + period_max = count > (3 * PREDICTION_PERIOD_MAX) ? + PREDICTION_PERIOD_MAX : count / 3; + + /* + * Inject all values except the last one which will be used + * to compare with the next index result. + */ + pr_debug("index suite: "); + + for (i = 0; i < count; i++) { + index = irq_timings_interval_index(ti->intervals[i]); + _buffer[i & IRQ_TIMINGS_MASK] = index; + pr_cont("%d ", index); + } + + start = count < IRQ_TIMINGS_SIZE ? 
0 : + count & IRQ_TIMINGS_MASK; + + count = min_t(int, count, IRQ_TIMINGS_SIZE); + + for (i = 0; i < count; i++) { + int index = (start + i) & IRQ_TIMINGS_MASK; + buffer[i] = _buffer[index]; + } + + index = irq_timings_next_event_index(buffer, count, period_max); + i = irq_timings_interval_index(ti->intervals[ti->count - 1]); + + if (index != i) { + pr_err("Expected (%d) and computed (%d) next indexes differ\n", + i, index); + return -EINVAL; + } + + return 0; +} + +static int __init irq_timings_next_index_selftest(void) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(tis); i++) { + + pr_info("---> Injecting intervals number #%d (count=%zd)\n", + i, tis[i].count); + + ret = irq_timings_test_next_index(&tis[i]); + if (ret) + break; + } + + return ret; +} + +static int __init irq_timings_test_irqs(struct timings_intervals *ti) +{ + struct irqt_stat __percpu *s; + struct irqt_stat *irqs; + int i, index, ret, irq = 0xACE5; + + ret = irq_timings_alloc(irq); + if (ret) { + pr_err("Failed to allocate irq timings\n"); + return ret; + } + + s = idr_find(&irqt_stats, irq); + if (!s) { + ret = -EIDRM; + goto out; + } + + irqs = this_cpu_ptr(s); + + for (i = 0; i < ti->count; i++) { + + index = irq_timings_interval_index(ti->intervals[i]); + pr_debug("%d: interval=%llu ema_index=%d\n", + i, ti->intervals[i], index); + + __irq_timings_store(irq, irqs, ti->intervals[i]); + if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) { + pr_err("Failed to store in the circular buffer\n"); + goto out; + } + } + + if (irqs->count != ti->count) { + pr_err("Count differs\n"); + goto out; + } + + ret = 0; +out: + irq_timings_free(irq); + + return ret; +} + +static int __init irq_timings_irqs_selftest(void) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(tis); i++) { + pr_info("---> Injecting intervals number #%d (count=%zd)\n", + i, tis[i].count); + ret = irq_timings_test_irqs(&tis[i]); + if (ret) + break; + } + + return ret; +} + +static int __init irq_timings_test_irqts(struct irq_timings 
*irqts, + unsigned count) +{ + int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0; + int i, irq, oirq = 0xBEEF; + u64 ots = 0xDEAD, ts; + + /* + * Fill the circular buffer by using the dedicated function. + */ + for (i = 0; i < count; i++) { + pr_debug("%d: index=%d, ts=%llX irq=%X\n", + i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i); + + irq_timings_push(ots + i, oirq + i); + } + + /* + * Compute the first elements values after the index wrapped + * up or not. + */ + ots += start; + oirq += start; + + /* + * Test the circular buffer count is correct. + */ + pr_debug("---> Checking timings array count (%d) is right\n", count); + if (WARN_ON(irqts->count != count)) + return -EINVAL; + + /* + * Test the macro allowing to browse all the irqts. + */ + pr_debug("---> Checking the for_each_irqts() macro\n"); + for_each_irqts(i, irqts) { + + irq = irq_timing_decode(irqts->values[i], &ts); + + pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n", + i, ts, ots, irq, oirq); + + if (WARN_ON(ts != ots || irq != oirq)) + return -EINVAL; + + ots++; oirq++; + } + + /* + * The circular buffer should have been flushed when browsed + * with for_each_irqts + */ + pr_debug("---> Checking timings array is empty after browsing it\n"); + if (WARN_ON(irqts->count)) + return -EINVAL; + + return 0; +} + +static int __init irq_timings_irqts_selftest(void) +{ + struct irq_timings *irqts = this_cpu_ptr(&irq_timings); + int i, ret; + + /* + * Test the circular buffer with different number of + * elements. The purpose is to test at the limits (empty, half + * full, full, wrapped with the cursor at the boundaries, + * wrapped several times, etc ... 
+ */ + int count[] = { 0, + IRQ_TIMINGS_SIZE >> 1, + IRQ_TIMINGS_SIZE, + IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1), + 2 * IRQ_TIMINGS_SIZE, + (2 * IRQ_TIMINGS_SIZE) + 3, + }; + + for (i = 0; i < ARRAY_SIZE(count); i++) { + + pr_info("---> Checking the timings with %d/%d values\n", + count[i], IRQ_TIMINGS_SIZE); + + ret = irq_timings_test_irqts(irqts, count[i]); + if (ret) + break; + } + + return ret; +} + +static int __init irq_timings_selftest(void) +{ + int ret; + + pr_info("------------------- selftest start -----------------\n"); + + /* + * At this point, we don't expect any subsystem to use the irq + * timings but us, so it should not be enabled. + */ + if (static_branch_unlikely(&irq_timing_enabled)) { + pr_warn("irq timings already initialized, skipping selftest\n"); + return 0; + } + + ret = irq_timings_irqts_selftest(); + if (ret) + goto out; + + ret = irq_timings_irqs_selftest(); + if (ret) + goto out; + + ret = irq_timings_next_index_selftest(); +out: + pr_info("---------- selftest end with %s -----------\n", + ret ? "failure" : "success"); + + return ret; +} +early_initcall(irq_timings_selftest); +#endif diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 0bfa10f4410c..df3008419a1d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -37,12 +37,26 @@ static int jump_label_cmp(const void *a, const void *b) const struct jump_entry *jea = a; const struct jump_entry *jeb = b; + /* + * Entries are sorted by key. + */ if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; + /* + * In the batching mode, entries should also be sorted by the code + * inside the already sorted list of entries, enabling a bsearch in + * the vector. 
+ */ + if (jump_entry_code(jea) < jump_entry_code(jeb)) + return -1; + + if (jump_entry_code(jea) > jump_entry_code(jeb)) + return 1; + return 0; } @@ -384,25 +398,55 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry) return enabled ^ branch; } +static bool jump_label_can_update(struct jump_entry *entry, bool init) +{ + /* + * Cannot update code that was in an init text area. + */ + if (!init && jump_entry_is_init(entry)) + return false; + + if (!kernel_text_address(jump_entry_code(entry))) { + WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); + return false; + } + + return true; +} + +#ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { - /* - * An entry->code of 0 indicates an entry which has been - * disabled because it was in an init text area. - */ - if (init || !jump_entry_is_init(entry)) { - if (kernel_text_address(jump_entry_code(entry))) - arch_jump_label_transform(entry, jump_label_type(entry)); - else - WARN_ONCE(1, "can't patch jump_label at %pS", - (void *)jump_entry_code(entry)); + if (jump_label_can_update(entry, init)) + arch_jump_label_transform(entry, jump_label_type(entry)); + } +} +#else +static void __jump_label_update(struct static_key *key, + struct jump_entry *entry, + struct jump_entry *stop, + bool init) +{ + for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { + + if (!jump_label_can_update(entry, init)) + continue; + + if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { + /* + * Queue is full: Apply the current queue and try again. 
+ */ + arch_jump_label_transform_apply(); + BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); } } + arch_jump_label_transform_apply(); } +#endif void __init jump_label_init(void) { diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6fe2f333aecb..45452facff3b 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 46b71af8eef2..8c7e7d25f09c 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -31,50 +31,13 @@ enum lock_events { DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); /* - * The purpose of the lock event counting subsystem is to provide a low - * overhead way to record the number of specific locking events by using - * percpu counters. It is the percpu sum that matters, not specifically - * how many of them happens in each cpu. - * - * It is possible that the same percpu counter may be modified in both - * the process and interrupt contexts. For architectures that perform - * percpu operation with multiple instructions, it is possible to lose - * count if a process context percpu update is interrupted in the middle - * and the same counter is updated in the interrupt context. Therefore, - * the generated percpu sum may not be precise. The error, if any, should - * be small and insignificant. - * - * For those architectures that do multi-instruction percpu operation, - * preemption in the middle and moving the task to another cpu may cause - * a larger error in the count. Again, this will be few and far between. 
- * Given the imprecise nature of the count and the possibility of resetting - * the count and doing the measurement again, this is not really a big - * problem. - * - * To get a better picture of what is happening under the hood, it is - * suggested that a few measurements should be taken with the counts - * reset in between to stamp out outliner because of these possible - * error conditions. - * - * To minimize overhead, we use __this_cpu_*() in all cases except when - * CONFIG_DEBUG_PREEMPT is defined. In this particular case, this_cpu_*() - * will be used to avoid the appearance of unwanted BUG messages. - */ -#ifdef CONFIG_DEBUG_PREEMPT -#define lockevent_percpu_inc(x) this_cpu_inc(x) -#define lockevent_percpu_add(x, v) this_cpu_add(x, v) -#else -#define lockevent_percpu_inc(x) __this_cpu_inc(x) -#define lockevent_percpu_add(x, v) __this_cpu_add(x, v) -#endif - -/* - * Increment the PV qspinlock statistical counters + * Increment the statistical counters. Use raw_cpu_inc() because of lower + * overhead and we don't care if we lose the occasional update. 
*/ static inline void __lockevent_inc(enum lock_events event, bool cond) { if (cond) - lockevent_percpu_inc(lockevents[event]); + raw_cpu_inc(lockevents[event]); } #define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) @@ -82,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond) static inline void __lockevent_add(enum lock_events event, int inc) { - lockevent_percpu_add(lockevents[event], inc); + raw_cpu_add(lockevents[event], inc); } #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index ad7668cfc9da..239039d0ce21 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -56,12 +56,16 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ -LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ -LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ +LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */ +LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */ +LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */ +LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ +LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */ +LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ -LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ +LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ -LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls 
*/ +LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c47788fa85f9..341f52117f88 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -151,17 +151,28 @@ unsigned long nr_lock_classes; static #endif struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { - if (!hlock->class_idx) { + unsigned int class_idx = hlock->class_idx; + + /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */ + barrier(); + + if (!test_bit(class_idx, lock_classes_in_use)) { /* * Someone passed in garbage, we give up. */ DEBUG_LOCKS_WARN_ON(1); return NULL; } - return lock_classes + hlock->class_idx - 1; + + /* + * At this point, if the passed hlock->class_idx is still garbage, + * we just have to live with it + */ + return lock_classes + class_idx; } #ifdef CONFIG_LOCK_STAT @@ -359,6 +370,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) return k0 | (u64)k1 << 32; } +void lockdep_init_task(struct task_struct *task) +{ + task->lockdep_depth = 0; /* no locks held yet */ + task->curr_chain_key = INITIAL_CHAIN_KEY; + task->lockdep_recursion = 0; +} + void lockdep_off(void) { current->lockdep_recursion++; @@ -419,13 +437,6 @@ static int verbose(struct lock_class *class) return 0; } -/* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. - */ -unsigned long nr_stack_trace_entries; -static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; - static void print_lockdep_off(const char *bug_msg) { printk(KERN_DEBUG "%s\n", bug_msg); @@ -435,6 +446,15 @@ static void print_lockdep_off(const char *bug_msg) #endif } +unsigned long nr_stack_trace_entries; + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. 
Protected by the graph_lock. + */ +static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; + static int save_trace(struct lock_trace *trace) { unsigned long *entries = stack_trace + nr_stack_trace_entries; @@ -457,6 +477,7 @@ static int save_trace(struct lock_trace *trace) return 1; } +#endif unsigned int nr_hardirq_chains; unsigned int nr_softirq_chains; @@ -470,6 +491,7 @@ unsigned int max_lockdep_depth; DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Locking printouts: */ @@ -487,6 +509,7 @@ static const char *usage_str[] = #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", }; +#endif const char * __get_key_name(struct lockdep_subclass_key *key, char *str) { @@ -500,15 +523,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit) static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) { + /* + * The usage character defaults to '.' (i.e., irqs disabled and not in + * irq context), which is the safest usage category. + */ char c = '.'; - if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) + /* + * The order of the following usage checks matters, which will + * result in the outcome character as follows: + * + * - '+': irq is enabled and not in irq context + * - '-': in irq context and irq is disabled + * - '?': in irq context and irq is enabled + */ + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) { c = '+'; - if (class->usage_mask & lock_flag(bit)) { - c = '-'; - if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) + if (class->usage_mask & lock_flag(bit)) c = '?'; - } + } else if (class->usage_mask & lock_flag(bit)) + c = '-'; return c; } @@ -572,19 +606,22 @@ static void print_lock(struct held_lock *hlock) /* * We can be called locklessly through debug_show_all_locks() so be * extra careful, the hlock might have been released and cleared. 
+ * + * If this indeed happens, lets pretend it does not hurt to continue + * to print the lock unless the hlock class_idx does not point to a + * registered class. The rationale here is: since we don't attempt + * to distinguish whether we are in this situation, if it just + * happened we can't count on class_idx to tell either. */ - unsigned int class_idx = hlock->class_idx; - - /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ - barrier(); + struct lock_class *lock = hlock_class(hlock); - if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { + if (!lock) { printk(KERN_CONT "<RELEASED>\n"); return; } printk(KERN_CONT "%p", hlock->instance); - print_lock_name(lock_classes + class_idx - 1); + print_lock_name(lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -732,7 +769,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name); + WARN_ON_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__); return class; } } @@ -838,11 +876,11 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; static bool check_lock_chain_key(struct lock_chain *chain) { #ifdef CONFIG_PROVE_LOCKING - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int i; for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); /* * The 'unsigned long long' casts avoid that a compiler warning * is reported when building tools/lib/lockdep. 
@@ -1117,6 +1155,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) return NULL; } nr_lock_classes++; + __set_bit(class - lock_classes, lock_classes_in_use); debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; @@ -1228,13 +1267,17 @@ static int add_lock_to_list(struct lock_class *this, #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) /* - * The circular_queue and helpers is used to implement the - * breadth-first search(BFS)algorithem, by which we can build - * the shortest path from the next lock to be acquired to the - * previous held lock if there is a circular between them. + * The circular_queue and helpers are used to implement graph + * breadth-first search (BFS) algorithm, by which we can determine + * whether there is a path from a lock to another. In deadlock checks, + * a path from the next lock to be acquired to a previous held lock + * indicates that adding the <prev> -> <next> lock dependency will + * produce a circle in the graph. Breadth-first search instead of + * depth-first search is used in order to find the shortest (circular) + * path. */ struct circular_queue { - unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE]; unsigned int front, rear; }; @@ -1260,7 +1303,7 @@ static inline int __cq_full(struct circular_queue *cq) return ((cq->rear + 1) & CQ_MASK) == cq->front; } -static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem) { if (__cq_full(cq)) return -1; @@ -1270,14 +1313,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) return 0; } -static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +/* + * Dequeue an element from the circular_queue, return a lock_list if + * the queue is not empty, or NULL if otherwise. 
+ */ +static inline struct lock_list * __cq_dequeue(struct circular_queue *cq) { + struct lock_list * lock; + if (__cq_empty(cq)) - return -1; + return NULL; - *elem = cq->element[cq->front]; + lock = cq->element[cq->front]; cq->front = (cq->front + 1) & CQ_MASK; - return 0; + + return lock; } static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) @@ -1322,13 +1372,32 @@ static inline int get_lock_depth(struct lock_list *child) return depth; } +/* + * Return the forward or backward dependency list. + * + * @lock: the lock_list to get its class's dependency list + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + */ +static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) +{ + void *lock_class = lock->class; + + return lock_class + offset; +} + +/* + * Forward- or backward-dependency search, used for both circular dependency + * checking and hardirq-unsafe/softirq-unsafe checking. + */ static int __bfs(struct lock_list *source_entry, void *data, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry, - int forward) + int offset) { struct lock_list *entry; + struct lock_list *lock; struct list_head *head; struct circular_queue *cq = &lock_cq; int ret = 1; @@ -1339,31 +1408,21 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (forward) - head = &source_entry->class->locks_after; - else - head = &source_entry->class->locks_before; - + head = get_dep_list(source_entry, offset); if (list_empty(head)) goto exit; __cq_init(cq); - __cq_enqueue(cq, (unsigned long)source_entry); + __cq_enqueue(cq, source_entry); - while (!__cq_empty(cq)) { - struct lock_list *lock; - - __cq_dequeue(cq, (unsigned long *)&lock); + while ((lock = __cq_dequeue(cq))) { if (!lock->class) { ret = -2; goto exit; } - if (forward) - head = &lock->class->locks_after; - else - head = &lock->class->locks_before; + head = get_dep_list(lock, offset); 
DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -1377,7 +1436,7 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (__cq_enqueue(cq, (unsigned long)entry)) { + if (__cq_enqueue(cq, entry)) { ret = -1; goto exit; } @@ -1396,7 +1455,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 1); + return __bfs(src_entry, data, match, target_entry, + offsetof(struct lock_class, locks_after)); } @@ -1405,16 +1465,11 @@ static inline int __bfs_backwards(struct lock_list *src_entry, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 0); + return __bfs(src_entry, data, match, target_entry, + offsetof(struct lock_class, locks_before)); } -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. 
- */ - static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) { unsigned long *entries = stack_trace + trace->offset; @@ -1426,16 +1481,15 @@ static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) * Print a dependency chain entry (this is only done when a deadlock * has been detected): */ -static noinline int +static noinline void print_circular_bug_entry(struct lock_list *target, int depth) { if (debug_locks_silent) - return 0; + return; printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); print_lock_trace(&target->trace, 6); - return 0; } static void @@ -1492,7 +1546,7 @@ print_circular_lock_scenario(struct held_lock *src, * When a circular dependency is detected, print the * header first: */ -static noinline int +static noinline void print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct held_lock *check_src, struct held_lock *check_tgt) @@ -1500,7 +1554,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct task_struct *curr = current; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("======================================================\n"); @@ -1518,8 +1572,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); - - return 0; } static inline int class_equal(struct lock_list *entry, void *data) @@ -1527,10 +1579,10 @@ static inline int class_equal(struct lock_list *entry, void *data) return entry->class == data; } -static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt) +static noinline void print_circular_bug(struct lock_list *this, + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1538,10 
+1590,10 @@ static noinline int print_circular_bug(struct lock_list *this, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; if (!save_trace(&this->trace)) - return 0; + return; depth = get_lock_depth(target); @@ -1563,21 +1615,17 @@ static noinline int print_circular_bug(struct lock_list *this, printk("\nstack backtrace:\n"); dump_stack(); - - return 0; } -static noinline int print_bfs_bug(int ret) +static noinline void print_bfs_bug(int ret) { if (!debug_locks_off_graph_unlock()) - return 0; + return; /* * Breadth-first-search failed, graph got corrupted? */ WARN(1, "lockdep bfs error:%d\n", ret); - - return 0; } static int noop_count(struct lock_list *entry, void *data) @@ -1640,36 +1688,95 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) } /* - * Prove that the dependency graph starting at <entry> can not - * lead to <target>. Print an error and return 0 if it does. + * Check that the dependency graph starting at <src> can lead to + * <target> or not. Print an error and return 0 if it does. */ static noinline int -check_noncircular(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) +check_path(struct lock_class *target, struct lock_list *src_entry, + struct lock_list **target_entry) { - int result; + int ret; + + ret = __bfs_forwards(src_entry, (void *)target, class_equal, + target_entry); + + if (unlikely(ret < 0)) + print_bfs_bug(ret); + + return ret; +} + +/* + * Prove that the dependency graph starting at <src> can not + * lead to <target>. If it can, there is a circle when adding + * <target> -> <src> dependency. + * + * Print an error and return 0 if it does. 
+ */ +static noinline int +check_noncircular(struct held_lock *src, struct held_lock *target, + struct lock_trace *trace) +{ + int ret; + struct lock_list *uninitialized_var(target_entry); + struct lock_list src_entry = { + .class = hlock_class(src), + .parent = NULL, + }; debug_atomic_inc(nr_cyclic_checks); - result = __bfs_forwards(root, target, class_equal, target_entry); + ret = check_path(hlock_class(target), &src_entry, &target_entry); - return result; + if (unlikely(!ret)) { + if (!trace->nr_entries) { + /* + * If save_trace fails here, the printing might + * trigger a WARN but because of the !nr_entries it + * should not do bad things. + */ + save_trace(trace); + } + + print_circular_bug(&src_entry, target_entry, src, target); + } + + return ret; } +#ifdef CONFIG_LOCKDEP_SMALL +/* + * Check that the dependency graph starting at <src> can lead to + * <target> or not. If it can, <src> -> <target> dependency is already + * in the graph. + * + * Print an error and return 2 if it does or 1 if it does not. 
+ */ static noinline int -check_redundant(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) +check_redundant(struct held_lock *src, struct held_lock *target) { - int result; + int ret; + struct lock_list *uninitialized_var(target_entry); + struct lock_list src_entry = { + .class = hlock_class(src), + .parent = NULL, + }; debug_atomic_inc(nr_redundant_checks); - result = __bfs_forwards(root, target, class_equal, target_entry); + ret = check_path(hlock_class(target), &src_entry, &target_entry); - return result; + if (!ret) { + debug_atomic_inc(nr_redundant); + ret = 2; + } else if (ret < 0) + ret = 0; + + return ret; } +#endif -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +#ifdef CONFIG_TRACE_IRQFLAGS static inline int usage_accumulate(struct lock_list *entry, void *mask) { @@ -1766,7 +1873,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) */ static void __used print_shortest_lock_dependencies(struct lock_list *leaf, - struct lock_list *root) + struct lock_list *root) { struct lock_list *entry = leaf; int depth; @@ -1788,8 +1895,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf, entry = get_lock_parent(entry); depth--; } while (entry && (depth >= 0)); - - return; } static void @@ -1848,7 +1953,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_bad_irq_dependency(struct task_struct *curr, struct lock_list *prev_root, struct lock_list *next_root, @@ -1861,7 +1966,7 @@ print_bad_irq_dependency(struct task_struct *curr, const char *irqclass) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=====================================================\n"); @@ -1907,19 +2012,17 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if 
(!save_trace(&prev_root->trace)) - return 0; + return; print_shortest_lock_dependencies(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); if (!save_trace(&next_root->trace)) - return 0; + return; print_shortest_lock_dependencies(forwards_entry, next_root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static const char *state_names[] = { @@ -2066,8 +2169,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, this.class = hlock_class(prev); ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } usage_mask &= LOCKF_USED_IN_IRQ_ALL; if (!usage_mask) @@ -2083,8 +2188,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, that.class = hlock_class(next); ret = find_usage_forwards(&that, forward_mask, &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; @@ -2096,8 +2203,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, backward_mask = original_mask(target_entry1->class->usage_mask); ret = find_usage_backwards(&this, backward_mask, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (DEBUG_LOCKS_WARN_ON(ret == 1)) return 1; @@ -2111,11 +2220,13 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, if (DEBUG_LOCKS_WARN_ON(ret == -1)) return 1; - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - backward_bit, forward_bit, - state_name(backward_bit)); + print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + backward_bit, forward_bit, + state_name(backward_bit)); + + return 0; } static void inc_chains(void) @@ 
-2143,11 +2254,10 @@ static inline void inc_chains(void) nr_process_chains++; } -#endif +#endif /* CONFIG_TRACE_IRQFLAGS */ static void -print_deadlock_scenario(struct held_lock *nxt, - struct held_lock *prv) +print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) { struct lock_class *next = hlock_class(nxt); struct lock_class *prev = hlock_class(prv); @@ -2165,12 +2275,12 @@ print_deadlock_scenario(struct held_lock *nxt, printk(" May be due to missing lock nesting notation\n\n"); } -static int +static void print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("============================================\n"); @@ -2189,8 +2299,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -2202,8 +2310,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read */ static int -check_deadlock(struct task_struct *curr, struct held_lock *next, - struct lockdep_map *next_instance, int read) +check_deadlock(struct task_struct *curr, struct held_lock *next) { struct held_lock *prev; struct held_lock *nest = NULL; @@ -2222,7 +2329,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, * Allow read-after-read recursion of the same * lock class (i.e. read_lock(lock)+read_lock(lock)): */ - if ((read == 2) && prev->read) + if ((next->read == 2) && prev->read) return 2; /* @@ -2232,14 +2339,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - return print_deadlock_bug(curr, prev, next); + print_deadlock_bug(curr, prev, next); + return 0; } return 1; } /* * There was a chain-cache miss, and we are about to add a new dependency - * to a previous lock. 
We recursively validate the following rules: + * to a previous lock. We validate the following rules: * * - would the adding of the <prev> -> <next> dependency create a * circular dependency in the graph? [== circular deadlock] @@ -2263,9 +2371,7 @@ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, struct held_lock *next, int distance, struct lock_trace *trace) { - struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; - struct lock_list this; int ret; if (!hlock_class(prev)->key || !hlock_class(next)->key) { @@ -2289,28 +2395,16 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, /* * Prove that the new <prev> -> <next> dependency would not * create a circular dependency in the graph. (We do this by - * forward-recursing into the graph starting at <next>, and - * checking whether we can reach <prev>.) + * a breadth-first search into the graph starting at <next>, + * and check whether we can reach <prev>.) * - * We are using global variables to control the recursion, to - * keep the stackframe size of the recursive functions low: + * The search is limited by the size of the circular queue (i.e., + * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes + * in the graph whose neighbours are to be checked. */ - this.class = hlock_class(next); - this.parent = NULL; - ret = check_noncircular(&this, hlock_class(prev), &target_entry); - if (unlikely(!ret)) { - if (!trace->nr_entries) { - /* - * If save_trace fails here, the printing might - * trigger a WARN but because of the !nr_entries it - * should not do bad things. 
- */ - save_trace(trace); - } - return print_circular_bug(&this, target_entry, next, prev); - } - else if (unlikely(ret < 0)) - return print_bfs_bug(ret); + ret = check_noncircular(next, prev, trace); + if (unlikely(ret <= 0)) + return 0; if (!check_irq_usage(curr, prev, next)) return 0; @@ -2341,19 +2435,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } +#ifdef CONFIG_LOCKDEP_SMALL /* * Is the <prev> -> <next> link redundant? */ - this.class = hlock_class(prev); - this.parent = NULL; - ret = check_redundant(&this, hlock_class(next), &target_entry); - if (!ret) { - debug_atomic_inc(nr_redundant); - return 2; - } - if (ret < 0) - return print_bfs_bug(ret); - + ret = check_redundant(prev, next); + if (ret != 1) + return ret; +#endif if (!trace->nr_entries && !save_trace(trace)) return 0; @@ -2505,12 +2594,13 @@ static void print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) { struct held_lock *hlock; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int depth = curr->lockdep_depth; - int i; + int i = get_first_held_lock(curr, hlock_next); - printk("depth: %u\n", depth + 1); - for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { + printk("depth: %u (irq_context %u)\n", depth - i + 1, + hlock_next->irq_context); + for (; i < depth; i++) { hlock = curr->held_locks + i; chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); @@ -2524,13 +2614,13 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne static void print_chain_keys_chain(struct lock_chain *chain) { int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int class_id; printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id + 1, chain_key); + chain_key = print_chain_key_iteration(class_id, chain_key); print_lock_name(lock_classes + class_id); printk("\n"); @@ -2581,7 +2671,7 @@ 
static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx - 1; + id = curr->held_locks[i].class_idx; if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -2664,7 +2754,7 @@ static inline int add_chain_cache(struct task_struct *curr, if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx - 1; + int lock_id = curr->held_locks[i].class_idx; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; @@ -2754,8 +2844,9 @@ cache_hit: return 1; } -static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, - struct held_lock *hlock, int chain_head, u64 chain_key) +static int validate_chain(struct task_struct *curr, + struct held_lock *hlock, + int chain_head, u64 chain_key) { /* * Trylock needs to maintain the stack of held locks, but it @@ -2776,12 +2867,18 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * - is softirq-safe, if this lock is hardirq-unsafe * * And check whether the new lock's dependency graph - * could lead back to the previous lock. + * could lead back to the previous lock: * - * any of these scenarios could lead to a deadlock. If - * All validations + * - within the current held-lock stack + * - across our accumulated lock dependency records + * + * any of these scenarios could lead to a deadlock. */ - int ret = check_deadlock(curr, hlock, lock, hlock->read); + /* + * The simple case: does the current hold the same lock + * already? 
+ */ + int ret = check_deadlock(curr, hlock); if (!ret) return 0; @@ -2812,16 +2909,12 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, } #else static inline int validate_chain(struct task_struct *curr, - struct lockdep_map *lock, struct held_lock *hlock, - int chain_head, u64 chain_key) + struct held_lock *hlock, + int chain_head, u64 chain_key) { return 1; } - -static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) -{ -} -#endif +#endif /* CONFIG_PROVE_LOCKING */ /* * We are building curr_chain_key incrementally, so double-check @@ -2832,7 +2925,7 @@ static void check_chain_key(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; unsigned int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; @@ -2848,15 +2941,17 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } + /* - * Whoops ran out of static storage again? + * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is + * it registered lock class index? 
*/ - if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use))) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } @@ -2874,14 +2969,11 @@ static void check_chain_key(struct task_struct *curr) #endif } +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) - - -static void -print_usage_bug_scenario(struct held_lock *lock) +static void print_usage_bug_scenario(struct held_lock *lock) { struct lock_class *class = hlock_class(lock); @@ -2898,12 +2990,12 @@ print_usage_bug_scenario(struct held_lock *lock) printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("================================\n"); @@ -2933,8 +3025,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -2944,8 +3034,10 @@ static inline int valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { - if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); + if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) { + print_usage_bug(curr, this, bad_bit, new_bit); + return 0; + } return 1; } @@ -2953,7 +3045,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, /* * print irq inversion bug: */ -static int +static void 
print_irq_inversion_bug(struct task_struct *curr, struct lock_list *root, struct lock_list *other, struct held_lock *this, int forwards, @@ -2964,7 +3056,7 @@ print_irq_inversion_bug(struct task_struct *curr, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("========================================================\n"); @@ -3005,13 +3097,11 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); if (!save_trace(&root->trace)) - return 0; + return; print_shortest_lock_dependencies(other, root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -3029,13 +3119,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; - return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, irqclass); + return 0; } /* @@ -3053,13 +3146,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; - return print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, irqclass); + return 0; } void print_irqtrace_events(struct task_struct *curr) @@ -3142,7 +3238,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, * Validate that the lock dependencies don't have conflicting usage * states. 
*/ - if ((!read || !dir || STRICT_READ_CHECKS) && + if ((!read || STRICT_READ_CHECKS) && !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) return 0; @@ -3367,8 +3463,12 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) +static int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) { + if (!check) + goto lock_used; + /* * If non-trylock use in a hardirq or softirq context, then * mark the lock as used in these contexts: @@ -3412,6 +3512,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } +lock_used: + /* mark it as used: */ + if (!mark_lock(curr, hlock, LOCK_USED)) + return 0; + return 1; } @@ -3443,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr, return 0; } -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - -static inline -int mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - WARN_ON(1); /* Impossible innit? 
when we don't have TRACE_IRQFLAG */ - return 1; -} - -static inline int mark_irqflags(struct task_struct *curr, - struct held_lock *hlock) -{ - return 1; -} - -static inline unsigned int task_irq_context(struct task_struct *task) -{ - return 0; -} - -static inline int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - return 0; -} - -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - /* * Mark a lock with a usage bit, and validate the state transition: */ @@ -3480,6 +3556,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, { unsigned int new_mask = 1 << new_bit, ret = 1; + if (new_bit >= LOCK_USAGE_STATES) { + DEBUG_LOCKS_WARN_ON(1); + return 0; + } + /* * If already set then do not dirty the cacheline, * nor do any checks: @@ -3503,25 +3584,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; switch (new_bit) { -#define LOCKDEP_STATE(__STATE) \ - case LOCK_USED_IN_##__STATE: \ - case LOCK_USED_IN_##__STATE##_READ: \ - case LOCK_ENABLED_##__STATE: \ - case LOCK_ENABLED_##__STATE##_READ: -#include "lockdep_states.h" -#undef LOCKDEP_STATE - ret = mark_lock_irq(curr, this, new_bit); - if (!ret) - return 0; - break; case LOCK_USED: debug_atomic_dec(nr_unused_locks); break; default: - if (!debug_locks_off_graph_unlock()) + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) return 0; - WARN_ON(1); - return 0; } graph_unlock(); @@ -3539,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + +static inline int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) +{ + return 1; +} + +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 0; +} + +static inline int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + return 0; +} + +#endif /* 
defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + /* * Initialize a lock instance's lock-class mapping info: */ @@ -3602,15 +3692,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); -static int +static void print_lock_nested_lock_not_held(struct task_struct *curr, struct held_lock *hlock, unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("==================================\n"); @@ -3632,8 +3722,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static int __lock_is_held(const struct lockdep_map *lock, int read); @@ -3698,24 +3786,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; - class_idx = class - lock_classes + 1; + class_idx = class - lock_classes; if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) { - /* - * Check: unsigned int references:12, overflow. 
- */ - if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) - return 0; + if (!references) + references++; + if (!hlock->references) hlock->references++; - } else { - hlock->references = 2; - } - return 1; + hlock->references += references; + + /* Overflow */ + if (DEBUG_LOCKS_WARN_ON(hlock->references < references)) + return 0; + + return 2; } } @@ -3742,11 +3830,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif hlock->pin_count = pin_count; - if (check && !mark_irqflags(curr, hlock)) - return 0; - - /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED)) + /* Initialize the lock usage bit */ + if (!mark_usage(curr, hlock, check)) return 0; /* @@ -3760,9 +3845,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * the hash, not class->key. */ /* - * Whoops, we did it again.. ran straight out of our static allocation. + * Whoops, we did it again.. class_idx is invalid. */ - if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use))) return 0; chain_key = curr->curr_chain_key; @@ -3770,27 +3855,29 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, /* * How can we have a chain hash when we ain't got no keys?! 
*/ - if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) + if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY)) return 0; chain_head = 1; } hlock->prev_chain_key = chain_key; if (separate_irq_context(curr, hlock)) { - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock, -1)) - return print_lock_nested_lock_not_held(curr, hlock, ip); + if (nest_lock && !__lock_is_held(nest_lock, -1)) { + print_lock_nested_lock_not_held(curr, hlock, ip); + return 0; + } if (!debug_locks_silent) { WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); WARN_ON_ONCE(!hlock_class(hlock)->key); } - if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) + if (!validate_chain(curr, hlock, chain_head, chain_key)) return 0; curr->curr_chain_key = chain_key; @@ -3819,14 +3906,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 1; } -static int -print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_unlock_imbalance_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=====================================\n"); @@ -3844,8 +3931,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static int match_held_lock(const struct held_lock *hlock, @@ -3877,7 +3962,7 @@ static int match_held_lock(const struct held_lock *hlock, if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) return 0; - if (hlock->class_idx == class - lock_classes + 1) + if (hlock->class_idx == class - lock_classes) return 1; } @@ -3921,22 +4006,33 @@ out: } static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, - int idx) + int idx, unsigned int *merged) { struct held_lock *hlock; + 
int first_idx = idx; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { - if (!__lock_acquire(hlock->instance, + switch (__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) + hlock->references, hlock->pin_count)) { + case 0: return 1; + case 1: + break; + case 2: + *merged += (idx == first_idx); + break; + default: + WARN_ON(1); + return 0; + } } return 0; } @@ -3947,9 +4043,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; struct lock_class *class; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -3964,24 +4060,26 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); - hlock->class_idx = class - lock_classes + 1; + hlock->class_idx = class - lock_classes; curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - if (reacquire_held_locks(curr, depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) return 0; /* * I took it apart and put it back together again, except now I have * these 'spare' parts.. where shall I put them. 
*/ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged)) return 0; return 1; } @@ -3989,8 +4087,8 @@ __lock_set_class(struct lockdep_map *lock, const char *name, static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -4005,8 +4103,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) return 0; hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -4015,7 +4115,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) hlock->read = 1; hlock->acquire_ip = ip; - if (reacquire_held_locks(curr, depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) + return 0; + + /* Merging can't happen with unchanged classes.. */ + if (DEBUG_LOCKS_WARN_ON(merged)) return 0; /* @@ -4024,6 +4128,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) return 0; + return 1; } @@ -4035,11 +4140,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) * @nested is an hysterical artifact, needs a tree wide cleanup. */ static int -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +__lock_release(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 1; struct held_lock *hlock; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -4050,16 +4155,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * So we're all set to release this lock.. wait what lock? 
We don't * own any locks, you've been drinking again? */ - if (DEBUG_LOCKS_WARN_ON(depth <= 0)) - return print_unlock_imbalance_bug(curr, lock, ip); + if (depth <= 0) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } /* * Check whether the lock exists in the current stack * of held locks: */ hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } if (hlock->instance == lock) lock_release_holdtime(hlock); @@ -4094,14 +4203,15 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) if (i == depth-1) return 1; - if (reacquire_held_locks(curr, depth, i + 1)) + if (reacquire_held_locks(curr, depth, i + 1, &merged)) return 0; /* * We had N bottles of beer on the wall, we drank one, but now * there's not N-1 bottles of beer left on the wall... + * Pouring two of the bottles together is acceptable. */ - DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1); + DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged); /* * Since reacquire_held_locks() would have called check_chain_key() @@ -4319,7 +4429,7 @@ void lock_release(struct lockdep_map *lock, int nested, check_flags(flags); current->lockdep_recursion = 1; trace_lock_release(lock, ip); - if (__lock_release(lock, nested, ip)) + if (__lock_release(lock, ip)) check_chain_key(current); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -4402,14 +4512,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) EXPORT_SYMBOL_GPL(lock_unpin_lock); #ifdef CONFIG_LOCK_STAT -static int -print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_lock_contention_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); 
pr_warn("=================================\n"); @@ -4427,8 +4537,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static void @@ -4573,9 +4681,7 @@ void lockdep_reset(void) int i; raw_local_irq_save(flags); - current->curr_chain_key = 0; - current->lockdep_depth = 0; - current->lockdep_recursion = 0; + lockdep_init_task(current); memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); nr_hardirq_chains = 0; nr_softirq_chains = 0; @@ -4615,9 +4721,9 @@ static void remove_class_from_lock_chain(struct pending_free *pf, return; recalc: - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; for (i = chain->base; i < chain->base + chain->depth; i++) - chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); if (chain->depth && chain->chain_key == chain_key) return; /* Overwrite the chain key for concurrent RCU readers. */ @@ -4691,6 +4797,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) WRITE_ONCE(class->key, NULL); WRITE_ONCE(class->name, NULL); nr_lock_classes--; + __clear_bit(class - lock_classes, lock_classes_in_use); } else { WARN_ONCE(true, "%s() failed for class %s\n", __func__, class->name); @@ -5036,6 +5143,7 @@ void __init lockdep_init(void) printk(" memory used by lock dependency info: %zu kB\n", (sizeof(lock_classes) + + sizeof(lock_classes_in_use) + sizeof(classhash_table) + sizeof(list_entries) + sizeof(list_entries_in_use) + diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 150ec3f0c5b5..cc83568d5012 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -131,7 +131,6 @@ extern unsigned int nr_hardirq_chains; extern unsigned int nr_softirq_chains; extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; -extern unsigned int max_recursion_depth; extern 
unsigned int max_bfs_queue_depth; @@ -160,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class) * and we want to avoid too much cache bouncing. */ struct lockdep_stats { - int chain_lookup_hits; - int chain_lookup_misses; - int hardirqs_on_events; - int hardirqs_off_events; - int redundant_hardirqs_on; - int redundant_hardirqs_off; - int softirqs_on_events; - int softirqs_off_events; - int redundant_softirqs_on; - int redundant_softirqs_off; - int nr_unused_locks; - int nr_redundant_checks; - int nr_redundant; - int nr_cyclic_checks; - int nr_cyclic_check_recursions; - int nr_find_usage_forwards_checks; - int nr_find_usage_forwards_recursions; - int nr_find_usage_backwards_checks; - int nr_find_usage_backwards_recursions; + unsigned long chain_lookup_hits; + unsigned int chain_lookup_misses; + unsigned long hardirqs_on_events; + unsigned long hardirqs_off_events; + unsigned long redundant_hardirqs_on; + unsigned long redundant_hardirqs_off; + unsigned long softirqs_on_events; + unsigned long softirqs_off_events; + unsigned long redundant_softirqs_on; + unsigned long redundant_softirqs_off; + int nr_unused_locks; + unsigned int nr_redundant_checks; + unsigned int nr_redundant; + unsigned int nr_cyclic_checks; + unsigned int nr_find_usage_forwards_checks; + unsigned int nr_find_usage_backwards_checks; /* * Per lock class locking operation stat counts diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 80a463d31a8d..c513031cd7e3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -975,7 +975,7 @@ static int __init lock_torture_init(void) goto unwind; } if (stutter > 0) { - firsterr = torture_stutter_init(stutter); + firsterr = torture_stutter_init(stutter, stutter); if (firsterr) goto unwind; } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index b6a9cc62099a..364d38a0c444 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -18,7 +18,7 @@ int 
__percpu_init_rwsem(struct percpu_rw_semaphore *sem, return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + rcu_sync_init(&sem->rss); __init_rwsem(&sem->rw_sem, name, rwsem_key); rcuwait_init(&sem->writer); sem->readers_block = 0; diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c deleted file mode 100644 index 0b1f77957240..000000000000 --- a/kernel/locking/rwsem-xadd.c +++ /dev/null @@ -1,745 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem.c: R/W semaphores: contention handling functions - * - * Written by David Howells (dhowells@redhat.com). - * Derived from arch/i386/kernel/semaphore.c - * - * Writer lock-stealing by Alex Shi <alex.shi@intel.com> - * and Michel Lespinasse <walken@google.com> - * - * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> - * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. - */ -#include <linux/rwsem.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/sched/signal.h> -#include <linux/sched/rt.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/debug.h> -#include <linux/osq_lock.h> - -#include "rwsem.h" - -/* - * Guide to the rw_semaphore's count field for common values. - * (32-bit case illustrated, similar for 64-bit) - * - * 0x0000000X (1) X readers active or attempting lock, no writer waiting - * X = #active_readers + #readers attempting to lock - * (X*ACTIVE_BIAS) - * - * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or - * attempting to read lock or write lock. 
- * - * 0xffff000X (1) X readers active or attempting lock, with waiters for lock - * X = #active readers + # readers attempting lock - * (X*ACTIVE_BIAS + WAITING_BIAS) - * (2) 1 writer attempting lock, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * (3) 1 writer active, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * - * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock - * (WAITING_BIAS + ACTIVE_BIAS) - * (2) 1 writer active or attempting lock, no waiters for lock - * (ACTIVE_WRITE_BIAS) - * - * 0xffff0000 (1) There are writers or readers queued but none active - * or in the process of attempting lock. - * (WAITING_BIAS) - * Note: writer can attempt to steal lock for this count by adding - * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count - * - * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. - * (ACTIVE_WRITE_BIAS + WAITING_BIAS) - * - * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking - * the count becomes more than 0 for successful lock acquisition, - * i.e. the case where there are only readers or nobody has lock. - * (1st and 2nd case above). - * - * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and - * checking the count becomes ACTIVE_WRITE_BIAS for successful lock - * acquisition (i.e. nobody else has lock or attempts lock). If - * unsuccessful, in rwsem_down_write_failed, we'll check to see if there - * are only waiters but none active (5th case above), and attempt to - * steal the lock. 
- * - */ - -/* - * Initialize an rwsem: - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER - sem->owner = NULL; - osq_lock_init(&sem->osq); -#endif -} - -EXPORT_SYMBOL(__init_rwsem); - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -enum rwsem_wake_type { - RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ - RWSEM_WAKE_READERS, /* Wake readers only */ - RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ -}; - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then: - * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) - * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) - * - there must be someone on the queue - * - the wait_lock must be held by the caller - * - tasks are marked for wakeup, the caller must later invoke wake_up_q() - * to actually wakeup the blocked task(s) and drop the reference count, - * preferably when the wait_lock is released - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only marked woken if downgrading is false - */ -static void __rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, - struct wake_q_head *wake_q) -{ - struct rwsem_waiter *waiter, *tmp; - long oldcount, woken = 0, adjustment = 0; - struct list_head wlist; - - /* - * Take a peek at the queue head waiter 
such that we can determine - * the wakeup(s) to perform. - */ - waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) { - /* - * Mark writer at the front of the queue for wakeup. - * Until the task is actually later awoken later by - * the caller, other writers are able to steal it. - * Readers, on the other hand, will block as they - * will notice the queued writer. - */ - wake_q_add(wake_q, waiter->task); - lockevent_inc(rwsem_wake_writer); - } - - return; - } - - /* - * Writers might steal the lock before we grant it to the next reader. - * We prefer to do the first reader grant before counting readers - * so we can bail out early if a writer stole the lock. - */ - if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_ACTIVE_READ_BIAS; - try_reader_grant: - oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* - * If the count is still less than RWSEM_WAITING_BIAS - * after removing the adjustment, it is assumed that - * a writer has stolen the lock. We have to undo our - * reader grant. - */ - if (atomic_long_add_return(-adjustment, &sem->count) < - RWSEM_WAITING_BIAS) - return; - - /* Last active locker left. Retry waking readers. */ - goto try_reader_grant; - } - /* - * Set it to reader-owned to give spinners an early - * indication that readers now have the lock. - */ - __rwsem_set_reader_owned(sem, waiter->task); - } - - /* - * Grant an infinite number of read locks to the readers at the front - * of the queue. We know that woken will be at least 1 as we accounted - * for above. Note we increment the 'active part' of the count by the - * number of readers before waking any processes up. - * - * We have to do wakeup in 2 passes to prevent the possibility that - * the reader count may be decremented before it is incremented. It - * is because the to-be-woken waiter may not have slept yet. 
So it - * may see waiter->task got cleared, finish its critical section and - * do an unlock before the reader count increment. - * - * 1) Collect the read-waiters in a separate list, count them and - * fully increment the reader count in rwsem. - * 2) For each waiters in the new list, clear waiter->task and - * put them into wake_q to be woken up later. - */ - list_for_each_entry(waiter, &sem->wait_list, list) { - if (waiter->type == RWSEM_WAITING_FOR_WRITE) - break; - - woken++; - } - list_cut_before(&wlist, &sem->wait_list, &waiter->list); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - lockevent_cond_inc(rwsem_wake_reader, woken); - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - } - - if (adjustment) - atomic_long_add(adjustment, &sem->count); - - /* 2nd pass */ - list_for_each_entry_safe(waiter, tmp, &wlist, list) { - struct task_struct *tsk; - - tsk = waiter->task; - get_task_struct(tsk); - - /* - * Ensure calling get_task_struct() before setting the reader - * waiter to nil such that rwsem_down_read_failed() cannot - * race with do_exit() by always holding a reference count - * to the task to wakeup. - */ - smp_store_release(&waiter->task, NULL); - /* - * Ensure issuing the wakeup (either by us or someone else) - * after setting the reader waiter to nil. - */ - wake_q_add_safe(wake_q, tsk); - } -} - -/* - * This function must be called with the sem->wait_lock held to prevent - * race conditions between checking the rwsem wait list and setting the - * sem->count accordingly. - */ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) -{ - /* - * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. - */ - if (count != RWSEM_WAITING_BIAS) - return false; - - /* - * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there - * are other tasks on the wait list, we need to add on WAITING_BIAS. - */ - count = list_is_singular(&sem->wait_list) ? 
- RWSEM_ACTIVE_WRITE_BIAS : - RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; - - if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) - == RWSEM_WAITING_BIAS) { - rwsem_set_owner(sem); - return true; - } - - return false; -} - -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -/* - * Try to acquire write lock before the writer has been put on wait queue. - */ -static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) -{ - long count = atomic_long_read(&sem->count); - - while (!count || count == RWSEM_WAITING_BIAS) { - if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, - count + RWSEM_ACTIVE_WRITE_BIAS)) { - rwsem_set_owner(sem); - lockevent_inc(rwsem_opt_wlock); - return true; - } - } - return false; -} - -static inline bool owner_on_cpu(struct task_struct *owner) -{ - /* - * As lock holder preemption issue, we both skip spinning if - * task is not on cpu or its cpu is preempted - */ - return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -} - -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner; - bool ret = true; - - BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); - - if (need_resched()) - return false; - - rcu_read_lock(); - owner = READ_ONCE(sem->owner); - if (owner) { - ret = is_rwsem_owner_spinnable(owner) && - owner_on_cpu(owner); - } - rcu_read_unlock(); - return ret; -} - -/* - * Return true only if we can still spin on the owner field of the rwsem. - */ -static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner = READ_ONCE(sem->owner); - - if (!is_rwsem_owner_spinnable(owner)) - return false; - - rcu_read_lock(); - while (owner && (READ_ONCE(sem->owner) == owner)) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking sem->owner still matches owner, if that fails, - * owner might point to free()d memory, if it still matches, - * the rcu_read_lock() ensures the memory stays valid. 
- */ - barrier(); - - /* - * abort spinning when need_resched or owner is not running or - * owner's cpu is preempted. - */ - if (need_resched() || !owner_on_cpu(owner)) { - rcu_read_unlock(); - return false; - } - - cpu_relax(); - } - rcu_read_unlock(); - - /* - * If there is a new owner or the owner is not set, we continue - * spinning. - */ - return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); -} - -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - bool taken = false; - - preempt_disable(); - - /* sem->wait_lock should not be held when doing optimistic spinning */ - if (!rwsem_can_spin_on_owner(sem)) - goto done; - - if (!osq_lock(&sem->osq)) - goto done; - - /* - * Optimistically spin on the owner field and attempt to acquire the - * lock whenever the owner changes. Spinning will be stopped when: - * 1) the owning writer isn't running; or - * 2) readers own the lock as we can't determine if they are - * actively running or not. - */ - while (rwsem_spin_on_owner(sem)) { - /* - * Try to acquire the lock - */ - if (rwsem_try_write_lock_unqueued(sem)) { - taken = true; - break; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!sem->owner && (need_resched() || rt_task(current))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. 
- */ - cpu_relax(); - } - osq_unlock(&sem->osq); -done: - preempt_enable(); - lockevent_cond_inc(rwsem_opt_fail, !taken); - return taken; -} - -/* - * Return true if the rwsem has active spinner - */ -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return osq_is_locked(&sem->osq); -} - -#else -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - return false; -} - -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return false; -} -#endif - -/* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - DEFINE_WAKE_Q(wake_q); - - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { - /* - * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_ACTIVE_READ_BIAS has already - * been set in the count. - */ - if (atomic_long_read(&sem->count) >= 0) { - raw_spin_unlock_irq(&sem->wait_lock); - rwsem_set_reader_owned(sem); - lockevent_inc(rwsem_rlock_fast); - return sem; - } - adjustment += RWSEM_WAITING_BIAS; - } - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! 
- */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - /* wait to be given the lock */ - while (true) { - set_current_state(state); - if (!waiter.task) - break; - if (signal_pending_state(state, current)) { - raw_spin_lock_irq(&sem->wait_lock); - if (waiter.task) - goto out_nolock; - raw_spin_unlock_irq(&sem->wait_lock); - break; - } - schedule(); - lockevent_inc(rwsem_sleep_reader); - } - - __set_current_state(TASK_RUNNING); - lockevent_inc(rwsem_rlock); - return sem; -out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); - raw_spin_unlock_irq(&sem->wait_lock); - __set_current_state(TASK_RUNNING); - lockevent_inc(rwsem_rlock_fail); - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - -/* - * Wait until we successfully acquire the write lock - */ -static inline struct rw_semaphore * -__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) -{ - long count; - bool waiting = true; /* any queued threads before us */ - struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; - DEFINE_WAKE_Q(wake_q); - - /* undo write bias from down_write operation, stop active locking */ - count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); - - /* do optimistic spinning and steal lock if possible */ - if (rwsem_optimistic_spin(sem)) - return sem; - - /* - * Optimistic spinning failed, proceed to the 
slowpath - * and block until we can acquire the sem. - */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - - raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - if (list_empty(&sem->wait_list)) - waiting = false; - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - if (waiting) { - count = atomic_long_read(&sem->count); - - /* - * If there were already threads queued before us and there are - * no active writers, the lock must be read owned; so we try to - * wake any read locks that were queued ahead of us. - */ - if (count > RWSEM_WAITING_BIAS) { - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); - - /* - * Reinitialize wake_q after use. - */ - wake_q_init(&wake_q); - } - - } else - count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); - - /* wait until we successfully acquire the lock */ - set_current_state(state); - while (true) { - if (rwsem_try_write_lock(count, sem)) - break; - raw_spin_unlock_irq(&sem->wait_lock); - - /* Block until there are no active lockers. 
*/ - do { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); - - raw_spin_lock_irq(&sem->wait_lock); - } - __set_current_state(TASK_RUNNING); - list_del(&waiter.list); - raw_spin_unlock_irq(&sem->wait_lock); - lockevent_inc(rwsem_wlock); - - return ret; - -out_nolock: - __set_current_state(TASK_RUNNING); - raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); - else - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - lockevent_inc(rwsem_wlock_fail); - - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed_killable); - -/* - * handle waking up a waiter on the semaphore - * - up_read/up_write has decremented the active part of count if we come here - */ -__visible -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - /* - * __rwsem_down_write_failed_common(sem) - * rwsem_optimistic_spin(sem) - * osq_unlock(sem->osq) - * ... - * atomic_long_add_return(&sem->count) - * - * - VS - - * - * __up_write() - * if (atomic_long_sub_return_release(&sem->count) < 0) - * rwsem_wake(sem) - * osq_is_locked(&sem->osq) - * - * And __up_write() must observe !osq_is_locked() when it observes the - * atomic_long_add_return() in order to not miss a wakeup. 
- * - * This boils down to: - * - * [S.rel] X = 1 [RmW] r0 = (Y += 0) - * MB RMB - * [RmW] Y += 1 [L] r1 = X - * - * exists (r0=1 /\ r1=0) - */ - smp_rmb(); - - /* - * If a spinner is present, it is not necessary to do the wakeup. - * Try to do wakeup only if the trylock succeeds to minimize - * spinlock contention which may introduce too much delay in the - * unlock operation. - * - * spinning writer up_write/up_read caller - * --------------- ----------------------- - * [S] osq_unlock() [L] osq - * MB RMB - * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) - * - * Here, it is important to make sure that there won't be a missed - * wakeup while the rwsem is free and the only spinning writer goes - * to sleep without taking the rwsem. Even when the spinning writer - * is just going to break out of the waiting loop, it will still do - * a trylock in rwsem_down_write_failed() before sleeping. IOW, if - * rwsem_has_spinner() is true, it will guarantee at least one - * trylock attempt on the rwsem later on. - */ - if (rwsem_has_spinner(sem)) { - /* - * The smp_rmb() here is to make sure that the spinner - * state is consulted before reading the wait_lock. 
- */ - smp_rmb(); - if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) - return sem; - goto locked; - } - raw_spin_lock_irqsave(&sem->wait_lock, flags); -locked: - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_wake); - -/* - * downgrade a write lock into a read lock - * - caller incremented waiting part of count and discovered it still negative - * - just wake up any readers at the front of the queue - */ -__visible -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ccbf18f560ff..37524a47f002 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -3,17 +3,1438 @@ * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h + * + * Writer lock-stealing by Alex Shi <alex.shi@intel.com> + * and Michel Lespinasse <walken@google.com> + * + * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> + * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. + * + * Rwsem count bit fields re-definition and rwsem rearchitecture by + * Waiman Long <longman@redhat.com> and + * Peter Zijlstra <peterz@infradead.org>. 
*/ #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/rt.h> +#include <linux/sched/task.h> #include <linux/sched/debug.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/signal.h> +#include <linux/sched/clock.h> #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> #include "rwsem.h" +#include "lock_events.h" + +/* + * The least significant 3 bits of the owner value have the following + * meanings when set. + * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers + * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock. + * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock. + * + * When the rwsem is either owned by an anonymous writer, or it is + * reader-owned, but a spinning writer has timed out, both nonspinnable + * bits will be set to disable optimistic spinning by readers and writers. + * In the latter case, the last unlocking reader should then check the + * writer nonspinnable bit and clear it only to give writers preference + * to acquire the lock via optimistic spinning, but not readers. Similar + * action is also done in the reader slowpath. + * + * When a writer acquires a rwsem, it puts its task_struct pointer + * into the owner field. It is cleared after an unlock. + * + * When a reader acquires a rwsem, it will also put its task_struct + * pointer into the owner field with the RWSEM_READER_OWNED bit set. + * On unlock, the owner field will largely be left untouched. So + * for a free or reader-owned rwsem, the owner value may contain + * information about the last reader that acquires the rwsem. + * + * That information may be helpful in debugging cases where the system + * seems to hang on a reader owned rwsem especially if only one reader + * is involved. Ideally we would like to track all the readers that own + * a rwsem, but the overhead is simply too big.
+ * + * Reader optimistic spinning is helpful when the reader critical section + * is short and there aren't that many readers around. It makes readers + * relatively more preferred than writers. When a writer times out spinning + * on a reader-owned lock and sets the nonspinnable bits, there are two main + * reasons for that. + * + * 1) The reader critical section is long, perhaps the task sleeps after + * acquiring the read lock. + * 2) There are just too many readers contending the lock causing it to + * take a while to service all of them. + * + * In the former case, long reader critical section will impede the progress + * of writers which is usually more important for system performance. In + * the latter case, reader optimistic spinning tends to make the reader + * groups that contain readers that acquire the lock together smaller + * leading to more of them. That may hurt performance in some cases. In + * other words, the setting of nonspinnable bits indicates that reader + * optimistic spinning may not be helpful for those workloads that cause + * it. + * + * Therefore, any writers that had observed the setting of the writer + * nonspinnable bit for a given rwsem after they fail to acquire the lock + * via optimistic spinning will set the reader nonspinnable bit once they + * acquire the write lock. Similarly, readers that observe the setting + * of reader nonspinnable bit at slowpath entry will set the reader + * nonspinnable bits when they acquire the read lock via the wakeup path. + * + * Once the reader nonspinnable bit is on, it will only be reset when + * a writer is able to acquire the rwsem in the fast path or somehow a + * reader or writer in the slowpath doesn't observe the nonspinnable bit. + * + * This is to discourage reader optimistic spinning on that particular + * rwsem and make writers more preferred. This adaptive disabling of reader + * optimistic spinning will alleviate the negative side effect of this + * feature.
+ */ +#define RWSEM_READER_OWNED (1UL << 0) +#define RWSEM_RD_NONSPINNABLE (1UL << 1) +#define RWSEM_WR_NONSPINNABLE (1UL << 2) +#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE) +#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) + +#ifdef CONFIG_DEBUG_RWSEMS +# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ + if (!debug_locks_silent && \ + WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ + #c, atomic_long_read(&(sem)->count), \ + atomic_long_read(&(sem)->owner), (long)current, \ + list_empty(&(sem)->wait_list) ? "" : "not ")) \ + debug_locks_off(); \ + } while (0) +#else +# define DEBUG_RWSEMS_WARN_ON(c, sem) +#endif + +/* + * On 64-bit architectures, the bit definitions of the count are: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved + * Bits 8-62 - 55-bit reader count + * Bit 63 - read fail bit + * + * On 32-bit architectures, the bit definitions of the count are: + * + * Bit 0 - writer locked bit + * Bit 1 - waiters present bit + * Bit 2 - lock handoff bit + * Bits 3-7 - reserved + * Bits 8-30 - 23-bit reader count + * Bit 31 - read fail bit + * + * It is not likely that the most significant bit (read fail bit) will ever + * be set. This guard bit is still checked anyway in the down_read() fastpath + * just in case we need to use up more of the reader bits for other purpose + * in the future. + * + * atomic_long_fetch_add() is used to obtain reader lock, whereas + * atomic_long_cmpxchg() will be used to obtain writer lock. + * + * There are three places where the lock handoff bit may be set or cleared. + * 1) rwsem_mark_wake() for readers. + * 2) rwsem_try_write_lock() for writers. + * 3) Error path of rwsem_down_write_slowpath(). + * + * For all the above cases, wait_lock will be held. A writer must also + * be the first one in the wait_list to be eligible for setting the handoff + * bit. 
So concurrent setting/clearing of handoff bit is not possible. + */ +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define RWSEM_FLAG_WAITERS (1UL << 1) +#define RWSEM_FLAG_HANDOFF (1UL << 2) +#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1)) + +#define RWSEM_READER_SHIFT 8 +#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) +#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) +#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED +#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) +#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ + RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL) + +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + atomic_long_set(&sem->owner, (long)current); +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + atomic_long_set(&sem->owner, 0); +} + +/* + * Test the flags in the owner field. + */ +static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) +{ + return atomic_long_read(&sem->owner) & flags; +} + +/* + * The task_struct pointer of the last owning reader will be left in + * the owner field. + * + * Note that the owner value just indicates the task has owned the rwsem + * previously, it may not be the real owner or one of the real owners + * anymore when that field is examined, so take it with a grain of salt. + * + * The reader non-spinnable bit is preserved. 
+ */ +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | + (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); + + atomic_long_set(&sem->owner, val); +} + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + __rwsem_set_reader_owned(sem, current); +} + +/* + * Return true if the rwsem is owned by a reader. + */ +static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +{ +#ifdef CONFIG_DEBUG_RWSEMS + /* + * Check the count to see if it is write-locked. + */ + long count = atomic_long_read(&sem->count); + + if (count & RWSEM_WRITER_MASK) + return false; +#endif + return rwsem_test_oflags(sem, RWSEM_READER_OWNED); +} + +#ifdef CONFIG_DEBUG_RWSEMS +/* + * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there + * is a task pointer in owner of a reader-owned rwsem, it will be the + * real owner or one of the real owners. The only exception is when the + * unlock is done by up_read_non_owner(). + */ +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ + unsigned long val = atomic_long_read(&sem->owner); + + while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { + if (atomic_long_try_cmpxchg(&sem->owner, &val, + val & RWSEM_OWNER_FLAGS_MASK)) + return; + } +} +#else +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ +} +#endif + +/* + * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag + * remains set. Otherwise, the operation will be aborted. 
+ */ +static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + do { + if (!(owner & RWSEM_READER_OWNED)) + break; + if (owner & RWSEM_NONSPINNABLE) + break; + } while (!atomic_long_try_cmpxchg(&sem->owner, &owner, + owner | RWSEM_NONSPINNABLE)); +} + +static inline bool rwsem_read_trylock(struct rw_semaphore *sem) +{ + long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + if (WARN_ON_ONCE(cnt < 0)) + rwsem_set_nonspinnable(sem); + return !(cnt & RWSEM_READ_FAILED_MASK); +} + +/* + * Return just the real task structure pointer of the owner + */ +static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +{ + return (struct task_struct *) + (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); +} + +/* + * Return the real task structure pointer of the owner and the embedded + * flags in the owner. pflags must be non-NULL. + */ +static inline struct task_struct * +rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + *pflags = owner & RWSEM_OWNER_FLAGS_MASK; + return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); +} + +/* + * Guide to the rw_semaphore's count field. + * + * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned + * by a writer. + * + * The lock is owned by readers when + * (1) the RWSEM_WRITER_LOCKED isn't set in count, + * (2) some of the reader bits are set in count, and + * (3) the owner field has RWSEM_READ_OWNED bit set. + * + * Having some reader bits set is not enough to guarantee a readers owned + * lock as the readers may be in the process of backing out from the count + * and a writer has just released the lock. So another writer may steal + * the lock immediately after that. 
+ */ + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); + atomic_long_set(&sem->owner, 0L); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + osq_lock_init(&sem->osq); +#endif +} +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; + unsigned long timeout; + unsigned long last_rowner; +}; +#define rwsem_first_waiter(sem) \ + list_first_entry(&sem->wait_list, struct rwsem_waiter, list) + +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; + +enum writer_wait_state { + WRITER_NOT_FIRST, /* Writer is not first in wait list */ + WRITER_FIRST, /* Writer is first in wait list */ + WRITER_HANDOFF /* Writer is first & handoff needed */ +}; + +/* + * The typical HZ value is either 250 or 1000. So set the minimum waiting + * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait + * queue before initiating the handoff protocol. + */ +#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) + +/* + * Magic number to batch-wakeup waiting readers, even when writers are + * also present in the queue. This both limits the amount of work the + * waking thread must do and also prevents any potential counter overflow, + * however unlikely. 
+ */ +#define MAX_READERS_WAKEUP 0x100 + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must + * have been set. + * - there must be someone on the queue + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only marked woken if downgrading is false + */ +static void rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) +{ + struct rwsem_waiter *waiter, *tmp; + long oldcount, woken = 0, adjustment = 0; + struct list_head wlist; + + lockdep_assert_held(&sem->wait_lock); + + /* + * Take a peek at the queue head waiter such that we can determine + * the wakeup(s) to perform. + */ + waiter = rwsem_first_waiter(sem); + + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. + */ + wake_q_add(wake_q, waiter->task); + lockevent_inc(rwsem_wake_writer); + } + + return; + } + + /* + * No reader wakeup if there are too many of them already. + */ + if (unlikely(atomic_long_read(&sem->count) < 0)) + return; + + /* + * Writers might steal the lock before we grant it to the next reader. + * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. 
+ */ + if (wake_type != RWSEM_WAKE_READ_OWNED) { + struct task_struct *owner; + + adjustment = RWSEM_READER_BIAS; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); + if (unlikely(oldcount & RWSEM_WRITER_MASK)) { + /* + * When we've been waiting "too" long (for writers + * to give up the lock), request a HANDOFF to + * force the issue. + */ + if (!(oldcount & RWSEM_FLAG_HANDOFF) && + time_after(jiffies, waiter->timeout)) { + adjustment -= RWSEM_FLAG_HANDOFF; + lockevent_inc(rwsem_rlock_handoff); + } + + atomic_long_add(-adjustment, &sem->count); + return; + } + /* + * Set it to reader-owned to give spinners an early + * indication that readers now have the lock. + * The reader nonspinnable bit seen at slowpath entry of + * the reader is copied over. + */ + owner = waiter->task; + if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { + owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); + lockevent_inc(rwsem_opt_norspin); + } + __rwsem_set_reader_owned(sem, owner); + } + + /* + * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the + * queue. We know that the woken will be at least 1 as we accounted + * for above. Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. + * + * This is an adaptation of the phase-fair R/W locks where at the + * reader phase (first waiter is a reader), all readers are eligible + * to acquire the lock at the same time irrespective of their order + * in the queue. The writers acquire the lock according to their + * order in the queue. + * + * We have to do wakeup in 2 passes to prevent the possibility that + * the reader count may be decremented before it is incremented. It + * is because the to-be-woken waiter may not have slept yet. So it + * may see waiter->task got cleared, finish its critical section and + * do an unlock before the reader count increment. 
+ * + * 1) Collect the read-waiters in a separate list, count them and + * fully increment the reader count in rwsem. + * 2) For each waiters in the new list, clear waiter->task and + * put them into wake_q to be woken up later. + */ + INIT_LIST_HEAD(&wlist); + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + if (waiter->type == RWSEM_WAITING_FOR_WRITE) + continue; + + woken++; + list_move_tail(&waiter->list, &wlist); + + /* + * Limit # of readers that can be woken up per wakeup call. + */ + if (woken >= MAX_READERS_WAKEUP) + break; + } + + adjustment = woken * RWSEM_READER_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_FLAG_WAITERS; + } + + /* + * When we've woken a reader, we no longer need to force writers + * to give up the lock and we can clear HANDOFF. + */ + if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) + adjustment -= RWSEM_FLAG_HANDOFF; + + if (adjustment) + atomic_long_add(adjustment, &sem->count); + + /* 2nd pass */ + list_for_each_entry_safe(waiter, tmp, &wlist, list) { + struct task_struct *tsk; + + tsk = waiter->task; + get_task_struct(tsk); + + /* + * Ensure calling get_task_struct() before setting the reader + * waiter to nil such that rwsem_down_read_slowpath() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. + */ + smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add_safe(wake_q, tsk); + } +} + +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + * + * If wstate is WRITER_HANDOFF, it will make sure that either the handoff + * bit is set or the lock is acquired with handoff bit cleared. 
+ */ +static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, + enum writer_wait_state wstate) +{ + long count, new; + + lockdep_assert_held(&sem->wait_lock); + + count = atomic_long_read(&sem->count); + do { + bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); + + if (has_handoff && wstate == WRITER_NOT_FIRST) + return false; + + new = count; + + if (count & RWSEM_LOCK_MASK) { + if (has_handoff || (wstate != WRITER_HANDOFF)) + return false; + + new |= RWSEM_FLAG_HANDOFF; + } else { + new |= RWSEM_WRITER_LOCKED; + new &= ~RWSEM_FLAG_HANDOFF; + + if (list_is_singular(&sem->wait_list)) + new &= ~RWSEM_FLAG_WAITERS; + } + } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); + + /* + * We have either acquired the lock with handoff bit cleared or + * set the handoff bit. + */ + if (new & RWSEM_FLAG_HANDOFF) + return false; + + rwsem_set_owner(sem); + return true; +} + +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * Try to acquire read lock before the reader is put on wait queue. + * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff + * is ongoing. + */ +static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + + if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) + return false; + + count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); + if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_opt_rlock); + return true; + } + + /* Back out the change */ + atomic_long_add(-RWSEM_READER_BIAS, &sem->count); + return false; +} + +/* + * Try to acquire write lock before the writer has been put on wait queue. 
+ */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + + while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, + count | RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + lockevent_inc(rwsem_opt_wlock); + return true; + } + } + return false; +} + +static inline bool owner_on_cpu(struct task_struct *owner) +{ + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, + unsigned long nonspinnable) +{ + struct task_struct *owner; + unsigned long flags; + bool ret = true; + + BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); + + if (need_resched()) { + lockevent_inc(rwsem_opt_fail); + return false; + } + + preempt_disable(); + rcu_read_lock(); + owner = rwsem_owner_flags(sem, &flags); + if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner))) + ret = false; + rcu_read_unlock(); + preempt_enable(); + + lockevent_cond_inc(rwsem_opt_fail, !ret); + return ret; +} + +/* + * The rwsem_spin_on_owner() function returns the folowing 4 values + * depending on the lock owner state. + * OWNER_NULL : owner is currently NULL + * OWNER_WRITER: when owner changes and is a writer + * OWNER_READER: when owner changes and the new owner may be a reader. + * OWNER_NONSPINNABLE: + * when optimistic spinning has to stop because either the + * owner stops running, is unknown, or its timeslice has + * been used up. 
+ */ +enum owner_state { + OWNER_NULL = 1 << 0, + OWNER_WRITER = 1 << 1, + OWNER_READER = 1 << 2, + OWNER_NONSPINNABLE = 1 << 3, +}; +#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) + +static inline enum owner_state +rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) +{ + if (flags & nonspinnable) + return OWNER_NONSPINNABLE; + + if (flags & RWSEM_READER_OWNED) + return OWNER_READER; + + return owner ? OWNER_WRITER : OWNER_NULL; +} + +static noinline enum owner_state +rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +{ + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; + + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags, nonspinnable); + if (state != OWNER_WRITER) + return state; + + rcu_read_lock(); + for (;;) { + if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { + state = OWNER_NONSPINNABLE; + break; + } + + new = rwsem_owner_flags(sem, &new_flags); + if ((new != owner) || (new_flags != flags)) { + state = rwsem_owner_state(new, new_flags, nonspinnable); + break; + } + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking sem->owner still matches owner, if that fails, + * owner might point to free()d memory, if it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (need_resched() || !owner_on_cpu(owner)) { + state = OWNER_NONSPINNABLE; + break; + } + + cpu_relax(); + } + rcu_read_unlock(); + + return state; +} + +/* + * Calculate reader-owned rwsem spinning threshold for writer + * + * The more readers own the rwsem, the longer it will take for them to + * wind down and free the rwsem. So the empirical formula used to + * determine the actual spinning time limit here is: + * + * Spinning threshold = (10 + nr_readers/2)us + * + * The limit is capped to a maximum of 25us (30 readers). 
This is just + * a heuristic and is subjected to change in the future. + */ +static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + int readers = count >> RWSEM_READER_SHIFT; + u64 delta; + + if (readers > 30) + readers = 30; + delta = (20 + readers) * NSEC_PER_USEC / 2; + + return sched_clock() + delta; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +{ + bool taken = false; + int prev_owner_state = OWNER_NULL; + int loop = 0; + u64 rspin_threshold = 0; + unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE + : RWSEM_RD_NONSPINNABLE; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!osq_lock(&sem->osq)) + goto done; + + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock and spinning time has exceeded limit. + */ + for (;;) { + enum owner_state owner_state; + + owner_state = rwsem_spin_on_owner(sem, nonspinnable); + if (!(owner_state & OWNER_SPINNABLE)) + break; + + /* + * Try to acquire the lock + */ + taken = wlock ? rwsem_try_write_lock_unqueued(sem) + : rwsem_try_read_lock_unqueued(sem); + + if (taken) + break; + + /* + * Time-based reader-owned rwsem optimistic spinning + */ + if (wlock && (owner_state == OWNER_READER)) { + /* + * Re-initialize rspin_threshold every time when + * the owner state changes from non-reader to reader. + * This allows a writer to steal the lock in between + * 2 reader phases and have the threshold reset at + * the beginning of the 2nd reader phase. 
+ */ + if (prev_owner_state != OWNER_READER) { + if (rwsem_test_oflags(sem, nonspinnable)) + break; + rspin_threshold = rwsem_rspin_threshold(sem); + loop = 0; + } + + /* + * Check time threshold once every 16 iterations to + * avoid calling sched_clock() too frequently so + * as to reduce the average latency between the times + * when the lock becomes free and when the spinner + * is ready to do a trylock. + */ + else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) { + rwsem_set_nonspinnable(sem); + lockevent_inc(rwsem_opt_nospin); + break; + } + } + + /* + * An RT task cannot do optimistic spinning if it cannot + * be sure the lock holder is running or live-lock may + * happen if the current task and the lock holder happen + * to run in the same CPU. However, aborting optimistic + * spinning while a NULL owner is detected may miss some + * opportunity where spinning can continue without causing + * problem. + * + * There are 2 possible cases where an RT task may be able + * to continue spinning. + * + * 1) The lock owner is in the process of releasing the + * lock, sem->owner is cleared but the lock has not + * been released yet. + * 2) The lock was free and owner cleared, but another + * task just comes in and acquire the lock before + * we try to get it. The new owner may be a spinnable + * writer. + * + * To take advantage of two scenarios listed agove, the RT + * task is made to retry one more time to see if it can + * acquire the lock or continue spinning on the new owning + * writer. Of course, if the time lag is long enough or the + * new owner is not a writer or spinnable, the RT task will + * quit spinning. + * + * If the owner is a writer, the need_resched() check is + * done inside rwsem_spin_on_owner(). If the owner is not + * a writer, need_resched() check needs to be done here. 
+ */ + if (owner_state != OWNER_WRITER) { + if (need_resched()) + break; + if (rt_task(current) && + (prev_owner_state != OWNER_WRITER)) + break; + } + prev_owner_state = owner_state; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + lockevent_cond_inc(rwsem_opt_fail, !taken); + return taken; +} + +/* + * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should + * only be called when the reader count reaches 0. + * + * This give writers better chance to acquire the rwsem first before + * readers when the rwsem was being held by readers for a relatively long + * period of time. Race can happen that an optimistic spinner may have + * just stolen the rwsem and set the owner, but just clearing the + * RWSEM_WR_NONSPINNABLE bit will do no harm anyway. + */ +static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) +{ + if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) + atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); +} + +/* + * This function is called when the reader fails to acquire the lock via + * optimistic spinning. In this case we will still attempt to do a trylock + * when comparing the rwsem state right now with the state when entering + * the slowpath indicates that the reader is still in a valid reader phase. + * This happens when the following conditions are true: + * + * 1) The lock is currently reader owned, and + * 2) The lock is previously not reader-owned or the last read owner changes. + * + * In the former case, we have transitioned from a writer phase to a + * reader-phase while spinning. In the latter case, it means the reader + * phase hasn't ended when we entered the optimistic spinning loop. In + * both cases, the reader is eligible to acquire the lock. 
This is the + * secondary path where a read lock is acquired optimistically. + * + * The reader non-spinnable bit wasn't set at time of entry or it will + * not be here at all. + */ +static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, + unsigned long last_rowner) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + if (!(owner & RWSEM_READER_OWNED)) + return false; + + if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && + rwsem_try_read_lock_unqueued(sem)) { + lockevent_inc(rwsem_opt_rlock2); + lockevent_add(rwsem_opt_fail, -1); + return true; + } + return false; +} +#else +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, + unsigned long nonspinnable) +{ + return false; +} + +static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +{ + return false; +} + +static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } + +static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, + unsigned long last_rowner) +{ + return false; +} +#endif + +/* + * Wait for the read lock to be granted + */ +static struct rw_semaphore __sched * +rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) +{ + long count, adjustment = -RWSEM_READER_BIAS; + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + bool wake = false; + + /* + * Save the current read-owner of rwsem, if available, and the + * reader nonspinnable bit. + */ + waiter.last_rowner = atomic_long_read(&sem->owner); + if (!(waiter.last_rowner & RWSEM_READER_OWNED)) + waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; + + if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) + goto queue; + + /* + * Undo read bias from down_read() and do optimistic spinning. + */ + atomic_long_add(-RWSEM_READER_BIAS, &sem->count); + adjustment = 0; + if (rwsem_optimistic_spin(sem, false)) { + /* + * Wake up other readers in the wait list if the front + * waiter is a reader. 
+ */ + if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { + raw_spin_lock_irq(&sem->wait_lock); + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, + &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + } + return sem; + } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { + return sem; + } + +queue: + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_READ; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer or has the handoff bit set, this reader can + * exit the slowpath and return immediately as its + * RWSEM_READER_BIAS has already been set in the count. + */ + if (adjustment && !(atomic_long_read(&sem->count) & + (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + raw_spin_unlock_irq(&sem->wait_lock); + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_fast); + return sem; + } + adjustment += RWSEM_FLAG_WAITERS; + } + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + if (adjustment) + count = atomic_long_add_return(adjustment, &sem->count); + else + count = atomic_long_read(&sem->count); + + /* + * If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! 
+ */ + if (!(count & RWSEM_LOCK_MASK)) { + clear_wr_nonspinnable(sem); + wake = true; + } + if (wake || (!(count & RWSEM_WRITER_MASK) && + (adjustment & RWSEM_FLAG_WAITERS))) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + + /* wait to be given the lock */ + while (true) { + set_current_state(state); + if (!waiter.task) + break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } + schedule(); + lockevent_inc(rwsem_sleep_reader); + } + + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock); + return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) { + atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, + &sem->count); + } + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); + return ERR_PTR(-EINTR); +} + +/* + * This function is called by the a write lock owner. So the owner value + * won't get changed by others. + */ +static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, + bool disable) +{ + if (unlikely(disable)) { + atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); + lockevent_inc(rwsem_opt_norspin); + } +} + +/* + * Wait until we successfully acquire the write lock + */ +static struct rw_semaphore * +rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) +{ + long count; + bool disable_rspin; + enum writer_wait_state wstate; + struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; + DEFINE_WAKE_Q(wake_q); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && + rwsem_optimistic_spin(sem, true)) + return sem; + + /* + * Disable reader optimistic spinning for this rwsem after + * acquiring the write lock when the setting of the nonspinnable + * bits are observed. 
+ */ + disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_WRITE; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + + raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ + wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; + + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock */ + if (wstate == WRITER_NOT_FIRST) { + count = atomic_long_read(&sem->count); + + /* + * If there were already threads queued before us and: + * 1) there are no no active locks, wake the front + * queued process(es) as the handoff bit might be set. + * 2) there are no active writers and some readers, the lock + * must be read owned; so we try to wake any read lock + * waiters that were queued ahead of us. + */ + if (count & RWSEM_WRITER_MASK) + goto wait; + + rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) + ? RWSEM_WAKE_READERS + : RWSEM_WAKE_ANY, &wake_q); + + if (!wake_q_empty(&wake_q)) { + /* + * We want to minimize wait_lock hold time especially + * when a large number of readers are to be woken up. + */ + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + wake_q_init(&wake_q); /* Used again, reinit */ + raw_spin_lock_irq(&sem->wait_lock); + } + } else { + atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); + } + +wait: + /* wait until we successfully acquire the lock */ + set_current_state(state); + while (true) { + if (rwsem_try_write_lock(sem, wstate)) + break; + + raw_spin_unlock_irq(&sem->wait_lock); + + /* Block until there are no active lockers. */ + for (;;) { + if (signal_pending_state(state, current)) + goto out_nolock; + + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); + /* + * If HANDOFF bit is set, unconditionally do + * a trylock. 
+ */ + if (wstate == WRITER_HANDOFF) + break; + + if ((wstate == WRITER_NOT_FIRST) && + (rwsem_first_waiter(sem) == &waiter)) + wstate = WRITER_FIRST; + + count = atomic_long_read(&sem->count); + if (!(count & RWSEM_LOCK_MASK)) + break; + + /* + * The setting of the handoff bit is deferred + * until rwsem_try_write_lock() is called. + */ + if ((wstate == WRITER_FIRST) && (rt_task(current) || + time_after(jiffies, waiter.timeout))) { + wstate = WRITER_HANDOFF; + lockevent_inc(rwsem_wlock_handoff); + break; + } + } + + raw_spin_lock_irq(&sem->wait_lock); + } + __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + rwsem_disable_reader_optspin(sem, disable_rspin); + raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); + + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); + list_del(&waiter.list); + + if (unlikely(wstate == WRITER_HANDOFF)) + atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); + + if (list_empty(&sem->wait_list)) + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); + else + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + lockevent_inc(rwsem_wlock_fail); + + return ERR_PTR(-EINTR); +} + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +static struct rw_semaphore 
*rwsem_downgrade_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} + +/* + * lock for reading + */ +inline void __down_read(struct rw_semaphore *sem) +{ + if (!rwsem_read_trylock(sem)) { + rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } else { + rwsem_set_reader_owned(sem); + } +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (!rwsem_read_trylock(sem)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + /* + * Optimize for the case when the rwsem is not locked at all. 
+ */ + long tmp = RWSEM_UNLOCKED_VALUE; + + do { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + tmp + RWSEM_READER_BIAS)) { + rwsem_set_reader_owned(sem); + return 1; + } + } while (!(tmp & RWSEM_READ_FAILED_MASK)); + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED))) + rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); + else + rwsem_set_owner(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED))) { + if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) + return -EINTR; + } else { + rwsem_set_owner(sem); + } + return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + return true; + } + return false; +} + +/* + * unlock after reading + */ +inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + rwsem_clear_reader_owned(sem); + tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); + DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); + if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == + RWSEM_FLAG_WAITERS)) { + clear_wr_nonspinnable(sem); + rwsem_wake(sem, tmp); + } +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * sem->owner may differ from current if the ownership is transferred + * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. 
+ */ + DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && + !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + rwsem_clear_owner(sem); + tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); + if (unlikely(tmp & RWSEM_FLAG_WAITERS)) + rwsem_wake(sem, tmp); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. + */ + DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); + tmp = atomic_long_fetch_add_release( + -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); + rwsem_set_reader_owned(sem); + if (tmp & RWSEM_FLAG_WAITERS) + rwsem_downgrade_wake(sem); +} /* * lock for reading @@ -25,7 +1446,6 @@ void __sched down_read(struct rw_semaphore *sem) LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } - EXPORT_SYMBOL(down_read); int __sched down_read_killable(struct rw_semaphore *sem) @@ -40,7 +1460,6 @@ int __sched down_read_killable(struct rw_semaphore *sem) return 0; } - EXPORT_SYMBOL(down_read_killable); /* @@ -54,7 +1473,6 @@ int down_read_trylock(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); return ret; } - EXPORT_SYMBOL(down_read_trylock); /* @@ -64,10 +1482,8 @@ void __sched down_write(struct rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - EXPORT_SYMBOL(down_write); /* @@ -78,14 +1494,14 @@ int __sched down_write_killable(struct rw_semaphore *sem) might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + 
__down_write_killable)) { rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } return 0; } - EXPORT_SYMBOL(down_write_killable); /* @@ -100,7 +1516,6 @@ int down_write_trylock(struct rw_semaphore *sem) return ret; } - EXPORT_SYMBOL(down_write_trylock); /* @@ -109,10 +1524,8 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - __up_read(sem); } - EXPORT_SYMBOL(up_read); /* @@ -121,10 +1534,8 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - __up_write(sem); } - EXPORT_SYMBOL(up_write); /* @@ -133,10 +1544,8 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); - __downgrade_write(sem); } - EXPORT_SYMBOL(downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -145,40 +1554,32 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); } - EXPORT_SYMBOL(down_read_nested); void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - EXPORT_SYMBOL(_down_write_nest_lock); void down_read_non_owner(struct rw_semaphore *sem) { might_sleep(); - __down_read(sem); __rwsem_set_reader_owned(sem, NULL); } - EXPORT_SYMBOL(down_read_non_owner); void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); } - EXPORT_SYMBOL(down_write_nested); int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) @@ -186,23 +1587,21 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire(&sem->dep_map, 
subclass, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + __down_write_killable)) { rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } return 0; } - EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), - sem); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); } - EXPORT_SYMBOL(up_read_non_owner); #endif diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 64877f5294e3..2534ce49f648 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,304 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * The least significant 2 bits of the owner value has the following - * meanings when set. - * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers - * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, - * i.e. the owner(s) cannot be readily determined. It can be reader - * owned or the owning writer is indeterminate. - * - * When a writer acquires a rwsem, it puts its task_struct pointer - * into the owner field. It is cleared after an unlock. - * - * When a reader acquires a rwsem, it will also puts its task_struct - * pointer into the owner field with both the RWSEM_READER_OWNED and - * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will - * largely be left untouched. So for a free or reader-owned rwsem, - * the owner value may contain information about the last reader that - * acquires the rwsem. The anonymous bit is set because that particular - * reader may or may not still own the lock. - * - * That information may be helpful in debugging cases where the system - * seems to hang on a reader owned rwsem especially if only one reader - * is involved. Ideally we would like to track all the readers that own - * a rwsem, but the overhead is simply too big. 
- */ -#include "lock_events.h" -#define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) +#ifndef __INTERNAL_RWSEM_H +#define __INTERNAL_RWSEM_H +#include <linux/rwsem.h> -#ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ - if (!debug_locks_silent && \ - WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ - #c, atomic_long_read(&(sem)->count), \ - (long)((sem)->owner), (long)current, \ - list_empty(&(sem)->wait_list) ? "" : "not ")) \ - debug_locks_off(); \ - } while (0) -#else -# define DEBUG_RWSEMS_WARN_ON(c, sem) -#endif +extern void __down_read(struct rw_semaphore *sem); +extern void __up_read(struct rw_semaphore *sem); -/* - * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. - * Adapted largely from include/asm-i386/rwsem.h - * by Paul Mackerras <paulus@samba.org>. - */ - -/* - * the semaphore definition - */ -#ifdef CONFIG_64BIT -# define RWSEM_ACTIVE_MASK 0xffffffffL -#else -# define RWSEM_ACTIVE_MASK 0x0000ffffL -#endif - -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -/* - * All writes to owner are protected by WRITE_ONCE() to make sure that - * store tearing can't happen as optimistic spinners may read and use - * the owner value concurrently without lock. Read from owner, however, - * may not need READ_ONCE() as long as the pointer value is only used - * for comparison and isn't being dereferenced. - */ -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, current); -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, NULL); -} - -/* - * The task_struct pointer of the last owning reader will be left in - * the owner field. 
- * - * Note that the owner value just indicates the task has owned the rwsem - * previously, it may not be the real owner or one of the real owners - * anymore when that field is examined, so take it with a grain of salt. - */ -static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, - struct task_struct *owner) -{ - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; - - WRITE_ONCE(sem->owner, (struct task_struct *)val); -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ - __rwsem_set_reader_owned(sem, current); -} - -/* - * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock, i.e. the lock is not anonymously owned. - * N.B. !owner is considered spinnable. - */ -static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) -{ - return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); -} - -/* - * Return true if rwsem is owned by an anonymous writer or readers. - */ -static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) -{ - return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; -} - -#ifdef CONFIG_DEBUG_RWSEMS -/* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). 
- */ -#define rwsem_clear_reader_owned rwsem_clear_reader_owned -static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) -{ - unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; - if (READ_ONCE(sem->owner) == (struct task_struct *)val) - cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); -} -#endif - -#else -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ -} - -static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, - struct task_struct *owner) -{ -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ -} -#endif - -#ifndef rwsem_clear_reader_owned -static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) -{ -} -#endif - -extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); -extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); - -/* - * lock for reading - */ -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { - rwsem_down_read_failed(sem); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); - } else { - rwsem_set_reader_owned(sem); - } -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & - RWSEM_READER_OWNED), sem); - } else { - 
rwsem_set_reader_owned(sem); - } - return 0; -} - -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - /* - * Optimize for the case when the rwsem is not locked at all. - */ - long tmp = RWSEM_UNLOCKED_VALUE; - - lockevent_inc(rwsem_rtrylock); - do { - if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_ACTIVE_READ_BIAS)) { - rwsem_set_reader_owned(sem); - return 1; - } - } while (tmp >= 0); - return 0; -} - -/* - * lock for writing - */ -static inline void __down_write(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) - rwsem_down_write_failed(sem); - rwsem_set_owner(sem); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - long tmp; - - tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count); - if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - rwsem_set_owner(sem); - return 0; -} - -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long tmp; - - lockevent_inc(rwsem_wtrylock); - tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - if (tmp == RWSEM_UNLOCKED_VALUE) { - rwsem_set_owner(sem); - return true; - } - return false; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), - sem); - rwsem_clear_reader_owned(sem); - tmp = atomic_long_dec_return_release(&sem->count); - if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); - rwsem_clear_owner(sem); - if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, - &sem->count) < 
0)) - rwsem_wake(sem); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long tmp; - - /* - * When downgrading from exclusive to shared ownership, - * anything inside the write-locked region cannot leak - * into the read side. In contrast, anything in the - * read-locked region is ok to be re-ordered into the - * write side. As such, rely on RELEASE semantics. - */ - DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); - tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); - rwsem_set_reader_owned(sem); - if (tmp < 0) - rwsem_downgrade_wake(sem); -} +#endif /* __INTERNAL_RWSEM_H */ diff --git a/kernel/module.c b/kernel/module.c index 80c7c09584cf..a2cee14a83f3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3083,6 +3083,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints_ptrs), &mod->num_tracepoints); #endif +#ifdef CONFIG_TREE_SRCU + mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs", + sizeof(*mod->srcu_struct_ptrs), + &mod->num_srcu_structs); +#endif #ifdef CONFIG_BPF_EVENTS mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", sizeof(*mod->bpf_raw_events), diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 9505101ed2bc..096211299c07 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -493,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state) pm_suspend_target_state = state; + if (state == PM_SUSPEND_TO_IDLE) + pm_set_suspend_no_platform(); + error = platform_suspend_begin(state); if (error) goto Close; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8456b6e2205f..83a531cea2f3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -79,9 +79,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, */ static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - rcu_read_lock(); - __ptrace_link(child, 
new_parent, __task_cred(new_parent)); - rcu_read_unlock(); + __ptrace_link(child, new_parent, current_cred()); } /** @@ -118,6 +116,9 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#endif child->parent = child->real_parent; list_del_init(&child->ptrace_entry); diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 390aab20115e..5290b01de534 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -446,6 +446,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR }; @@ -479,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +#endif + #ifdef CONFIG_TINY_SRCU static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efaa5b3f4d3f..fce4e7e6f502 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -299,6 +299,7 @@ struct rcu_torture_ops { int irq_capable; int can_boost; int extendables; + int slow_gps; const char *name; }; @@ -667,9 +668,51 @@ static struct rcu_torture_ops tasks_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, + .slow_gps = 1, .name = "tasks" }; +/* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. 
+ */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1010,10 +1053,17 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - if (stutter_wait("rcu_torture_writer")) + if (stutter_wait("rcu_torture_writer") && + !READ_ONCE(rcu_fwd_cb_nodelay) && + !cur_ops->slow_gps && + !torture_must_stop()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) - if (list_empty(&rcu_tortures[i].rtort_free)) - WARN_ON_ONCE(1); + if (list_empty(&rcu_tortures[i].rtort_free) && + rcu_access_pointer(rcu_torture_current) != + &rcu_tortures[i]) { + rcu_ftrace_dump(DUMP_ALL); + WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + } } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) @@ -1358,8 +1408,9 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); - pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, + rcu_torture_current ? 
"ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), @@ -1661,6 +1712,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) spin_unlock_irqrestore(&rcu_fwd_lock, flags); } +// Give the scheduler a chance, even on nohz_full CPUs. +static void rcu_torture_fwd_prog_cond_resched(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (need_resched()) + schedule(); + } else { + cond_resched(); + } +} + /* * Free all callbacks on the rcu_fwd_cb_head list, either because the * test is over or because we hit an OOM event. @@ -1674,16 +1736,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) for (;;) { spin_lock_irqsave(&rcu_fwd_lock, flags); rfcp = rcu_fwd_cb_head; - if (!rfcp) + if (!rfcp) { + spin_unlock_irqrestore(&rcu_fwd_lock, flags); break; + } rcu_fwd_cb_head = rfcp->rfc_next; if (!rcu_fwd_cb_head) rcu_fwd_cb_tail = &rcu_fwd_cb_head; spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; + rcu_torture_fwd_prog_cond_resched(); } - spin_unlock_irqrestore(&rcu_fwd_lock, flags); return freed; } @@ -1707,6 +1771,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } /* Tight loop containing cond_resched(). */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); @@ -1724,7 +1790,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } (*tested_tries)++; if (!time_before(jiffies, stopat) && @@ -1745,6 +1811,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) WARN_ON(READ_ONCE(fcs.stop) != 2); destroy_rcu_head_on_stack(&fcs.rh); } + schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. 
*/ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } /* Carry out call_rcu() forward-progress testing. */ @@ -1765,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void) if (READ_ONCE(rcu_fwd_emergency_stop)) return; /* Get out of the way quickly, no GP wait! */ + if (!cur_ops->call) + return; /* Can't do call_rcu() fwd prog without ->call. */ /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); @@ -1805,7 +1875,7 @@ static void rcu_torture_fwd_prog_cr(void) rfcp->rfc_gps = 0; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); @@ -1814,7 +1884,6 @@ static void rcu_torture_fwd_prog_cr(void) cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(); - WRITE_ONCE(rcu_fwd_cb_nodelay, false); if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", @@ -1825,6 +1894,8 @@ static void rcu_torture_fwd_prog_cr(void) n_max_gps, n_max_cbs, cver, gps); rcu_torture_fwd_cb_hist(); } + schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } @@ -2240,7 +2311,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, + &busted_srcud_ops, &tasks_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -2363,7 +2434,10 @@ rcu_torture_init(void) if (stutter < 0) stutter = 0; if (stutter) { - firsterr = torture_stutter_init(stutter * HZ); + int t; + + t = cur_ops->stall_dur ? 
cur_ops->stall_dur() : stutter * HZ; + firsterr = torture_stutter_init(stutter * HZ, t); if (firsterr) goto unwind; } diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9b761e546de8..cf0e886314f2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -831,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, - rcu_callback_t func, bool do_norm) +static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) { unsigned long flags; int idx; @@ -1310,3 +1310,68 @@ void __init srcu_init(void) queue_work(rcu_gp_wq, &ssp->work.work); } } + +#ifdef CONFIG_MODULES + +/* Initialize any global-scope srcu_struct structures used by this module. */ +static int srcu_module_coming(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + int ret; + + for (i = 0; i < mod->num_srcu_structs; i++) { + ret = init_srcu_struct(*(sspp++)); + if (WARN_ON_ONCE(ret)) + return ret; + } + return 0; +} + +/* Clean up any global-scope srcu_struct structures used by this module. */ +static void srcu_module_going(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + + for (i = 0; i < mod->num_srcu_structs; i++) + cleanup_srcu_struct(*(sspp++)); +} + +/* Handle one module, either coming or going. 
*/ +static int srcu_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + ret = srcu_module_coming(mod); + break; + case MODULE_STATE_GOING: + srcu_module_going(mod); + break; + default: + break; + } + return ret; +} + +static struct notifier_block srcu_module_nb = { + .notifier_call = srcu_module_notify, + .priority = 0, +}; + +static __init int init_srcu_module_notifier(void) +{ + int ret; + + ret = register_module_notifier(&srcu_module_nb); + if (ret) + pr_warn("Failed to register srcu module notifier\n"); + return ret; +} +late_initcall(init_srcu_module_notifier); + +#endif /* #ifdef CONFIG_MODULES */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index a8304d90573f..d4558ab7a07d 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -10,65 +10,18 @@ #include <linux/rcu_sync.h> #include <linux/sched.h> -#ifdef CONFIG_PROVE_RCU -#define __INIT_HELD(func) .held = func, -#else -#define __INIT_HELD(func) -#endif - -static const struct { - void (*sync)(void); - void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); - void (*wait)(void); -#ifdef CONFIG_PROVE_RCU - int (*held)(void); -#endif -} gp_ops[] = { - [RCU_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_held) - }, - [RCU_SCHED_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_sched_held) - }, - [RCU_BH_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_bh_held) - }, -}; - -enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; -enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; +enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY }; #define rss_lock gp_wait.lock -#ifdef CONFIG_PROVE_RCU -void rcu_sync_lockdep_assert(struct rcu_sync *rsp) -{ - RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), - "suspicious 
rcu_sync_is_idle() usage"); -} - -EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); -#endif - /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized - * @type: Flavor of RCU with which to synchronize rcu_sync structure */ -void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +void rcu_sync_init(struct rcu_sync *rsp) { memset(rsp, 0, sizeof(*rsp)); init_waitqueue_head(&rsp->gp_wait); - rsp->gp_type = type; } /** @@ -86,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp) rsp->gp_state = GP_PASSED; } -/** - * rcu_sync_enter() - Force readers onto slowpath - * @rsp: Pointer to rcu_sync structure to use for synchronization - * - * This function is used by updaters who need readers to make use of - * a slowpath during the update. After this function returns, all - * subsequent calls to rcu_sync_is_idle() will return false, which - * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. - * - * When called in isolation, rcu_sync_enter() must wait for a grace - * period, however, closely spaced calls to rcu_sync_enter() can - * optimize away the grace-period wait via a state machine implemented - * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). - */ -void rcu_sync_enter(struct rcu_sync *rsp) -{ - bool need_wait, need_sync; - spin_lock_irq(&rsp->rss_lock); - need_wait = rsp->gp_count++; - need_sync = rsp->gp_state == GP_IDLE; - if (need_sync) - rsp->gp_state = GP_PENDING; - spin_unlock_irq(&rsp->rss_lock); +static void rcu_sync_func(struct rcu_head *rhp); - WARN_ON_ONCE(need_wait && need_sync); - if (need_sync) { - gp_ops[rsp->gp_type].sync(); - rsp->gp_state = GP_PASSED; - wake_up_all(&rsp->gp_wait); - } else if (need_wait) { - wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); - } else { - /* - * Possible when there's a pending CB from a rcu_sync_exit(). 
- * Nobody has yet been allowed the 'fast' path and thus we can - * avoid doing any sync(). The callback will get 'dropped'. - */ - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); - } +static void rcu_sync_call(struct rcu_sync *rsp) +{ + call_rcu(&rsp->cb_head, rcu_sync_func); } /** * rcu_sync_func() - Callback function managing reader access to fastpath * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization * - * This function is passed to one of the call_rcu() functions by + * This function is passed to call_rcu() function by rcu_sync_enter() and * rcu_sync_exit(), so that it is invoked after a grace period following the - * that invocation of rcu_sync_exit(). It takes action based on events that + * that invocation of enter/exit. + * + * If it is called by rcu_sync_enter() it signals that all the readers were + * switched onto slow path. + * + * If it is called by rcu_sync_exit() it takes action based on events that * have taken place in the meantime, so that closely spaced rcu_sync_enter() * and rcu_sync_exit() pairs need not wait for a grace period. * @@ -152,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp) struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); - WARN_ON_ONCE(rsp->cb_state == CB_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irqsave(&rsp->rss_lock, flags); if (rsp->gp_count) { /* - * A new rcu_sync_begin() has happened; drop the callback. + * We're at least a GP after the GP_IDLE->GP_ENTER transition. */ - rsp->cb_state = CB_IDLE; - } else if (rsp->cb_state == CB_REPLAY) { + WRITE_ONCE(rsp->gp_state, GP_PASSED); + wake_up_locked(&rsp->gp_wait); + } else if (rsp->gp_state == GP_REPLAY) { /* - * A new rcu_sync_exit() has happened; requeue the callback - * to catch a later GP. + * A new rcu_sync_exit() has happened; requeue the callback to + * catch a later GP. 
*/ - rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); } else { /* - * We're at least a GP after rcu_sync_exit(); eveybody will now - * have observed the write side critical section. Let 'em rip!. + * We're at least a GP after the last rcu_sync_exit(); eveybody + * will now have observed the write side critical section. + * Let 'em rip!. */ - rsp->cb_state = CB_IDLE; - rsp->gp_state = GP_IDLE; + WRITE_ONCE(rsp->gp_state, GP_IDLE); } spin_unlock_irqrestore(&rsp->rss_lock, flags); } /** - * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). + */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + int gp_state; + + spin_lock_irq(&rsp->rss_lock); + gp_state = rsp->gp_state; + if (gp_state == GP_IDLE) { + WRITE_ONCE(rsp->gp_state, GP_ENTER); + WARN_ON_ONCE(rsp->gp_count); + /* + * Note that we could simply do rcu_sync_call(rsp) here and + * avoid the "if (gp_state == GP_IDLE)" block below. + * + * However, synchronize_rcu() can be faster if rcu_expedited + * or rcu_blocking_is_gp() is true. + * + * Another reason is that we can't wait for rcu callback if + * we are called at early boot time but this shouldn't happen. 
+ */ + } + rsp->gp_count++; + spin_unlock_irq(&rsp->rss_lock); + + if (gp_state == GP_IDLE) { + /* + * See the comment above, this simply does the "synchronous" + * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED. + */ + synchronize_rcu(); + rcu_sync_func(&rsp->cb_head); + /* Not really needed, wait_event() would see GP_PASSED. */ + return; + } + + wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast path after grace period * @rsp: Pointer to rcu_sync structure to use for synchronization * * This function is used by updaters who have completed, and can therefore @@ -191,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp) */ void rcu_sync_exit(struct rcu_sync *rsp) { + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); + spin_lock_irq(&rsp->rss_lock); if (!--rsp->gp_count) { - if (rsp->cb_state == CB_IDLE) { - rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); - } else if (rsp->cb_state == CB_PENDING) { - rsp->cb_state = CB_REPLAY; + if (rsp->gp_state == GP_PASSED) { + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); + } else if (rsp->gp_state == GP_EXIT) { + WRITE_ONCE(rsp->gp_state, GP_REPLAY); } } spin_unlock_irq(&rsp->rss_lock); @@ -209,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp) */ void rcu_sync_dtor(struct rcu_sync *rsp) { - int cb_state; + int gp_state; - WARN_ON_ONCE(rsp->gp_count); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); - if (rsp->cb_state == CB_REPLAY) - rsp->cb_state = CB_PENDING; - cb_state = rsp->cb_state; + if (rsp->gp_state == GP_REPLAY) + WRITE_ONCE(rsp->gp_state, GP_EXIT); + gp_state = rsp->gp_state; spin_unlock_irq(&rsp->rss_lock); - if (cb_state != CB_IDLE) { - gp_ops[rsp->gp_type].wait(); - WARN_ON_ONCE(rsp->cb_state != CB_IDLE); + if (gp_state != GP_IDLE) { + 
rcu_barrier(); + WARN_ON_ONCE(rsp->gp_state != GP_IDLE); } } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 980ca3ca643f..a14e5fbbea46 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -51,6 +51,12 @@ #include <linux/tick.h> #include <linux/sysrq.h> #include <linux/kprobes.h> +#include <linux/gfp.h> +#include <linux/oom.h> +#include <linux/smpboot.h> +#include <linux/jiffies.h> +#include <linux/sched/isolation.h> +#include "../time/tick-internal.h" #include "tree.h" #include "rcu.h" @@ -92,6 +98,9 @@ struct rcu_state rcu_state = { /* Dump rcu_node combining tree at boot to verify correct setup. */ static bool dump_tree; module_param(dump_tree, bool, 0444); +/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ +static bool use_softirq = 1; +module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); @@ -138,7 +147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -368,19 +376,33 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void) } /** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle * - * If the current CPU is idle or running at a first-level (not nested) + * If the current CPU is idle and running at a first-level (not nested) * interrupt from idle, return true. The caller must have at least * disabled preemption. 
*/ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && - __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; + /* Called only from within the scheduling-clock interrupt */ + lockdep_assert_in_irq(); + + /* Check for counter underflows */ + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, + "RCU dynticks_nesting counter underflow!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, + "RCU dynticks_nmi_nesting counter underflow/zero!"); + + /* Are we at first interrupt nesting level? */ + if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + return false; + + /* Does CPU appear to be idle from an RCU standpoint? */ + return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ +#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ static long blimit = DEFAULT_RCU_BLIMIT; #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ static long qhimark = DEFAULT_RCU_QHIMARK; @@ -2113,7 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); - if (rdp->blimit == LONG_MAX && count <= qlowmark) + if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ @@ -2253,7 +2275,7 @@ void rcu_force_quiescent_state(void) EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* Perform RCU core processing work for the current CPU. 
*/ -static __latent_entropy void rcu_core(struct softirq_action *unused) +static __latent_entropy void rcu_core(void) { unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); @@ -2287,37 +2309,126 @@ static __latent_entropy void rcu_core(struct softirq_action *unused) rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_callbacks(rdp); + if (rcu_segcblist_ready_cbs(&rdp->cblist) && + likely(READ_ONCE(rcu_scheduler_fully_active))) + rcu_do_batch(rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); } +static void rcu_core_si(struct softirq_action *h) +{ + rcu_core(); +} + +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) + wake_up_process(t); +} + +static void invoke_rcu_core_kthread(void) +{ + struct task_struct *t; + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); + t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task); + if (t != NULL && t != current) + rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); + local_irq_restore(flags); +} + /* - * Schedule RCU callback invocation. If the running implementation of RCU - * does not support RCU priority boosting, just do a direct call, otherwise - * wake up the per-CPU kernel kthread. Note that because we are running - * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task - * cannot disappear out from under us. + * Wake up this CPU's rcuc kthread to do RCU core processing. 
*/ -static void invoke_rcu_callbacks(struct rcu_data *rdp) +static void invoke_rcu_core(void) { - if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) - return; - if (likely(!rcu_state.boost)) { - rcu_do_batch(rdp); + if (!cpu_online(smp_processor_id())) return; + if (use_softirq) + raise_softirq(RCU_SOFTIRQ); + else + invoke_rcu_core_kthread(); +} + +static void rcu_cpu_kthread_park(unsigned int cpu) +{ + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __this_cpu_read(rcu_data.rcu_cpu_has_work); +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces + * the RCU softirq used in configurations of RCU that do not support RCU + * priority boosting. + */ +static void rcu_cpu_kthread(unsigned int cpu) +{ + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); + int spincnt; + + for (spincnt = 0; spincnt < 10; spincnt++) { + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); + local_bh_disable(); + *statusp = RCU_KTHREAD_RUNNING; + local_irq_disable(); + work = *workp; + *workp = 0; + local_irq_enable(); + if (work) + rcu_core(); + local_bh_enable(); + if (*workp == 0) { + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); + *statusp = RCU_KTHREAD_WAITING; + return; + } } - invoke_rcu_callbacks_kthread(); + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); + schedule_timeout_interruptible(2); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); + *statusp = RCU_KTHREAD_WAITING; } -static void invoke_rcu_core(void) +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_data.rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + +/* + * Spawn per-CPU 
RCU core processing kthreads. + */ +static int __init rcu_spawn_core_kthreads(void) { - if (cpu_online(smp_processor_id())) - raise_softirq(RCU_SOFTIRQ); + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; + if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) + return 0; + WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), + "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); + return 0; } +early_initcall(rcu_spawn_core_kthreads); /* * Handle any core-RCU processing required by a call_rcu() invocation. @@ -2354,7 +2465,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); } else { /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; + rdp->blimit = DEFAULT_MAX_RCU_BLIMIT; if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) rcu_force_quiescent_state(); @@ -3355,7 +3466,8 @@ void __init rcu_init(void) rcu_init_one(); if (dump_tree) rcu_dump_rcu_node_tree(); - open_softirq(RCU_SOFTIRQ, rcu_core); + if (use_softirq) + open_softirq(RCU_SOFTIRQ, rcu_core_si); /* * We don't need protection against CPU-hotplug here because diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e253d11af3c4..7acaf3a62d39 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -154,13 +154,15 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool deferred_qs; /* This CPU awaiting a deferred QS? */ + bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. 
*/ + struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ + bool defer_qs_iw_pending; /* Scheduler attention pending? */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ @@ -407,8 +409,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); +static void rcu_cpu_kthread_setup(unsigned int cpu); static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 9c990df880d1..af7e7b9c86af 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -250,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->deferred_qs, false); + WRITE_ONCE(rdp->exp_deferred_qs, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } @@ -259,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ + smp_mb(); /* Ensure test happens before caller kfree(). 
*/ return true; } return false; @@ -384,7 +383,12 @@ retry_ipi: mask_ofl_test |= mask; continue; } + if (get_cpu() == cpu) { + put_cpu(); + continue; + } ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); if (!ret) { mask_ofl_ipi &= ~mask; continue; @@ -611,7 +615,7 @@ static void rcu_exp_handler(void *unused) rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); } else { - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; set_tsk_need_resched(t); set_preempt_need_resched(); } @@ -633,7 +637,7 @@ static void rcu_exp_handler(void *unused) if (t->rcu_read_lock_nesting > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -656,7 +660,7 @@ static void rcu_exp_handler(void *unused) * * Otherwise, force a context switch after the CPU enables everything. */ - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { rcu_preempt_deferred_qs(t); @@ -694,6 +698,16 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) +{ + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { @@ -709,25 +723,38 @@ static void rcu_exp_handler(void *unused) rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); return; } - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. 
*/ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + rcu_exp_need_qs(); } /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ static void sync_sched_exp_online_cleanup(int cpu) { + unsigned long flags; + int my_cpu; struct rcu_data *rdp; int ret; struct rcu_node *rnp; rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + my_cpu = get_cpu(); + /* Quiescent state either not needed or already requested, leave. */ + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) { + put_cpu(); + return; + } + /* Quiescent state needed on current CPU, so set it up locally. */ + if (my_cpu == cpu) { + local_irq_save(flags); + rcu_exp_need_qs(); + local_irq_restore(flags); + put_cpu(); return; + } + /* Quiescent state needed on some other CPU, send IPI. */ ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); WARN_ON_ONCE(ret); } @@ -765,7 +792,6 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - struct rcu_data *rdp; struct rcu_exp_work rew; struct rcu_node *rnp; unsigned long s; @@ -802,7 +828,6 @@ void synchronize_rcu_expedited(void) } /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); rnp = rcu_get_root(); wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(s)); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1102765f91fd..acb225023ed1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -11,29 +11,7 @@ * Paul E. 
McKenney <paulmck@linux.ibm.com> */ -#include <linux/delay.h> -#include <linux/gfp.h> -#include <linux/oom.h> -#include <linux/sched/debug.h> -#include <linux/smpboot.h> -#include <linux/sched/isolation.h> -#include <uapi/linux/sched/types.h> -#include "../time/tick-internal.h" - -#ifdef CONFIG_RCU_BOOST #include "../locking/rtmutex_common.h" -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, - * all uses are in dead code. Provide a definition to keep the compiler - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. - * This probably needs to be excluded from -rt builds. - */ -#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) -#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ @@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (!use_softirq) + pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) pr_info("\tRCU debug extended QS entry/exit.\n"); rcupdate_announce_bootup_oddness(); @@ -257,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->deferred_qs); + WARN_ON_ONCE(rdp->exp_deferred_qs); } /* @@ -357,7 +337,7 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. 
*/ rcu_qs(); - if (rdp->deferred_qs) + if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ @@ -471,14 +451,15 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ special = t->rcu_read_unlock_special; rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->deferred_qs) { + if (!special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } + t->rcu_read_unlock_special.b.deferred_qs = false; if (special.b.need_qs) { rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { + if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } @@ -490,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->deferred_qs) { + if (rdp->exp_deferred_qs) { rcu_report_exp_rdp(rdp); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); @@ -579,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (__this_cpu_read(rcu_data.deferred_qs) || + return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && t->rcu_read_lock_nesting <= 0; } @@ -607,6 +588,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) } /* + * Minimal handler to give the scheduler a chance to re-evaluate. + */ +static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + + rdp = container_of(iwp, struct rcu_data, defer_qs_iw); + rdp->defer_qs_iw_pending = false; +} + +/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. 
@@ -625,16 +617,41 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); - /* Need to defer quiescent state until everything is enabled. */ - if (irqs_were_disabled) { - /* Enabling irqs does not reschedule, so... */ + bool exp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; + + t->rcu_read_unlock_special.b.exp_hint = false; + exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || + (rdp->grpmask & rnp->expmask) || + tick_nohz_full_cpu(rdp->cpu); + // Need to defer quiescent state until everything is enabled. + if ((exp || in_irq()) && irqs_were_disabled && use_softirq && + (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { + // Using softirq, safe to awaken, and we get + // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); + } else if (exp && irqs_were_disabled && !use_softirq && + !t->rcu_read_unlock_special.b.deferred_qs) { + // Safe to awaken and we get no help from enabling + // irqs, unlike bh/preempt. + invoke_rcu_core(); } else { - /* Enabling BH or preempt does reschedule, so... */ + // Enabling BH or preempt does reschedule, so... + // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); + if (IS_ENABLED(CONFIG_IRQ_WORK) && + !rdp->defer_qs_iw_pending && exp) { + // Get scheduler to re-evaluate and call hooks. + // If !IRQ_WORK, FQS scan will eventually IPI. 
+ init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); + rdp->defer_qs_iw_pending = true; + irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); + } } + t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } @@ -760,7 +777,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) i = 0; list_for_each(lhp, &rnp->blkd_tasks) { pr_cont(" %p", lhp); - if (++i >= 10) + if (++i >= ncheck) break; } pr_cont("\n"); @@ -944,18 +961,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +/* + * If boosting, set rcuc kthreads to realtime priority. + */ +static void rcu_cpu_kthread_setup(unsigned int cpu) +{ #ifdef CONFIG_RCU_BOOST + struct sched_param sp; -static void rcu_wake_cond(struct task_struct *t, int status) -{ - /* - * If the thread is yielding, only wake it when this - * is invoked from idle - */ - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) - wake_up_process(t); + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); +#endif /* #ifdef CONFIG_RCU_BOOST */ } +#ifdef CONFIG_RCU_BOOST + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1091,23 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) } /* - * Wake up the per-CPU kthread to invoke RCU callbacks. - */ -static void invoke_rcu_callbacks_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), - __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); - } - local_irq_restore(flags); -} - -/* * Is the current CPU running the RCU-callbacks kthread? * Caller must have preemption disabled. 
*/ @@ -1160,59 +1163,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) return 0; } -static void rcu_cpu_kthread_setup(unsigned int cpu) -{ - struct sched_param sp; - - sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -} - -static void rcu_cpu_kthread_park(unsigned int cpu) -{ - per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; -} - -static int rcu_cpu_kthread_should_run(unsigned int cpu) -{ - return __this_cpu_read(rcu_data.rcu_cpu_has_work); -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces - * the RCU softirq used in configurations of RCU that do not support RCU - * priority boosting. - */ -static void rcu_cpu_kthread(unsigned int cpu) -{ - unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); - int spincnt; - - for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); - local_bh_disable(); - *statusp = RCU_KTHREAD_RUNNING; - local_irq_disable(); - work = *workp; - *workp = 0; - local_irq_enable(); - if (work) - rcu_do_batch(this_cpu_ptr(&rcu_data)); - local_bh_enable(); - if (*workp == 0) { - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); - *statusp = RCU_KTHREAD_WAITING; - return; - } - } - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); - *statusp = RCU_KTHREAD_WAITING; -} - /* * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. 
The CPU hotplug lock is still @@ -1243,27 +1193,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_data.rcu_cpu_kthread_task, - .thread_should_run = rcu_cpu_kthread_should_run, - .thread_fn = rcu_cpu_kthread, - .thread_comm = "rcuc/%u", - .setup = rcu_cpu_kthread_setup, - .park = rcu_cpu_kthread_park, -}; - /* * Spawn boost kthreads -- called as soon as the scheduler is running. */ static void __init rcu_spawn_boost_kthreads(void) { struct rcu_node *rnp; - int cpu; - for_each_possible_cpu(cpu) - per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; - if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) - return; rcu_for_each_leaf_node(rnp) (void)rcu_spawn_one_boost_kthread(rnp); } @@ -1286,11 +1222,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static void invoke_rcu_callbacks_kthread(void) -{ - WARN_ON_ONCE(1); -} - static bool rcu_is_callbacks_kthread(void) { return false; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index f65a73a97323..065183391f75 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -630,7 +630,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, time_before(j, rcu_state.gp_req_activity + gpssdelay) || time_before(j, rcu_state.gp_activity + gpssdelay) || atomic_xchg(&warned, 1)) { - raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ + if (rnp_root != rnp) + /* irqs remain disabled. 
*/ + raw_spin_unlock_rcu_node(rnp_root); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c3bf44ba42e5..61df2bf08563 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -423,6 +423,19 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +/* Get rcutorture access to sched_setaffinity(). */ +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + int ret; + + ret = sched_setaffinity(pid, in_mask); + WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); + return ret; +} +EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); +#endif + #ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b798fe7ff7cd..036be95a87e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5922,6 +5922,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 time, cost; s64 delta; int cpu, nr = INT_MAX; + int this = smp_processor_id(); this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -5945,7 +5946,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t nr = 4; } - time = local_clock(); + time = cpu_clock(this); for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!--nr) @@ -5956,7 +5957,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t break; } - time = local_clock() - time; + time = cpu_clock(this) - time; cost = this_sd->avg_scan_cost; delta = (s64)(time - cost) / 8; this_sd->avg_scan_cost += delta; diff --git a/kernel/signal.c b/kernel/signal.c index d622eac9d169..edf8915ddd54 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2912,7 +2912,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask); 
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed in from userland for the syscalls. */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) +void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved, + bool interrupted) { if (!usigmask) @@ -2922,7 +2923,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) * Restoring sigmask here can lead to delivering signals that the above * syscalls are intended to block because of the sigmask passed in. */ - if (signal_pending(current)) { + if (interrupted) { current->saved_sigmask = *sigsaved; set_restore_sigmask(); return; diff --git a/kernel/smp.c b/kernel/smp.c index d155374632eb..616d4d114847 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -34,7 +34,7 @@ struct call_function_data { cpumask_var_t cpumask_ipi; }; -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); +static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); @@ -487,13 +487,11 @@ EXPORT_SYMBOL(smp_call_function_many); * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function(smp_call_func_t func, void *info, int wait) +void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); - - return 0; } EXPORT_SYMBOL(smp_call_function); @@ -594,18 +592,16 @@ void __init smp_init(void) * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead * of local_irq_disable/enable(). 
*/ -int on_each_cpu(void (*func) (void *info), void *info, int wait) +void on_each_cpu(void (*func) (void *info), void *info, int wait) { unsigned long flags; - int ret = 0; preempt_disable(); - ret = smp_call_function(func, info, wait); + smp_call_function(func, info, wait); local_irq_save(flags); func(info); local_irq_restore(flags); preempt_enable(); - return ret; } EXPORT_SYMBOL(on_each_cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index a6b81c6b6bff..0427a86743a4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -649,7 +649,7 @@ static int takeover_tasklets(unsigned int cpu) /* Find end, append list for that CPU. */ if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; - this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); + __this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); per_cpu(tasklet_vec, cpu).head = NULL; per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2b5a6754646f..b4f83f7bdf86 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -177,12 +177,18 @@ static void ack_state(struct multi_stop_data *msdata) set_state(msdata, msdata->state + 1); } +void __weak stop_machine_yield(const struct cpumask *cpumask) +{ + cpu_relax(); +} + /* This is the cpu_stop function which stops the CPU. 
*/ static int multi_cpu_stop(void *data) { struct multi_stop_data *msdata = data; enum multi_stop_state curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; + const struct cpumask *cpumask; unsigned long flags; bool is_active; @@ -192,15 +198,18 @@ static int multi_cpu_stop(void *data) */ local_save_flags(flags); - if (!msdata->active_cpus) - is_active = cpu == cpumask_first(cpu_online_mask); - else - is_active = cpumask_test_cpu(cpu, msdata->active_cpus); + if (!msdata->active_cpus) { + cpumask = cpu_online_mask; + is_active = cpu == cpumask_first(cpumask); + } else { + cpumask = msdata->active_cpus; + is_active = cpumask_test_cpu(cpu, cpumask); + } /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax_yield(); + stop_machine_yield(cpumask); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f1e46f338a9c..1867044800bb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -16,5 +16,6 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o +obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0519a8805aab..57518efc3810 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -233,7 +233,6 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); /** * alarmtimer_suspend - Suspend time callback * @dev: unused - * @state: unused * * When we are going into suspend, we look through the bases * to see which is the soonest timer to expire. 
We then diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 3bcc19ceb073..fff5f64981c6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -105,12 +105,12 @@ static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; -static void inline clocksource_watchdog_lock(unsigned long *flags) +static inline void clocksource_watchdog_lock(unsigned long *flags) { spin_lock_irqsave(&watchdog_lock, *flags); } -static void inline clocksource_watchdog_unlock(unsigned long *flags) +static inline void clocksource_watchdog_unlock(unsigned long *flags) { spin_unlock_irqrestore(&watchdog_lock, *flags); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 41dfff23c1f9..5ee77f1a8a92 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -30,7 +30,6 @@ #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> -#include <linux/seq_file.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> @@ -1115,9 +1114,10 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * @timer: hrtimer to stop * * Returns: - * 0 when the timer was not active - * 1 when the timer was active - * -1 when the timer is currently executing the callback function and + * + * * 0 when the timer was not active + * * 1 when the timer was active + * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8de4f789dc1b..65eb796610dc 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -43,6 +43,7 @@ static u64 tick_length_base; #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +#define MAX_TAI_OFFSET 100000 /* * phase-lock loop variables @@ -691,7 +692,8 @@ static inline void process_adjtimex_modes(const struct __kernel_timex 
*txc, time_constant = max(time_constant, 0l); } - if (txc->modes & ADJ_TAI && txc->constant >= 0) + if (txc->modes & ADJ_TAI && + txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 29176635991f..d7f2d91acdac 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -980,23 +980,16 @@ retry_delete: */ static void itimer_delete(struct k_itimer *timer) { - unsigned long flags; - retry_delete: - spin_lock_irqsave(&timer->it_lock, flags); + spin_lock_irq(&timer->it_lock); if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); + spin_unlock_irq(&timer->it_lock); goto retry_delete; } list_del(&timer->list); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). - */ - timer->it_signal = NULL; - unlock_timer(timer, flags); + spin_unlock_irq(&timer->it_lock); release_posix_timer(timer, IT_ID_SET); } diff --git a/kernel/time/time.c b/kernel/time/time.c index 7f7d6914ddd5..5c54ca632d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -251,6 +251,10 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, if (tv) { if (compat_get_timeval(&user_tv, tv)) return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 44b726bab4bd..d911c8470149 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -819,7 +819,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) } while (read_seqcount_retry(&tk_core.seq, seq)); - return base + nsecs; + return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 98ba50dcb1b2..acb326f5f50a 100644 --- 
a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -282,23 +282,6 @@ static inline void timer_list_header(struct seq_file *m, u64 now) SEQ_printf(m, "\n"); } -static int timer_list_show(struct seq_file *m, void *v) -{ - struct timer_list_iter *iter = v; - - if (iter->cpu == -1 && !iter->second_pass) - timer_list_header(m, iter->now); - else if (!iter->second_pass) - print_cpu(m, iter->cpu, iter->now); -#ifdef CONFIG_GENERIC_CLOCKEVENTS - else if (iter->cpu == -1 && iter->second_pass) - timer_list_show_tickdevices_header(m); - else - print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); -#endif - return 0; -} - void sysrq_timer_list_show(void) { u64 now = ktime_to_ns(ktime_get()); @@ -317,6 +300,24 @@ void sysrq_timer_list_show(void) return; } +#ifdef CONFIG_PROC_FS +static int timer_list_show(struct seq_file *m, void *v) +{ + struct timer_list_iter *iter = v; + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, iter->now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + static void *move_iter(struct timer_list_iter *iter, loff_t offset) { for (; offset; offset--) { @@ -376,3 +377,4 @@ static int __init init_timer_list_procfs(void) return 0; } __initcall(init_timer_list_procfs); +#endif diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c new file mode 100644 index 000000000000..a80893180826 --- /dev/null +++ b/kernel/time/vsyscall.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 ARM Ltd. + * + * Generic implementation of update_vsyscall and update_vsyscall_tz. + * + * Based on the x86 specific implementation. 
+ */ + +#include <linux/hrtimer.h> +#include <linux/timekeeper_internal.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> +#include <vdso/vsyscall.h> + +static inline void update_vdso_data(struct vdso_data *vdata, + struct timekeeper *tk) +{ + struct vdso_timestamp *vdso_ts; + u64 nsec; + + vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vdata[CS_RAW].mask = tk->tkr_raw.mask; + vdata[CS_RAW].mult = tk->tkr_raw.mult; + vdata[CS_RAW].shift = tk->tkr_raw.shift; + + /* CLOCK_REALTIME */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + + /* CLOCK_MONOTONIC */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + /* CLOCK_MONOTONIC_RAW */ + vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts->sec = tk->raw_sec; + vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + + /* CLOCK_BOOTTIME */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + + ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + /* CLOCK_TAI */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts->sec = tk->xtime_sec + 
(s64)tk->tai_offset; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + + /* + * Read without the seqlock held by clock_getres(). + * Note: No need to have a second copy. + */ + WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_timestamp *vdso_ts; + u64 nsec; + + if (__arch_update_vdso_data()) { + /* + * Some architectures might want to skip the update of the + * data page. + */ + return; + } + + /* copy vsyscall data */ + vdso_write_begin(vdata); + + vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); + vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); + + /* CLOCK_REALTIME_COARSE */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + + /* CLOCK_MONOTONIC_COARSE */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = nsec + tk->wall_to_monotonic.tv_nsec; + while (nsec >= NSEC_PER_SEC) { + nsec = nsec - NSEC_PER_SEC; + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + if (__arch_use_vsyscall(vdata)) + update_vdso_data(vdata, tk); + + __arch_update_vsyscall(vdata, tk); + + vdso_write_end(vdata); + + __arch_sync_vdso_data(vdata); +} + +void update_vsyscall_tz(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + if (__arch_use_vsyscall(vdata)) { + vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; + vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + } + + __arch_sync_vdso_data(vdata); +} diff --git a/kernel/torture.c b/kernel/torture.c index 17b2be9bde12..a8d9bdfba7c3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -570,6 +570,7 @@ static void torture_shutdown_cleanup(void) static struct task_struct *stutter_task; static int 
stutter_pause_test; static int stutter; +static int stutter_gap; /* * Block until the stutter interval ends. This must be called periodically @@ -578,10 +579,12 @@ static int stutter; bool stutter_wait(const char *title) { int spt; + bool ret = false; cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { + ret = true; if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { @@ -592,7 +595,7 @@ bool stutter_wait(const char *title) } torture_shutdown_absorb(title); } - return !!spt; + return ret; } EXPORT_SYMBOL_GPL(stutter_wait); @@ -602,17 +605,24 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { + int wtime; + VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { - WRITE_ONCE(stutter_pause_test, 1); - schedule_timeout_interruptible(stutter - 1); + wtime = stutter; + if (stutter > HZ + 1) { + WRITE_ONCE(stutter_pause_test, 1); + wtime = stutter - HZ - 1; + schedule_timeout_interruptible(wtime); + wtime = HZ + 1; + } WRITE_ONCE(stutter_pause_test, 2); - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(wtime); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) - schedule_timeout_interruptible(stutter); + schedule_timeout_interruptible(stutter_gap); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); @@ -622,9 +632,10 @@ static int torture_stutter(void *arg) /* * Initialize and kick off the torture_stutter kthread. 
*/ -int torture_stutter_init(const int s) +int torture_stutter_init(const int s, const int sgap) { stutter = s; + stutter_gap = sgap; return torture_create_kthread(torture_stutter, NULL, stutter_task); } EXPORT_SYMBOL_GPL(torture_stutter_init); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 38277af44f5c..576c41644e77 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,7 +34,6 @@ #include <linux/hash.h> #include <linux/rcupdate.h> #include <linux/kprobes.h> -#include <linux/memory.h> #include <trace/events/sched.h> @@ -2611,12 +2610,10 @@ static void ftrace_run_update_code(int command) { int ret; - mutex_lock(&text_mutex); - ret = ftrace_arch_code_modify_prepare(); FTRACE_WARN_ON(ret); if (ret) - goto out_unlock; + return; /* * By default we use stop_machine() to modify the code. @@ -2628,9 +2625,6 @@ static void ftrace_run_update_code(int command) ret = ftrace_arch_code_modify_post_process(); FTRACE_WARN_ON(ret); - -out_unlock: - mutex_unlock(&text_mutex); } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, @@ -5784,7 +5778,6 @@ void ftrace_module_enable(struct module *mod) struct ftrace_page *pg; mutex_lock(&ftrace_lock); - mutex_lock(&text_mutex); if (ftrace_disabled) goto out_unlock; @@ -5846,7 +5839,6 @@ void ftrace_module_enable(struct module *mod) ftrace_arch_code_modify_post_process(); out_unlock: - mutex_unlock(&text_mutex); mutex_unlock(&ftrace_lock); process_cached_mods(mod->name); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 83e08b78dbee..c3aabb576fe5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6719,11 +6719,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, break; } #endif - if (!tr->allocated_snapshot) { + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, iter->cpu_file); + else ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) - break; - } + if (ret < 0) + break; 
local_irq_disable(); /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) @@ -7126,12 +7128,24 @@ static ssize_t tracing_err_log_write(struct file *file, return count; } +static int tracing_err_log_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; +} + static const struct file_operations tracing_err_log_fops = { .open = tracing_err_log_open, .write = tracing_err_log_write, .read = seq_read, .llseek = seq_lseek, - .release = tracing_release_generic_tr, + .release = tracing_err_log_release, }; static int tracing_buffers_open(struct inode *inode, struct file *filp) diff --git a/kernel/up.c b/kernel/up.c index 483c9962c999..862b460ab97a 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -35,14 +35,13 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) } EXPORT_SYMBOL(smp_call_function_single_async); -int on_each_cpu(smp_call_func_t func, void *info, int wait) +void on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; local_irq_save(flags); func(info); local_irq_restore(flags); - return 0; } EXPORT_SYMBOL(on_each_cpu); |