From 16e53dbf10a2d7e228709a7286310e629ede5e45 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 12 Jun 2013 14:04:36 -0700 Subject: CPU hotplug: provide a generic helper to disable/enable CPU hotplug There are instances in the kernel where we would like to disable CPU hotplug (from sysfs) during some important operation. Today the freezer code depends on this and the code to do it was kinda tailor-made for that. Restructure the code and make it generic enough to be useful for other usecases too. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Shawn Guo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 4 ++++ kernel/cpu.c | 55 ++++++++++++++++++++++------------------------------- 2 files changed, 27 insertions(+), 32 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index c6f6e0839b61..9f3c7e81270a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -175,6 +175,8 @@ extern struct bus_type cpu_subsys; extern void get_online_cpus(void); extern void put_online_cpus(void); +extern void cpu_hotplug_disable(void); +extern void cpu_hotplug_enable(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) @@ -198,6 +200,8 @@ static inline void cpu_hotplug_driver_unlock(void) #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) +#define cpu_hotplug_disable() do { } while (0) +#define cpu_hotplug_enable() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. */ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427e..198a38883e64 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. + */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} @@ -540,36 +561,6 @@ static int __init alloc_frozen_cpus(void) } core_initcall(alloc_frozen_cpus); -/* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. - */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - /* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen @@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: -- cgit v1.2.3 From cf7df378aa4ff7da3a44769b7ff6e9eef1a9f3db Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Wed, 12 Jun 2013 14:04:37 -0700 Subject: reboot: rigrate shutdown/reboot to boot cpu We recently noticed that reboot of a 1024 cpu machine takes approx 16 minutes of just stopping the cpus. The slowdown was tracked to commit f96972f2dc63 ("kernel/sys.c: call disable_nonboot_cpus() in kernel_restart()"). The current implementation does all the work of hot removing the cpus before halting the system. We are switching to just migrating to the boot cpu and then continuing with shutdown/reboot. This also has the effect of not breaking x86's command line parameter for specifying the reboot cpu. Note, this code was shamelessly copied from arch/x86/kernel/reboot.c with bits removed pertaining to the reboot_cpu command line parameter. Signed-off-by: Robin Holt Tested-by: Shawn Guo Cc: "Srivatsa S. Bhat" Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index b95d3c72ba21..2bbd9a73b54c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_reboot_notifier); +/* Add backwards compatibility for stable trees. */ +#ifndef PF_NO_SETAFFINITY +#define PF_NO_SETAFFINITY PF_THREAD_BOUND +#endif + +static void migrate_to_reboot_cpu(void) +{ + /* The boot cpu is always logical cpu 0 */ + int cpu = 0; + + cpu_hotplug_disable(); + + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_online(cpu)) + cpu = cpumask_first(cpu_online_mask); + + /* Prevent races with other tasks migrating this task */ + current->flags |= PF_NO_SETAFFINITY; + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart @@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); @@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -419,7 +442,7 @@ void kernel_power_off(void) kernel_shutdown_prepare(SYSTEM_POWER_OFF); if (pm_power_off_prepare) pm_power_off_prepare(); - disable_nonboot_cpus(); + migrate_to_reboot_cpu(); syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); -- cgit v1.2.3 From 637241a900cbd982f744d44646b48a273d609b34 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jun 2013 14:04:39 -0700 Subject: kmsg: honor dmesg_restrict sysctl on /dev/kmsg The dmesg_restrict sysctl currently covers the syslog method for access dmesg, however /dev/kmsg isn't covered by the same protections. Most people haven't noticed because util-linux dmesg(1) defaults to using the syslog method for access in older versions. With util-linux dmesg(1) defaults to reading directly from /dev/kmsg. To fix /dev/kmsg, let's compare the existing interfaces and what they allow: - /proc/kmsg allows: - open (SYSLOG_ACTION_OPEN) if CAP_SYSLOG since it uses a destructive single-reader interface (SYSLOG_ACTION_READ). - everything, after an open. - syslog syscall allows: - anything, if CAP_SYSLOG. - SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER, if dmesg_restrict==0. - nothing else (EPERM). The use-cases were: - dmesg(1) needs to do non-destructive SYSLOG_ACTION_READ_ALLs. - sysklog(1) needs to open /proc/kmsg, drop privs, and still issue the destructive SYSLOG_ACTION_READs. AIUI, dmesg(1) is moving to /dev/kmsg, and systemd-journald doesn't clear the ring buffer. Based on the comments in devkmsg_llseek, it sounds like actions besides reading aren't going to be supported by /dev/kmsg (i.e. SYSLOG_ACTION_CLEAR), so we have a strict subset of the non-destructive syslog syscall actions. To this end, move the check as Josh had done, but also rename the constants to reflect their new uses (SYSLOG_FROM_CALL becomes SYSLOG_FROM_READER, and SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC). SYSLOG_FROM_READER allows non-destructive actions, and SYSLOG_FROM_PROC allows destructive actions after a capabilities-constrained SYSLOG_ACTION_OPEN check. - /dev/kmsg allows: - open if CAP_SYSLOG or dmesg_restrict==0 - reading/polling, after open Addresses https://bugzilla.redhat.com/show_bug.cgi?id=903192 [akpm@linux-foundation.org: use pr_warn_once()] Signed-off-by: Kees Cook Reported-by: Christian Kujau Tested-by: Josh Boyer Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kmsg.c | 10 +++--- include/linux/syslog.h | 4 +-- kernel/printk.c | 91 +++++++++++++++++++++++++++----------------------- 3 files changed, 57 insertions(+), 48 deletions(-) diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bd4b5a740ff1..bdfabdaefdce 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return -EAGAIN; - return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 38911391a139..98a3153c0f96 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -44,8 +44,8 @@ /* Return size of the log buffer */ #define SYSLOG_ACTION_SIZE_BUFFER 10 -#define SYSLOG_FROM_CALL 0 -#define SYSLOG_FROM_FILE 1 +#define SYSLOG_FROM_READER 0 +#define SYSLOG_FROM_PROC 1 int do_syslog(int type, char __user *buf, int count, bool from_file); diff --git a/kernel/printk.c b/kernel/printk.c index fa36e1494420..8212c1aef125 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -363,6 +363,53 @@ static void log_store(int facility, int level, log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* + * Unless restricted, we allow "read all" and "get buffer size" + * for everybody. + */ + return type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* + * For historical reasons, accept CAP_SYS_ADMIN too, with + * a warning. + */ + if (capable(CAP_SYS_ADMIN)) { + pr_warn_once("%s (%d): Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated).\n", + current->comm, task_pid_nr(current)); + return 0; + } + return -EPERM; + } + return security_syslog(type); +} + + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); if (err) return err; @@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. - */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -1249,7 +1258,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* -- cgit v1.2.3 From 5402b8047b0d286b6501f9097891cbf1e06daa3a Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 12 Jun 2013 14:04:40 -0700 Subject: lib/mpi/mpicoder.c: looping issue, need stop when equal to zero, found by 'EXTRA_FLAGS=-W'. For 'while' looping, need stop when 'nbytes == 0', or will cause issue. ('nbytes' is size_t which is always bigger or equal than zero). The related warning: (with EXTRA_CFLAGS=-W) lib/mpi/mpicoder.c:40:2: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] Signed-off-by: Chen Gang Cc: Rusty Russell Cc: David Howells Cc: James Morris Cc: Andy Shevchenko Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/mpi/mpicoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index 5f9c44cdf1f5..4cc6442733f4 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -37,7 +37,7 @@ MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes) mpi_limb_t a; MPI val = NULL; - while (nbytes >= 0 && buffer[0] == 0) { + while (nbytes > 0 && buffer[0] == 0) { buffer++; nbytes--; } -- cgit v1.2.3 From 7869e590679ed71cd1a1e676e8c1c179762c3efe Mon Sep 17 00:00:00 2001 From: "Xiaowei.Hu" Date: Wed, 12 Jun 2013 14:04:41 -0700 Subject: ocfs2: ocfs2_prep_new_orphaned_file() should return ret If an error occurs, for example an EIO in __ocfs2_prepare_orphan_dir, ocfs2_prep_new_orphaned_file will release the inode_ac, then when the caller of ocfs2_prep_new_orphaned_file gets a 0 return, it will refer to a NULL ocfs2_alloc_context struct in the following functions. A kernel panic happens. Signed-off-by: "Xiaowei.Hu" Reviewed-by: shencanquan Acked-by: Sunil Mushran Cc: Joe Jin Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 04ee1b57c243..b563351753f1 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2216,7 +2216,7 @@ out: brelse(orphan_dir_bh); - return 0; + return ret; } int ocfs2_create_inode_in_orphan(struct inode *dir, -- cgit v1.2.3 From f101a9464bfbda42730b54a66f926d75ed2cd31e Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 12 Jun 2013 14:04:42 -0700 Subject: memcg: don't initialize kmem-cache destroying work for root caches struct memcg_cache_params has a union. Different parts of this union are used for root and non-root caches. A part with destroying work is used only for non-root caches. BUG: unable to handle kernel paging request at 0000000fffffffe0 IP: kmem_cache_alloc+0x41/0x1f0 Modules linked in: netlink_diag af_packet_diag udp_diag tcp_diag inet_diag unix_diag ip6table_filter ip6_tables i2c_piix4 virtio_net virtio_balloon microcode i2c_core pcspkr floppy CPU: 0 PID: 1929 Comm: lt-vzctl Tainted: G D 3.10.0-rc1+ #2 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 RIP: kmem_cache_alloc+0x41/0x1f0 Call Trace: getname_flags.part.34+0x30/0x140 getname+0x38/0x60 do_sys_open+0xc5/0x1e0 SyS_open+0x22/0x30 system_call_fastpath+0x16/0x1b Code: f4 53 48 83 ec 18 8b 05 8e 53 b7 00 4c 8b 4d 08 21 f0 a8 10 74 0d 4c 89 4d c0 e8 1b 76 4a 00 4c 8b 4d c0 e9 92 00 00 00 4d 89 f5 <4d> 8b 45 00 65 4c 03 04 25 48 cd 00 00 49 8b 50 08 4d 8b 38 49 RIP [] kmem_cache_alloc+0x41/0x1f0 Signed-off-by: Andrey Vagin Cc: Konstantin Khlebnikov Cc: Glauber Costa Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Cc: Li Zefan Cc: [3.9.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 010d6c14129a..931e38c6f095 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3141,8 +3141,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return -ENOMEM; } - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); s->memcg_params->is_root_cache = true; /* -- cgit v1.2.3 From 5a280844bb3bcd79076cac6ad002f71d25c798e5 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Wed, 12 Jun 2013 14:04:44 -0700 Subject: drivers/rtc/rtc-tps6586x.c: device wakeup flags correction Use device_init_wakeup() instead of device_set_wakeup_capable() and move it before rtc dev registering. This fixes alarmtimer not registered when tps6586x rtc is the only wakeup compatible rtc in the system. Signed-off-by: Dmitry Osipenko Cc: Laxman Dewangan Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-tps6586x.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-tps6586x.c b/drivers/rtc/rtc-tps6586x.c index 459c2ffc95a6..426901cef14f 100644 --- a/drivers/rtc/rtc-tps6586x.c +++ b/drivers/rtc/rtc-tps6586x.c @@ -273,6 +273,8 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) return ret; } + device_init_wakeup(&pdev->dev, 1); + platform_set_drvdata(pdev, rtc); rtc->rtc = devm_rtc_device_register(&pdev->dev, dev_name(&pdev->dev), &tps6586x_rtc_ops, THIS_MODULE); @@ -292,7 +294,6 @@ static int tps6586x_rtc_probe(struct platform_device *pdev) goto fail_rtc_register; } disable_irq(rtc->irq); - device_set_wakeup_capable(&pdev->dev, 1); return 0; fail_rtc_register: -- cgit v1.2.3 From ebf8d6c8630bfd3e24683306599cb953c9a2842c Mon Sep 17 00:00:00 2001 From: Derek Basehore Date: Wed, 12 Jun 2013 14:04:45 -0700 Subject: drivers/rtc/rtc-cmos.c: fix accidentally enabling rtc channel During resume, we call hpet_rtc_timer_init after masking an irq bit in hpet. This will cause the call to hpet_disable_rtc_channel to be undone if RTC_AIE is the only bit not masked. Allowing the cmos interrupt handler to run before resuming caused some issues where the timer for the alarm was not removed. This would cause other, later timers to not be cleared, so utilities such as hwclock would time out when waiting for the update interrupt. [akpm@linux-foundation.org: coding-style tweak] Signed-off-by: Derek Basehore Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-cmos.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index cc5bea9c4b1c..f1cb706445c7 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -854,6 +854,9 @@ static int cmos_resume(struct device *dev) } spin_lock_irq(&rtc_lock); + if (device_may_wakeup(dev)) + hpet_rtc_timer_init(); + do { CMOS_WRITE(tmp, RTC_CONTROL); hpet_set_rtc_irq_bit(tmp & RTC_IRQMASK); @@ -869,7 +872,6 @@ static int cmos_resume(struct device *dev) rtc_update_irq(cmos->rtc, 1, mask); tmp &= ~RTC_AIE; hpet_mask_rtc_irq_bit(RTC_AIE); - hpet_rtc_timer_init(); } while (mask & RTC_AIE); spin_unlock_irq(&rtc_lock); } -- cgit v1.2.3 From f000cfdde5de4fc15dead5ccf524359c07eadf2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 12 Jun 2013 14:04:46 -0700 Subject: audit: wait_for_auditd() should use TASK_UNINTERRUPTIBLE audit_log_start() does wait_for_auditd() in a loop until audit_backlog_wait_time passes or audit_skb_queue has a room. If signal_pending() is true this becomes a busy-wait loop, schedule() in TASK_INTERRUPTIBLE won't block. Thanks to Guy for fully investigating and explaining the problem. (akpm: that'll cause the system to lock up on a non-preemptible uniprocessor kernel) (Guy: "Our customer was in fact running a uniprocessor machine, and they reported a system hang.") Signed-off-by: Oleg Nesterov Reported-by: Guy Streeter Cc: Eric Paris Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/audit.c b/kernel/audit.c index 21c7fa615bd3..91e53d04b6a9 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, static void wait_for_auditd(unsigned long sleep_time) { DECLARE_WAITQUEUE(wait, current); - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&audit_backlog_wait, &wait); if (audit_backlog_limit && -- cgit v1.2.3 From 03f47e888daf56c8e9046c674719a0bcc644eed5 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Wed, 12 Jun 2013 14:04:47 -0700 Subject: cciss: fix broken mutex usage in ioctl If a new logical drive is added and the CCISS_REGNEWD ioctl is invoked (as is normal with the Array Configuration Utility) the process will hang as below. It attempts to acquire the same mutex twice, once in do_ioctl() and once in cciss_unlocked_open(). The BKL was recursive, the mutex isn't. Linux version 3.10.0-rc2 (scameron@localhost.localdomain) (gcc version 4.4.7 20120313 (Red Hat 4.4.7-3) (GCC) ) #1 SMP Fri May 24 14:32:12 CDT 2013 [...] acu D 0000000000000001 0 3246 3191 0x00000080 Call Trace: schedule+0x29/0x70 schedule_preempt_disabled+0xe/0x10 __mutex_lock_slowpath+0x17b/0x220 mutex_lock+0x2b/0x50 cciss_unlocked_open+0x2f/0x110 [cciss] __blkdev_get+0xd3/0x470 blkdev_get+0x5c/0x1e0 register_disk+0x182/0x1a0 add_disk+0x17c/0x310 cciss_add_disk+0x13a/0x170 [cciss] cciss_update_drive_info+0x39b/0x480 [cciss] rebuild_lun_table+0x258/0x370 [cciss] cciss_ioctl+0x34f/0x470 [cciss] do_ioctl+0x49/0x70 [cciss] __blkdev_driver_ioctl+0x28/0x30 blkdev_ioctl+0x200/0x7b0 block_ioctl+0x3c/0x40 do_vfs_ioctl+0x89/0x350 SyS_ioctl+0xa1/0xb0 system_call_fastpath+0x16/0x1b This mutex usage was added into the ioctl path when the big kernel lock was removed. As it turns out, these paths are all thread safe anyway (or can easily be made so) and we don't want ioctl() to be single threaded in any case. Signed-off-by: Stephen M. Cameron Cc: Jens Axboe Cc: Mike Miller Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 6374dc103521..62b6c2cc80b5 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -168,8 +168,6 @@ static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id); static int cciss_open(struct block_device *bdev, fmode_t mode); static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode); static void cciss_release(struct gendisk *disk, fmode_t mode); -static int do_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); static int cciss_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo); @@ -235,7 +233,7 @@ static const struct block_device_operations cciss_fops = { .owner = THIS_MODULE, .open = cciss_unlocked_open, .release = cciss_release, - .ioctl = do_ioctl, + .ioctl = cciss_ioctl, .getgeo = cciss_getgeo, #ifdef CONFIG_COMPAT .compat_ioctl = cciss_compat_ioctl, @@ -1143,16 +1141,6 @@ static void cciss_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&cciss_mutex); } -static int do_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - int ret; - mutex_lock(&cciss_mutex); - ret = cciss_ioctl(bdev, mode, cmd, arg); - mutex_unlock(&cciss_mutex); - return ret; -} - #ifdef CONFIG_COMPAT static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode, @@ -1179,7 +1167,7 @@ static int cciss_compat_ioctl(struct block_device *bdev, fmode_t mode, case CCISS_REGNEWD: case CCISS_RESCANDISK: case CCISS_GETLUNINFO: - return do_ioctl(bdev, mode, cmd, arg); + return cciss_ioctl(bdev, mode, cmd, arg); case CCISS_PASSTHRU32: return cciss_ioctl32_passthru(bdev, mode, cmd, arg); @@ -1219,7 +1207,7 @@ static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode, if (err) return -EFAULT; - err = do_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p); + err = cciss_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p); if (err) return err; err |= @@ -1261,7 +1249,7 @@ static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode, if (err) return -EFAULT; - err = do_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p); + err = cciss_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p); if (err) return err; err |= @@ -1311,11 +1299,14 @@ static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp) static int cciss_getintinfo(ctlr_info_t *h, void __user *argp) { cciss_coalint_struct intinfo; + unsigned long flags; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay); intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user (argp, &intinfo, sizeof(cciss_coalint_struct))) return -EFAULT; @@ -1356,12 +1347,15 @@ static int cciss_setintinfo(ctlr_info_t *h, void __user *argp) static int cciss_getnodename(ctlr_info_t *h, void __user *argp) { NodeName_type NodeName; + unsigned long flags; int i; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); for (i = 0; i < 16; i++) NodeName[i] = readb(&h->cfgtable->ServerName[i]); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user(argp, NodeName, sizeof(NodeName_type))) return -EFAULT; return 0; @@ -1398,10 +1392,13 @@ static int cciss_setnodename(ctlr_info_t *h, void __user *argp) static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) { Heartbeat_type heartbeat; + unsigned long flags; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); heartbeat = readl(&h->cfgtable->HeartBeat); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type))) return -EFAULT; return 0; @@ -1410,10 +1407,13 @@ static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) static int cciss_getbustypes(ctlr_info_t *h, void __user *argp) { BusTypes_type BusTypes; + unsigned long flags; if (!argp) return -EINVAL; + spin_lock_irqsave(&h->lock, flags); BusTypes = readl(&h->cfgtable->BusTypes); + spin_unlock_irqrestore(&h->lock, flags); if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type))) return -EFAULT; return 0; -- cgit v1.2.3 From 24b8256a1fb28d357bc6fa09184ba29b4255ba5c Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 12 Jun 2013 14:04:48 -0700 Subject: drivers/rtc/rtc-twl.c: fix missing device_init_wakeup() when booted with device tree When booted in legacy mode device_init_wakeup() gets called by drivers/mfd/twl-core.c when the children are initialized. However, when booted using device tree, the children are created with of_platform_populate() instead add_children(). This means that the RTC driver will not have device_init_wakeup() set, and we need to call it from the driver probe like RTC drivers typically do. Without this we cannot test PM wake-up events on omaps for cases where there may not be any physical wake-up event. Signed-off-by: Tony Lindgren Reported-by: Kevin Hilman Cc: Alessandro Zummo Cc: Jingoo Han Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-twl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c index 8751a5240c99..b2eab34f38d9 100644 --- a/drivers/rtc/rtc-twl.c +++ b/drivers/rtc/rtc-twl.c @@ -524,6 +524,7 @@ static int twl_rtc_probe(struct platform_device *pdev) } platform_set_drvdata(pdev, rtc); + device_init_wakeup(&pdev->dev, 1); return 0; out2: -- cgit v1.2.3 From cbab0e4eec299e9059199ebe6daf48730be46d2b Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Wed, 12 Jun 2013 14:04:49 -0700 Subject: swap: avoid read_swap_cache_async() race to deadlock while waiting on discard I/O completion read_swap_cache_async() can race against get_swap_page(), and stumble across a SWAP_HAS_CACHE entry in the swap map whose page wasn't brought into the swapcache yet. This transient swap_map state is expected to be transitory, but the actual placement of discard at scan_swap_map() inserts a wait for I/O completion thus making the thread at read_swap_cache_async() to loop around its -EEXIST case, while the other end at get_swap_page() is scheduled away at scan_swap_map(). This can leave the system deadlocked if the I/O completion happens to be waiting on the CPU waitqueue where read_swap_cache_async() is busy looping and !CONFIG_PREEMPT. This patch introduces a cond_resched() call to make the aforementioned read_swap_cache_async() busy loop condition to bail out when necessary, thus avoiding the subtle race window. Signed-off-by: Rafael Aquini Acked-by: Johannes Weiner Acked-by: KOSAKI Motohiro Acked-by: Hugh Dickins Cc: Shaohua Li Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index b3d40dcf3624..f24ab0dff554 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -336,8 +336,24 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) { /* seems racy */ + if (err == -EEXIST) { radix_tree_preload_end(); + /* + * We might race against get_swap_page() and stumble + * across a SWAP_HAS_CACHE swap_map entry whose page + * has not been brought into the swapcache yet, while + * the other end is scheduled away waiting on discard + * I/O completion at scan_swap_map(). + * + * In order to avoid turning this transitory state + * into a permanent loop around this -EEXIST case + * if !CONFIG_PREEMPT and the I/O completion happens + * to be waiting on the CPU waitqueue where we are now + * busy looping, we just conditionally invoke the + * scheduler here, if there are some more important + * tasks to run. + */ + cond_resched(); continue; } if (err) { /* swp entry is obsolete ? */ -- cgit v1.2.3 From e099127169429c19544a8f55dd26937fddd5b1f4 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 12 Jun 2013 14:04:51 -0700 Subject: fs/ocfs2/namei.c: remove unecessary ERROR when removing non-empty directory While removing a non-empty directory, the kernel dumps a message: (rmdir,21743,1):ocfs2_unlink:953 ERROR: status = -39 Suppress the error message from being printed in the dmesg so users don't panic. Signed-off-by: Goldwyn Rodrigues Cc: Mark Fasheh Cc: Joel Becker Acked-by: Sunil Mushran Reviewed-by: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b563351753f1..b4a5cdf9dbc5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -947,7 +947,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status) + if (status && (status != -ENOTEMPTY)) mlog_errno(status); return status; -- cgit v1.2.3 From 558c61e5579a81551c0d6c2deaed1da3c7bf714a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 12 Jun 2013 14:04:52 -0700 Subject: rtc-at91rm9200: add match-table compile guard The members of Atmel's at91sam9x5 family (9x5) have a broken RTC interrupt mask register (AT91_RTC_IMR). It does not reflect enabled interrupts but instead always returns zero. The kernel's rtc-at91rm9200 driver handles the RTC for the 9x5 family. Currently when the date/time is set, an interrupt is generated and this driver neglects to handle the interrupt. The kernel complains about the un-handled interrupt and disables it henceforth. This not only breaks the RTC function, but since that interrupt is shared (Atmel's SYS interrupt) then other things break as well (e.g. the debug port no longer accepts characters). Tested on the at91sam9g25. Bug confirmed by Atmel. This patch (of 5): Add missing match-table compile guard. Signed-off-by: Johan Hovold Acked-by: Nicolas Ferre Cc: Douglas Gilbert Cc: Jean-Christophe PLAGNIOL-VILLARD Cc: Ludovic Desroches Cc: Robert Nelson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-at91rm9200.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 0eab77b22340..eeeb73f1fc3b 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -383,11 +383,13 @@ static int at91_rtc_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(at91_rtc_pm_ops, at91_rtc_suspend, at91_rtc_resume); +#ifdef CONFIG_OF static const struct of_device_id at91_rtc_dt_ids[] = { { .compatible = "atmel,at91rm9200-rtc" }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, at91_rtc_dt_ids); +#endif static struct platform_driver at91_rtc_driver = { .remove = __exit_p(at91_rtc_remove), -- cgit v1.2.3 From de645475913f677eb024b3d2bd52e264e8106497 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 12 Jun 2013 14:04:53 -0700 Subject: rtc-at91rm9200: add configuration support Add configuration support which can be used to implement SoC-specific workarounds for broken hardware. Signed-off-by: Johan Hovold Acked-by: Nicolas Ferre Cc: Douglas Gilbert Cc: Jean-Christophe PLAGNIOL-VILLARD Cc: Ludovic Desroches Cc: Robert Nelson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-at91rm9200.c | 46 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index eeeb73f1fc3b..ab2024b159fa 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -42,6 +42,10 @@ #define AT91_RTC_EPOCH 1900UL /* just like arch/arm/common/rtctime.c */ +struct at91_rtc_config { +}; + +static const struct at91_rtc_config *at91_rtc_config; static DECLARE_COMPLETION(at91_rtc_updated); static unsigned int at91_alarm_year = AT91_RTC_EPOCH; static void __iomem *at91_rtc_regs; @@ -250,6 +254,36 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *dev_id) return IRQ_NONE; /* not handled */ } +static const struct at91_rtc_config at91rm9200_config = { +}; + +#ifdef CONFIG_OF +static const struct of_device_id at91_rtc_dt_ids[] = { + { + .compatible = "atmel,at91rm9200-rtc", + .data = &at91rm9200_config, + }, { + /* sentinel */ + } +}; +MODULE_DEVICE_TABLE(of, at91_rtc_dt_ids); +#endif + +static const struct at91_rtc_config * +at91_rtc_get_config(struct platform_device *pdev) +{ + const struct of_device_id *match; + + if (pdev->dev.of_node) { + match = of_match_node(at91_rtc_dt_ids, pdev->dev.of_node); + if (!match) + return NULL; + return (const struct at91_rtc_config *)match->data; + } + + return &at91rm9200_config; +} + static const struct rtc_class_ops at91_rtc_ops = { .read_time = at91_rtc_readtime, .set_time = at91_rtc_settime, @@ -268,6 +302,10 @@ static int __init at91_rtc_probe(struct platform_device *pdev) struct resource *regs; int ret = 0; + at91_rtc_config = at91_rtc_get_config(pdev); + if (!at91_rtc_config) + return -ENODEV; + regs = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!regs) { dev_err(&pdev->dev, "no mmio resource defined\n"); @@ -383,14 +421,6 @@ static int at91_rtc_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(at91_rtc_pm_ops, at91_rtc_suspend, at91_rtc_resume); -#ifdef CONFIG_OF -static const struct of_device_id at91_rtc_dt_ids[] = { - { .compatible = "atmel,at91rm9200-rtc" }, - { /* sentinel */ } -}; -MODULE_DEVICE_TABLE(of, at91_rtc_dt_ids); -#endif - static struct platform_driver at91_rtc_driver = { .remove = __exit_p(at91_rtc_remove), .driver = { -- cgit v1.2.3 From e304fcd075a0e97d0e538dd4408b95406b505f85 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 12 Jun 2013 14:04:55 -0700 Subject: rtc-at91rm9200: refactor interrupt-register handling Add accessors for the interrupt register. This will allow us to easily add a shadow interrupt-mask register to use on SoCs where the interrupt-mask register cannot be used. Signed-off-by: Johan Hovold Acked-by: Nicolas Ferre Cc: Douglas Gilbert Cc: Jean-Christophe PLAGNIOL-VILLARD Cc: Ludovic Desroches Cc: Robert Nelson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-at91rm9200.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index ab2024b159fa..9592d08f3b11 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -51,6 +51,21 @@ static unsigned int at91_alarm_year = AT91_RTC_EPOCH; static void __iomem *at91_rtc_regs; static int irq; +static void at91_rtc_write_ier(u32 mask) +{ + at91_rtc_write(AT91_RTC_IER, mask); +} + +static void at91_rtc_write_idr(u32 mask) +{ + at91_rtc_write(AT91_RTC_IDR, mask); +} + +static u32 at91_rtc_read_imr(void) +{ + return at91_rtc_read(AT91_RTC_IMR); +} + /* * Decode time/date into rtc_time structure */ @@ -114,9 +129,9 @@ static int at91_rtc_settime(struct device *dev, struct rtc_time *tm) cr = at91_rtc_read(AT91_RTC_CR); at91_rtc_write(AT91_RTC_CR, cr | AT91_RTC_UPDCAL | AT91_RTC_UPDTIM); - at91_rtc_write(AT91_RTC_IER, AT91_RTC_ACKUPD); + at91_rtc_write_ier(AT91_RTC_ACKUPD); wait_for_completion(&at91_rtc_updated); /* wait for ACKUPD interrupt */ - at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ACKUPD); + at91_rtc_write_idr(AT91_RTC_ACKUPD); at91_rtc_write(AT91_RTC_TIMR, bin2bcd(tm->tm_sec) << 0 @@ -148,7 +163,7 @@ static int at91_rtc_readalarm(struct device *dev, struct rtc_wkalrm *alrm) tm->tm_yday = rtc_year_days(tm->tm_mday, tm->tm_mon, tm->tm_year); tm->tm_year = at91_alarm_year - 1900; - alrm->enabled = (at91_rtc_read(AT91_RTC_IMR) & AT91_RTC_ALARM) + alrm->enabled = (at91_rtc_read_imr() & AT91_RTC_ALARM) ? 1 : 0; dev_dbg(dev, "%s(): %4d-%02d-%02d %02d:%02d:%02d\n", __func__, @@ -173,7 +188,7 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) tm.tm_min = alrm->time.tm_min; tm.tm_sec = alrm->time.tm_sec; - at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ALARM); + at91_rtc_write_idr(AT91_RTC_ALARM); at91_rtc_write(AT91_RTC_TIMALR, bin2bcd(tm.tm_sec) << 0 | bin2bcd(tm.tm_min) << 8 @@ -186,7 +201,7 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) if (alrm->enabled) { at91_rtc_write(AT91_RTC_SCCR, AT91_RTC_ALARM); - at91_rtc_write(AT91_RTC_IER, AT91_RTC_ALARM); + at91_rtc_write_ier(AT91_RTC_ALARM); } dev_dbg(dev, "%s(): %4d-%02d-%02d %02d:%02d:%02d\n", __func__, @@ -202,9 +217,9 @@ static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) if (enabled) { at91_rtc_write(AT91_RTC_SCCR, AT91_RTC_ALARM); - at91_rtc_write(AT91_RTC_IER, AT91_RTC_ALARM); + at91_rtc_write_ier(AT91_RTC_ALARM); } else - at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ALARM); + at91_rtc_write_idr(AT91_RTC_ALARM); return 0; } @@ -213,7 +228,7 @@ static int at91_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) */ static int at91_rtc_proc(struct device *dev, struct seq_file *seq) { - unsigned long imr = at91_rtc_read(AT91_RTC_IMR); + unsigned long imr = at91_rtc_read_imr(); seq_printf(seq, "update_IRQ\t: %s\n", (imr & AT91_RTC_ACKUPD) ? "yes" : "no"); @@ -233,7 +248,7 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *dev_id) unsigned int rtsr; unsigned long events = 0; - rtsr = at91_rtc_read(AT91_RTC_SR) & at91_rtc_read(AT91_RTC_IMR); + rtsr = at91_rtc_read(AT91_RTC_SR) & at91_rtc_read_imr(); if (rtsr) { /* this interrupt is shared! Is it ours? */ if (rtsr & AT91_RTC_ALARM) events |= (RTC_AF | RTC_IRQF); @@ -328,7 +343,7 @@ static int __init at91_rtc_probe(struct platform_device *pdev) at91_rtc_write(AT91_RTC_MR, 0); /* 24 hour mode */ /* Disable all interrupts */ - at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ACKUPD | AT91_RTC_ALARM | + at91_rtc_write_idr(AT91_RTC_ACKUPD | AT91_RTC_ALARM | AT91_RTC_SECEV | AT91_RTC_TIMEV | AT91_RTC_CALEV); @@ -373,7 +388,7 @@ static int __exit at91_rtc_remove(struct platform_device *pdev) struct rtc_device *rtc = platform_get_drvdata(pdev); /* Disable all interrupts */ - at91_rtc_write(AT91_RTC_IDR, AT91_RTC_ACKUPD | AT91_RTC_ALARM | + at91_rtc_write_idr(AT91_RTC_ACKUPD | AT91_RTC_ALARM | AT91_RTC_SECEV | AT91_RTC_TIMEV | AT91_RTC_CALEV); free_irq(irq, pdev); @@ -396,13 +411,13 @@ static int at91_rtc_suspend(struct device *dev) /* this IRQ is shared with DBGU and other hardware which isn't * necessarily doing PM like we are... */ - at91_rtc_imr = at91_rtc_read(AT91_RTC_IMR) + at91_rtc_imr = at91_rtc_read_imr() & (AT91_RTC_ALARM|AT91_RTC_SECEV); if (at91_rtc_imr) { if (device_may_wakeup(dev)) enable_irq_wake(irq); else - at91_rtc_write(AT91_RTC_IDR, at91_rtc_imr); + at91_rtc_write_idr(at91_rtc_imr); } return 0; } @@ -413,7 +428,7 @@ static int at91_rtc_resume(struct device *dev) if (device_may_wakeup(dev)) disable_irq_wake(irq); else - at91_rtc_write(AT91_RTC_IER, at91_rtc_imr); + at91_rtc_write_ier(at91_rtc_imr); } return 0; } -- cgit v1.2.3 From e9f08bbe3f97829975d2b59091ef557101c83f61 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 12 Jun 2013 14:04:56 -0700 Subject: rtc-at91rm9200: add shadow interrupt mask Add shadow interrupt-mask register which can be used on SoCs where the actual hardware register is broken. Note that some care needs to be taken to make sure the shadow mask corresponds to the actual hardware state. The added overhead is not an issue for the non-broken SoCs due to the relatively infrequent interrupt-mask updates. We do, however, only use the shadow mask value as a fall-back when it actually needed as there is still a theoretical possibility that the mask is incorrect (see the code for details). Signed-off-by: Johan Hovold Acked-by: Nicolas Ferre Cc: Douglas Gilbert Cc: Jean-Christophe PLAGNIOL-VILLARD Cc: Ludovic Desroches Cc: Robert Nelson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-at91rm9200.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 9592d08f3b11..811a102092d4 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #define AT91_RTC_EPOCH 1900UL /* just like arch/arm/common/rtctime.c */ struct at91_rtc_config { + bool use_shadow_imr; }; static const struct at91_rtc_config *at91_rtc_config; @@ -50,20 +52,55 @@ static DECLARE_COMPLETION(at91_rtc_updated); static unsigned int at91_alarm_year = AT91_RTC_EPOCH; static void __iomem *at91_rtc_regs; static int irq; +static DEFINE_SPINLOCK(at91_rtc_lock); +static u32 at91_rtc_shadow_imr; static void at91_rtc_write_ier(u32 mask) { + unsigned long flags; + + spin_lock_irqsave(&at91_rtc_lock, flags); + at91_rtc_shadow_imr |= mask; at91_rtc_write(AT91_RTC_IER, mask); + spin_unlock_irqrestore(&at91_rtc_lock, flags); } static void at91_rtc_write_idr(u32 mask) { + unsigned long flags; + + spin_lock_irqsave(&at91_rtc_lock, flags); at91_rtc_write(AT91_RTC_IDR, mask); + /* + * Register read back (of any RTC-register) needed to make sure + * IDR-register write has reached the peripheral before updating + * shadow mask. + * + * Note that there is still a possibility that the mask is updated + * before interrupts have actually been disabled in hardware. The only + * way to be certain would be to poll the IMR-register, which is is + * the very register we are trying to emulate. The register read back + * is a reasonable heuristic. + */ + at91_rtc_read(AT91_RTC_SR); + at91_rtc_shadow_imr &= ~mask; + spin_unlock_irqrestore(&at91_rtc_lock, flags); } static u32 at91_rtc_read_imr(void) { - return at91_rtc_read(AT91_RTC_IMR); + unsigned long flags; + u32 mask; + + if (at91_rtc_config->use_shadow_imr) { + spin_lock_irqsave(&at91_rtc_lock, flags); + mask = at91_rtc_shadow_imr; + spin_unlock_irqrestore(&at91_rtc_lock, flags); + } else { + mask = at91_rtc_read(AT91_RTC_IMR); + } + + return mask; } /* -- cgit v1.2.3 From bba00e59107275faa615573c44eb0a513a1220a6 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 12 Jun 2013 14:04:57 -0700 Subject: rtc-at91rm9200: use shadow IMR on at91sam9x5 Add support for the at91sam9x5-family which must use the shadow interrupt mask due to a hardware issue (causing RTC_IMR to always be zero). Signed-off-by: Johan Hovold Acked-by: Nicolas Ferre Cc: Douglas Gilbert Cc: Jean-Christophe PLAGNIOL-VILLARD Cc: Ludovic Desroches Cc: Robert Nelson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/devicetree/bindings/rtc/atmel,at91rm9200-rtc.txt | 2 +- drivers/rtc/rtc-at91rm9200.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/rtc/atmel,at91rm9200-rtc.txt b/Documentation/devicetree/bindings/rtc/atmel,at91rm9200-rtc.txt index 2a3feabd3b22..34c1505774bf 100644 --- a/Documentation/devicetree/bindings/rtc/atmel,at91rm9200-rtc.txt +++ b/Documentation/devicetree/bindings/rtc/atmel,at91rm9200-rtc.txt @@ -1,7 +1,7 @@ Atmel AT91RM9200 Real Time Clock Required properties: -- compatible: should be: "atmel,at91rm9200-rtc" +- compatible: should be: "atmel,at91rm9200-rtc" or "atmel,at91sam9x5-rtc" - reg: physical base address of the controller and length of memory mapped region. - interrupts: rtc alarm/event interrupt diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c index 811a102092d4..f296f3f7db9b 100644 --- a/drivers/rtc/rtc-at91rm9200.c +++ b/drivers/rtc/rtc-at91rm9200.c @@ -309,11 +309,18 @@ static irqreturn_t at91_rtc_interrupt(int irq, void *dev_id) static const struct at91_rtc_config at91rm9200_config = { }; +static const struct at91_rtc_config at91sam9x5_config = { + .use_shadow_imr = true, +}; + #ifdef CONFIG_OF static const struct of_device_id at91_rtc_dt_ids[] = { { .compatible = "atmel,at91rm9200-rtc", .data = &at91rm9200_config, + }, { + .compatible = "atmel,at91sam9x5-rtc", + .data = &at91sam9x5_config, }, { /* sentinel */ } -- cgit v1.2.3 From 4fcc712f5c48b1e32cdbf9b9cfba42a27b2e3160 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 12 Jun 2013 14:04:59 -0700 Subject: aio: fix io_destroy() regression by using call_rcu() There was a regression introduced by 36f5588905c1 ("aio: refcounting cleanup"), reported by Jens Axboe - the refcounting cleanup switched to using RCU in the shutdown path, but the synchronize_rcu() was done in the context of the io_destroy() syscall greatly increasing the time it could block. This patch switches it to call_rcu() and makes shutdown asynchronous (more asynchronous than it was originally; before the refcount changes io_destroy() would still wait on pending kiocbs). Note that there's a global quota on the max outstanding kiocbs, and that quota must be manipulated synchronously; otherwise io_setup() could return -EAGAIN when there isn't quota available, and userspace won't have any way of waiting until shutdown of the old kioctxs has finished (besides busy looping). So we release our quota before kioctx shutdown has finished, which should be fine since the quota never corresponded to anything real anyways. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Reported-by: Jens Axboe Tested-by: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Signed-off-by: Benjamin LaHaise Tested-by: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 7fe5bdee1630..2bbcacf74d0c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -141,9 +141,6 @@ static void aio_free_ring(struct kioctx *ctx) for (i = 0; i < ctx->nr_pages; i++) put_page(ctx->ring_pages[i]); - if (ctx->mmap_size) - vm_munmap(ctx->mmap_base, ctx->mmap_size); - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) kfree(ctx->ring_pages); } @@ -322,11 +319,6 @@ static void free_ioctx(struct kioctx *ctx) aio_free_ring(ctx); - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - ctx->max_reqs > aio_nr); - aio_nr -= ctx->max_reqs; - spin_unlock(&aio_nr_lock); - pr_debug("freeing %p\n", ctx); /* @@ -435,17 +427,24 @@ static void kill_ioctx(struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { hlist_del_rcu(&ctx->list); - /* Between hlist_del_rcu() and dropping the initial ref */ - synchronize_rcu(); /* - * We can't punt to workqueue here because put_ioctx() -> - * free_ioctx() will unmap the ringbuffer, and that has to be - * done in the original process's context. kill_ioctx_rcu/work() - * exist for exit_aio(), as in that path free_ioctx() won't do - * the unmap. + * It'd be more correct to do this in free_ioctx(), after all + * the outstanding kiocbs have finished - but by then io_destroy + * has already returned, so io_setup() could potentially return + * -EAGAIN with no ioctxs actually in use (as far as userspace + * could tell). */ - kill_ioctx_work(&ctx->rcu_work); + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); + + /* Between hlist_del_rcu() and dropping the initial ref */ + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); } } @@ -495,10 +494,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); - } + kill_ioctx(ctx); } } -- cgit v1.2.3 From 282c4c0ecce9b9ac1b69acae32a4239441601405 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 12 Jun 2013 14:05:00 -0700 Subject: drivers/misc/sgi-gru/grufile.c: fix info leak in gru_get_config_info() The "info.fill" array isn't initialized so it can leak uninitialized stack information to user space. Signed-off-by: Dan Carpenter Acked-by: Robin Holt Acked-by: Dimitri Sivanich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/sgi-gru/grufile.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index 44d273c5e19d..0535d1e0bc78 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -172,6 +172,7 @@ static long gru_get_config_info(unsigned long arg) nodesperblade = 2; else nodesperblade = 1; + memset(&info, 0, sizeof(info)); info.cpus = num_online_cpus(); info.nodes = num_online_nodes(); info.blades = info.nodes / nodesperblade; -- cgit v1.2.3 From 026b08147923142e925a7d0aaa39038055ae0156 Mon Sep 17 00:00:00 2001 From: Tomasz Stanislawski Date: Wed, 12 Jun 2013 14:05:02 -0700 Subject: mm/page_alloc.c: fix watermark check in __zone_watermark_ok() The watermark check consists of two sub-checks. The first one is: if (free_pages <= min + lowmem_reserve) return false; The check assures that there is minimal amount of RAM in the zone. If CMA is used then the free_pages is reduced by the number of free pages in CMA prior to the over-mentioned check. if (!(alloc_flags & ALLOC_CMA)) free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); This prevents the zone from being drained from pages available for non-movable allocations. The second check prevents the zone from getting too fragmented. for (o = 0; o < order; o++) { free_pages -= z->free_area[o].nr_free << o; min >>= 1; if (free_pages <= min) return false; } The field z->free_area[o].nr_free is equal to the number of free pages including free CMA pages. Therefore the CMA pages are subtracted twice. This may cause a false positive fail of __zone_watermark_ok() if the CMA area gets strongly fragmented. In such a case there are many 0-order free pages located in CMA. Those pages are subtracted twice therefore they will quickly drain free_pages during the check against fragmentation. The test fails even though there are many free non-cma pages in the zone. This patch fixes this issue by subtracting CMA pages only for a purpose of (free_pages <= min + lowmem_reserve) check. Laura said: We were observing allocation failures of higher order pages (order 5 = 128K typically) under tight memory conditions resulting in driver failure. The output from the page allocation failure showed plenty of free pages of the appropriate order/type/zone and mostly CMA pages in the lower orders. For full disclosure, we still observed some page allocation failures even after applying the patch but the number was drastically reduced and those failures were attributed to fragmentation/other system issues. Signed-off-by: Tomasz Stanislawski Signed-off-by: Kyungmin Park Tested-by: Laura Abbott Cc: Bartlomiej Zolnierkiewicz Acked-by: Minchan Kim Cc: Mel Gorman Tested-by: Marek Szyprowski Cc: [3.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 378a15bcd649..c3edb624fccf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1628,6 +1628,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, long min = mark; long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; + long free_cma = 0; free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) @@ -1637,9 +1638,10 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); + free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages <= min + lowmem_reserve) + + if (free_pages - free_cma <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ -- cgit v1.2.3 From 27749f2ff0717e115680922000839ad6a576eddf Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 12 Jun 2013 14:05:03 -0700 Subject: ocfs2: add missing lockres put in dlm_mig_lockres_handler dlm_mig_lockres_handler() is missing a dlm_lockres_put() on an error path. Signed-off-by: joyce Reviewed-by: shencanquan Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index b3fdd1a323d6..e68588e6b1e8 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1408,6 +1408,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; -- cgit v1.2.3 From 30dad30922ccc733cfdbfe232090cf674dc374dc Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 12 Jun 2013 14:05:04 -0700 Subject: mm: migration: add migrate_entry_wait_huge() When we have a page fault for the address which is backed by a hugepage under migration, the kernel can't wait correctly and do busy looping on hugepage fault until the migration finishes. As a result, users who try to kick hugepage migration (via soft offlining, for example) occasionally experience long delay or soft lockup. This is because pte_offset_map_lock() can't get a correct migration entry or a correct page table lock for hugepage. This patch introduces migration_entry_wait_huge() to solve this. Signed-off-by: Naoya Horiguchi Reviewed-by: Rik van Riel Reviewed-by: Wanpeng Li Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Andi Kleen Cc: KOSAKI Motohiro Cc: [2.6.35+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 3 +++ mm/hugetlb.c | 2 +- mm/migrate.c | 23 ++++++++++++++++++----- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 47ead515c811..c5fd30d2a415 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -137,6 +137,7 @@ static inline void make_migration_entry_read(swp_entry_t *entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); #else #define make_migration_entry(page, write) swp_entry(0, 0) @@ -148,6 +149,8 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } +static inline void migration_entry_wait_huge(struct mm_struct *mm, + pte_t *pte) { } static inline int is_write_migration_entry(swp_entry_t entry) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f8feeeca6686..e2bfbf73a551 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2839,7 +2839,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait(mm, (pmd_t *)ptep, address); + migration_entry_wait_huge(mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | diff --git a/mm/migrate.c b/mm/migrate.c index b1f57501de9c..6f0c24438bba 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -200,15 +200,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) +static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { - pte_t *ptep, pte; - spinlock_t *ptl; + pte_t pte; swp_entry_t entry; struct page *page; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) goto out; @@ -236,6 +235,20 @@ out: pte_unmap_unlock(ptep, ptl); } +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + spinlock_t *ptl = pte_lockptr(mm, pmd); + pte_t *ptep = pte_offset_map(pmd, address); + __migration_entry_wait(mm, ptep, ptl); +} + +void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl = &(mm)->page_table_lock; + __migration_entry_wait(mm, pte, ptl); +} + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, -- cgit v1.2.3 From 736f3203a06eafd0944103775a98584082744c6b Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 12 Jun 2013 14:05:07 -0700 Subject: kernel/audit_tree.c:audit_add_tree_rule(): protect `rule' from kill_rules() audit_add_tree_rule() must set 'rule->tree = NULL;' firstly, to protect the rule itself freed in kill_rules(). The reason is when it is killed, the 'rule' itself may have already released, we should not access it. one example: we add a rule to an inode, just at the same time the other task is deleting this inode. The work flow for adding a rule: audit_receive() -> (need audit_cmd_mutex lock) audit_receive_skb() -> audit_receive_msg() -> audit_receive_filter() -> audit_add_rule() -> audit_add_tree_rule() -> (need audit_filter_mutex lock) ... unlock audit_filter_mutex get_tree() ... iterate_mounts() -> (iterate all related inodes) tag_mount() -> tag_trunk() -> create_trunk() -> (assume it is 1st rule) fsnotify_add_mark() -> fsnotify_add_inode_mark() -> (add mark to inode->i_fsnotify_marks) ... get_tree(); (each inode will get one) ... lock audit_filter_mutex The work flow for deleting an inode: __destroy_inode() -> fsnotify_inode_delete() -> __fsnotify_inode_delete() -> fsnotify_clear_marks_by_inode() -> (get mark from inode->i_fsnotify_marks) fsnotify_destroy_mark() -> fsnotify_destroy_mark_locked() -> audit_tree_freeing_mark() -> evict_chunk() -> ... tree->goner = 1 ... kill_rules() -> (assume current->audit_context == NULL) call_rcu() -> (rule->tree != NULL) audit_free_rule_rcu() -> audit_free_rule() ... audit_schedule_prune() -> (assume current->audit_context == NULL) kthread_run() -> (need audit_cmd_mutex and audit_filter_mutex lock) prune_one() -> (delete it from prue_list) put_tree(); (match the original get_tree above) Signed-off-by: Chen Gang Cc: Eric Paris Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit_tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index a291aa23fb3f..43c307dc9453 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct vfsmount *mnt; int err; + rule->tree = NULL; list_for_each_entry(tree, &tree_list, list) { if (!strcmp(seed->pathname, tree->pathname)) { put_tree(seed); -- cgit v1.2.3 From 7b57976da48e60b66fdbb9e97f5711b5382a49d7 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 12 Jun 2013 14:05:08 -0700 Subject: frontswap: fix incorrect zeroing and allocation size for frontswap_map The bitmap accessed by bitops must have enough size to hold the required numbers of bits rounded up to a multiple of BITS_PER_LONG. And the bitmap must not be zeroed by memset() if the number of bits cleared is not a multiple of BITS_PER_LONG. This fixes incorrect zeroing and allocation size for frontswap_map. The incorrect zeroing part doesn't cause any problem because frontswap_map is freed just after zeroing. But the wrongly calculated allocation size may cause the problem. For 32bit systems, the allocation size of frontswap_map is about twice as large as required size. For 64bit systems, the allocation size is smaller than requeired if the number of bits is not a multiple of BITS_PER_LONG. Signed-off-by: Akinobu Mita Cc: Konrad Rzeszutek Wilk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 2 +- mm/swapfile.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/frontswap.c b/mm/frontswap.c index 538367ef1372..1b24bdcb3197 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -319,7 +319,7 @@ void __frontswap_invalidate_area(unsigned type) return; frontswap_ops->invalidate_area(type); atomic_set(&sis->frontswap_pages, 0); - memset(sis->frontswap_map, 0, sis->max / sizeof(long)); + bitmap_zero(sis->frontswap_map, sis->max); } clear_bit(type, need_init); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c340d908b27..746af55b8455 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2116,7 +2116,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } /* frontswap enabled? set up bit-per-page map for frontswap */ if (frontswap_enabled) - frontswap_map = vzalloc(maxpages / sizeof(long)); + frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { -- cgit v1.2.3 From 89dc991f0f5272c307c746fdd57d0bff382b1ba2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 12 Jun 2013 14:05:09 -0700 Subject: mm: memcontrol: fix lockless reclaim hierarchy iterator The lockless reclaim hierarchy iterator currently has a misplaced barrier that can lead to use-after-free crashes. The reclaim hierarchy iterator consist of a sequence count and a position pointer that are read and written locklessly, with memory barriers enforcing ordering. The write side sets the position pointer first, then updates the sequence count to "publish" the new position. Likewise, the read side must read the sequence count first, then the position. If the sequence count is up to date, it's guaranteed that the position is up to date as well: writer: reader: iter->position = position if iter->sequence == expected: smp_wmb() smp_rmb() iter->sequence = sequence position = iter->position However, the read side barrier is currently misplaced, which can lead to dereferencing stale position pointers that no longer point to valid memory. Fix this. Signed-off-by: Johannes Weiner Reported-by: Tejun Heo Reviewed-by: Tejun Heo Acked-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Glauber Costa Cc: [3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 931e38c6f095..194721839cf5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1199,7 +1199,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; - last_visited = iter->last_visited; if (prev && reclaim->generation != iter->generation) { iter->last_visited = NULL; goto out_unlock; @@ -1218,13 +1217,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, * is alive. */ dead_count = atomic_read(&root->dead_count); - smp_rmb(); - last_visited = iter->last_visited; - if (last_visited) { - if ((dead_count != iter->last_dead_count) || - !css_tryget(&last_visited->css)) { + if (dead_count == iter->last_dead_count) { + smp_rmb(); + last_visited = iter->last_visited; + if (last_visited && + !css_tryget(&last_visited->css)) last_visited = NULL; - } } } -- cgit v1.2.3 From c2853c8df57f49620d26f317d7d43347c29bfc2e Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 12 Jun 2013 14:05:10 -0700 Subject: include/linux/math64.h: add div64_ul() There is div64_long() to handle the s64/long division, but no mocro do u64/ul division. It is necessary in some scenarios, so add this function. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alex Shi Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/math64.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/math64.h b/include/linux/math64.h index b8ba85544721..2913b86eb12a 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -6,7 +6,8 @@ #if BITS_PER_LONG == 64 -#define div64_long(x,y) div64_s64((x),(y)) +#define div64_long(x, y) div64_s64((x), (y)) +#define div64_ul(x, y) div64_u64((x), (y)) /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder @@ -47,7 +48,8 @@ static inline s64 div64_s64(s64 dividend, s64 divisor) #elif BITS_PER_LONG == 32 -#define div64_long(x,y) div_s64((x),(y)) +#define div64_long(x, y) div_s64((x), (y)) +#define div64_ul(x, y) div_u64((x), (y)) #ifndef div_u64_rem static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) -- cgit v1.2.3