From e926d8a1e8675422e53104855a7bedec82fb570f Mon Sep 17 00:00:00 2001 From: ChenGang Date: Thu, 11 Jul 2019 20:52:55 -0700 Subject: fs: ocfs: fix spelling mistake "hearbeating" -> "heartbeat" There are some spelling mistakes in ocfs, fix it. Link: http://lkml.kernel.org/r/1558964623-106628-1-git-send-email-cg.chen@huawei.com Signed-off-by: ChenGang Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/cluster/quorum.c | 2 +- fs/ocfs2/cluster/tcp.c | 2 +- fs/ocfs2/dlm/dlmmaster.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 7a3a096856a8..7f74fcc6d7d9 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1184,7 +1184,7 @@ bail: if (atomic_read(&reg->hr_steady_iterations) != 0) { if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) { printk(KERN_NOTICE "o2hb: Unable to stabilize " - "heartbeart on region %s (%s)\n", + "heartbeat on region %s (%s)\n", config_item_name(&reg->hr_item), reg->hr_dev_name); atomic_set(&reg->hr_steady_iterations, 0); diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 3d5d4b2b1356..5c424a099280 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -76,7 +76,7 @@ static void o2quo_fence_self(void) }; } -/* Indicate that a timeout occurred on a hearbeat region write. The +/* Indicate that a timeout occurred on a heartbeat region write. The * other nodes in the cluster may consider us dead at that time so we * want to "fence" ourselves so that we don't scribble on the disk * after they think they've recovered us.
This can't solve all diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index c599463d0694..c07c9aac537a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1762,7 +1762,7 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, (msecs_to_jiffies(o2net_reconnect_delay()) + 1); if (node_num != o2nm_this_node()) { - /* believe it or not, accept and node hearbeating testing + /* believe it or not, accept and node heartbeating testing * can succeed for this node before we got here.. so * only use set_nn_state to clear the persistent error * if that hasn't already happened */ diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 810f841494ef..74b768ca1cd8 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2161,7 +2161,7 @@ put: * think that $RECOVERY is currently mastered by a dead node. If so, * we wait a short time to allow that node to get notified by its own * heartbeat stack, then check again. All $RECOVERY lock resources - * mastered by dead nodes are purged when the hearbeat callback is + * mastered by dead nodes are purged when the heartbeat callback is * fired, so we can know for sure that it is safe to continue once * the node returns a live node or no node. */ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, -- cgit v1.2.3 From 0e71666b8b9e21e4cb5d805219eb5ed7c5617ca3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 11 Jul 2019 20:52:58 -0700 Subject: ocfs2/dlm: use struct_size() helper One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct dlm_migratable_lockres { ... struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 }; Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. 
So, replace the following form: sizeof(struct dlm_migratable_lockres) + (mres->num_locks * sizeof(struct dlm_migratable_lock)) with: struct_size(mres, ml, mres->num_locks) Notice that, in this case, variable sz is not necessary, hence it is removed. This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20190605204926.GA24467@embeddedor Signed-off-by: Gustavo A. R. Silva Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index e22d6a115220..064ce5bbc3f6 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1109,7 +1109,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, { u64 mig_cookie = be64_to_cpu(mres->mig_cookie); int mres_total_locks = be32_to_cpu(mres->total_locks); - int sz, ret = 0, status = 0; + int ret = 0, status = 0; u8 orig_flags = mres->flags, orig_master = mres->master; @@ -1117,9 +1117,6 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, if (!mres->num_locks) return 0; - sz = sizeof(struct dlm_migratable_lockres) + - (mres->num_locks * sizeof(struct dlm_migratable_lock)); - /* add an all-done flag if we reached the last lock */ orig_flags = mres->flags; BUG_ON(total_locks > mres_total_locks); @@ -1133,7 +1130,8 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, /* send it */ ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, - sz, send_to, &status); + struct_size(mres, ml, mres->num_locks), + send_to, &status); if (ret < 0) { /* XXX: negative status is not handled. * this will end up killing this node. 
*/ -- cgit v1.2.3 From 8a7f5f4c26dd4e969b5f3b30d06c54dc6a520eda Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 11 Jul 2019 20:53:02 -0700 Subject: ocfs2: add last unlock times in locking_state ocfs2 file system uses locking_state file under debugfs to dump each ocfs2 file system's dlm lock resources, but the dlm lock resources in memory are becoming more and more after the files were touched by the user. It will become a bit difficult to analyze these dlm lock resource records in locking_state file by the upper scripts, though some files are not active for now, which were accessed long time ago. Then, I'd like to add last pr/ex unlock times in locking_state file for each dlm lock resource record, so the upper scripts can use last unlock time to filter inactive dlm lock resource record. Link: http://lkml.kernel.org/r/20190611015414.27754-1-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 18 +++++++++++++++--- fs/ocfs2/ocfs2.h | 1 + 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index b5fc5d3c7525..5f696be267e7 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -460,6 +460,8 @@ static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, if (ret) stats->ls_fail++; + + stats->ls_last = ktime_to_us(ktime_get_real()); } static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) @@ -3079,8 +3081,10 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) * - Lock stats printed * New in version 3 * - Max time in lock stats is in usecs (instead of nsecs) + * New in version 4 + * - Add last pr/ex unlock times in usecs */ -#define OCFS2_DLM_DEBUG_STR_VERSION 3 +#define OCFS2_DLM_DEBUG_STR_VERSION 4 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) {
int i; @@ -3131,6 +3135,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max) # define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max) # define lock_refresh(_l) ((_l)->l_lock_refresh) +# define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last) +# define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last) #else # define lock_num_prmode(_l) (0) # define lock_num_exmode(_l) (0) @@ -3141,6 +3147,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_max_prmode(_l) (0) # define lock_max_exmode(_l) (0) # define lock_refresh(_l) (0) +# define lock_last_prmode(_l) (0ULL) +# define lock_last_exmode(_l) (0ULL) #endif /* The following seq_print was added in version 2 of this output */ seq_printf(m, "%u\t" @@ -3151,7 +3159,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) "%llu\t" "%u\t" "%u\t" - "%u\t", + "%u\t" + "%llu\t" + "%llu\t", lock_num_prmode(lockres), lock_num_exmode(lockres), lock_num_prmode_failed(lockres), @@ -3160,7 +3170,9 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) lock_total_exmode(lockres), lock_max_prmode(lockres), lock_max_exmode(lockres), - lock_refresh(lockres)); + lock_refresh(lockres), + lock_last_prmode(lockres), + lock_last_exmode(lockres)); /* End the line */ seq_printf(m, "\n"); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index a4647a646f07..5c111eabaa1d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -150,6 +150,7 @@ struct ocfs2_lock_stats { /* Storing max wait in usecs saves 24 bytes per inode */ u32 ls_max; /* Max wait in USEC */ + u64 ls_last; /* Last unlock time in USEC */ }; #endif -- cgit v1.2.3 From 8056773ac4b42f36bae6406030218a5f12749c64 Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 11 Jul 2019 20:53:05 -0700 Subject: ocfs2: add locking filter debugfs file Add locking filter debugfs file, which is used to filter lock resources dump from locking_state debugfs file. 
We use d_filter_secs field to filter lock resources dump, the default d_filter_secs(0) value filters nothing, otherwise, only dump the last N seconds active lock resources. This enhancement can avoid dumping lots of old records. The d_filter_secs value can be changed via locking_filter file. [akpm@linux-foundation.org: fix undefined reference to `__udivdi3'] Link: http://lkml.kernel.org/r/20190611015414.27754-2-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Acked-by: Randy Dunlap [build-tested] Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/ocfs2/ocfs2.h | 2 ++ 2 files changed, 40 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 5f696be267e7..4089daba4c6f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2991,6 +2991,8 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) kref_init(&dlm_debug->d_refcnt); INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); dlm_debug->d_locking_state = NULL; + dlm_debug->d_locking_filter = NULL; + dlm_debug->d_filter_secs = 0; out: return dlm_debug; } @@ -3090,10 +3092,34 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) int i; char *lvb; struct ocfs2_lock_res *lockres = v; +#ifdef CONFIG_OCFS2_FS_STATS + u64 now, last; + struct ocfs2_dlm_debug *dlm_debug = + ((struct ocfs2_dlm_seq_priv *)m->private)->p_dlm_debug; +#endif if (!lockres) return -EINVAL; +#ifdef CONFIG_OCFS2_FS_STATS + if (dlm_debug->d_filter_secs) { + now = ktime_to_us(ktime_get_real()); + if (lockres->l_lock_prmode.ls_last > + lockres->l_lock_exmode.ls_last) + last = lockres->l_lock_prmode.ls_last; + else + last = lockres->l_lock_exmode.ls_last; + /* + * Use d_filter_secs field to filter lock resources dump, + * the default d_filter_secs(0) value filters nothing, + * otherwise, only dump the last N seconds active lock + * 
resources. + */ + if (div_u64(now - last, 1000000) > dlm_debug->d_filter_secs) + return 0; + } +#endif + seq_printf(m, "0x%x\t", OCFS2_DLM_DEBUG_STR_VERSION); if (lockres->l_type == OCFS2_LOCK_TYPE_DENTRY) @@ -3243,6 +3269,17 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) goto out; } + dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", + 0600, + osb->osb_debug_root, + &dlm_debug->d_filter_secs); + if (!dlm_debug->d_locking_filter) { + ret = -EINVAL; + mlog(ML_ERROR, + "Unable to create locking filter debugfs file.\n"); + goto out; + } + ocfs2_get_dlm_debug(dlm_debug); out: return ret; @@ -3254,6 +3291,7 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) if (dlm_debug) { debugfs_remove(dlm_debug->d_locking_state); + debugfs_remove(dlm_debug->d_locking_filter); ocfs2_put_dlm_debug(dlm_debug); } } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 5c111eabaa1d..c7539601555b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -223,6 +223,8 @@ struct ocfs2_orphan_scan { struct ocfs2_dlm_debug { struct kref d_refcnt; struct dentry *d_locking_state; + struct dentry *d_locking_filter; + u32 d_filter_secs; struct list_head d_lockres_tracking; }; -- cgit v1.2.3 From 5da844a2c7df642de2618fc3efe9a92eec40899d Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 11 Jul 2019 20:53:09 -0700 Subject: ocfs2: add first lock wait time in locking_state ocfs2 file system uses locking_state file under debugfs to dump each ocfs2 file system's dlm lock resources, but users have encountered some hang (deadlock) problems in ocfs2 file systems. I'd like to add first lock wait time in locking_state file, which can help the upper scripts detect these deadlock problems by comparing the first lock wait time with the current time.
Link: http://lkml.kernel.org/r/20190611015414.27754-3-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 32 +++++++++++++++++++++++++++++--- fs/ocfs2/ocfs2.h | 1 + 2 files changed, 30 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4089daba4c6f..cf90688ff2ed 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -426,6 +426,7 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) { res->l_lock_refresh = 0; + res->l_lock_wait = 0; memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats)); memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats)); } @@ -469,6 +470,21 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) lockres->l_lock_refresh++; } +static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_mask_waiter *mw; + + if (list_empty(&lockres->l_mask_waiters)) { + lockres->l_lock_wait = 0; + return; + } + + mw = list_first_entry(&lockres->l_mask_waiters, + struct ocfs2_mask_waiter, mw_item); + lockres->l_lock_wait = + ktime_to_us(ktime_mono_to_real(mw->mw_lock_start)); +} + static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) { mw->mw_lock_start = ktime_get(); @@ -484,6 +500,9 @@ static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) { } +static inline void ocfs2_track_lock_wait(struct ocfs2_lock_res *lockres) +{ +} static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) { } @@ -877,6 +896,7 @@ static void lockres_set_flags(struct ocfs2_lock_res *lockres, list_del_init(&mw->mw_item); mw->mw_status = 0; complete(&mw->mw_complete); + 
ocfs2_track_lock_wait(lockres); } } static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) @@ -1388,6 +1408,7 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); mw->mw_mask = mask; mw->mw_goal = goal; + ocfs2_track_lock_wait(lockres); } /* returns 0 if the mw that was removed was already satisfied, -EBUSY @@ -1404,6 +1425,7 @@ static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, list_del_init(&mw->mw_item); init_completion(&mw->mw_complete); + ocfs2_track_lock_wait(lockres); } return ret; @@ -3084,7 +3106,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) * New in version 3 * - Max time in lock stats is in usecs (instead of nsecs) * New in version 4 - * - Add last pr/ex unlock times in usecs + * - Add last pr/ex unlock times and first lock wait time in usecs */ #define OCFS2_DLM_DEBUG_STR_VERSION 4 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) @@ -3102,7 +3124,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) return -EINVAL; #ifdef CONFIG_OCFS2_FS_STATS - if (dlm_debug->d_filter_secs) { + if (!lockres->l_lock_wait && dlm_debug->d_filter_secs) { now = ktime_to_us(ktime_get_real()); if (lockres->l_lock_prmode.ls_last > lockres->l_lock_exmode.ls_last) @@ -3163,6 +3185,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_refresh(_l) ((_l)->l_lock_refresh) # define lock_last_prmode(_l) ((_l)->l_lock_prmode.ls_last) # define lock_last_exmode(_l) ((_l)->l_lock_exmode.ls_last) +# define lock_wait(_l) ((_l)->l_lock_wait) #else # define lock_num_prmode(_l) (0) # define lock_num_exmode(_l) (0) @@ -3175,6 +3198,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) # define lock_refresh(_l) (0) # define lock_last_prmode(_l) (0ULL) # define lock_last_exmode(_l) (0ULL) +# define lock_wait(_l) (0ULL) #endif /* The following seq_print was added in version 2 of this output */ 
seq_printf(m, "%u\t" @@ -3187,6 +3211,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) "%u\t" "%u\t" "%llu\t" + "%llu\t" "%llu\t", lock_num_prmode(lockres), lock_num_exmode(lockres), @@ -3198,7 +3223,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) lock_max_exmode(lockres), lock_refresh(lockres), lock_last_prmode(lockres), - lock_last_exmode(lockres)); + lock_last_exmode(lockres), + lock_wait(lockres)); /* End the line */ seq_printf(m, "\n"); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c7539601555b..fddbbd60f434 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -192,6 +192,7 @@ struct ocfs2_lock_res { #ifdef CONFIG_OCFS2_FS_STATS struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */ u32 l_lock_refresh; /* Disk refreshes */ + u64 l_lock_wait; /* First lock wait time */ struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit v1.2.3 From e581595ea29c737587bcc349420bfdacb9a6b02b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 11 Jul 2019 20:53:12 -0700 Subject: ocfs: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Also, because there is no need to save the file dentry, remove all of the variables that were being saved, and just recursively delete the whole directory when shutting down, saving a lot of logic and local variables. 
[gregkh@linuxfoundation.org: v2] Link: http://lkml.kernel.org/r/20190613055455.GE19717@kroah.com Link: http://lkml.kernel.org/r/20190612152912.GA19151@kroah.com Signed-off-by: Greg Kroah-Hartman Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: Jia Guo Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/blockcheck.c | 56 ++++++----------------- fs/ocfs2/blockcheck.h | 7 +-- fs/ocfs2/cluster/heartbeat.c | 100 +++++++++++------------------------------ fs/ocfs2/cluster/heartbeat.h | 2 +- fs/ocfs2/cluster/netdebug.c | 39 +++++----------- fs/ocfs2/cluster/nodemanager.c | 4 +- fs/ocfs2/cluster/tcp.c | 3 +- fs/ocfs2/cluster/tcp.h | 5 +-- fs/ocfs2/dlm/dlmdebug.c | 44 +++--------------- fs/ocfs2/dlm/dlmdebug.h | 10 ++--- fs/ocfs2/dlm/dlmdomain.c | 10 +---- fs/ocfs2/dlmglue.c | 25 +---------- fs/ocfs2/super.c | 29 ++---------- 13 files changed, 73 insertions(+), 261 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 005b813a56b6..429e6a8359a5 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -242,57 +242,29 @@ static struct dentry *blockcheck_debugfs_create(const char *name, static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { if (stats) { - debugfs_remove(stats->b_debug_check); - stats->b_debug_check = NULL; - debugfs_remove(stats->b_debug_failure); - stats->b_debug_failure = NULL; - debugfs_remove(stats->b_debug_recover); - stats->b_debug_recover = NULL; - debugfs_remove(stats->b_debug_dir); + debugfs_remove_recursive(stats->b_debug_dir); stats->b_debug_dir = NULL; } } -static int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent) +static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) { - int rc = -EINVAL; - - if (!stats) - goto out; - stats->b_debug_dir = debugfs_create_dir("blockcheck", 
parent); - if (!stats->b_debug_dir) - goto out; - stats->b_debug_check = - blockcheck_debugfs_create("blocks_checked", - stats->b_debug_dir, - &stats->b_check_count); + blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir, + &stats->b_check_count); - stats->b_debug_failure = - blockcheck_debugfs_create("checksums_failed", - stats->b_debug_dir, - &stats->b_failure_count); + blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir, + &stats->b_failure_count); - stats->b_debug_recover = - blockcheck_debugfs_create("ecc_recoveries", - stats->b_debug_dir, - &stats->b_recover_count); - if (stats->b_debug_check && stats->b_debug_failure && - stats->b_debug_recover) - rc = 0; - -out: - if (rc) - ocfs2_blockcheck_debug_remove(stats); - return rc; + blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir, + &stats->b_recover_count); } #else -static inline int ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent) +static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) { - return 0; } static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) @@ -301,10 +273,10 @@ static inline void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats * #endif /* CONFIG_DEBUG_FS */ /* Always-called wrappers for starting and stopping the debugfs files */ -int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent) +void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent) { - return ocfs2_blockcheck_debug_install(stats, parent); + ocfs2_blockcheck_debug_install(stats, parent); } void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats) diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h index f2d2689407fa..8f17d2c85f40 100644 --- a/fs/ocfs2/blockcheck.h +++ b/fs/ocfs2/blockcheck.h @@ -25,9 +25,6 @@ struct 
ocfs2_blockcheck_stats { * ocfs2_blockcheck_stats_debugfs_install() */ struct dentry *b_debug_dir; /* Parent of the debugfs files */ - struct dentry *b_debug_check; /* Exposes b_check_count */ - struct dentry *b_debug_failure; /* Exposes b_failure_count */ - struct dentry *b_debug_recover; /* Exposes b_recover_count */ }; @@ -56,8 +53,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, struct ocfs2_blockcheck_stats *stats); /* Debug Initialization */ -int ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, - struct dentry *parent); +void ocfs2_blockcheck_stats_debugfs_install(struct ocfs2_blockcheck_stats *stats, + struct dentry *parent); void ocfs2_blockcheck_stats_debugfs_remove(struct ocfs2_blockcheck_stats *stats); /* diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 7f74fcc6d7d9..f1b613327ac8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -92,10 +92,6 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_REGION_PINNED "pinned" static struct dentry *o2hb_debug_dir; -static struct dentry *o2hb_debug_livenodes; -static struct dentry *o2hb_debug_liveregions; -static struct dentry *o2hb_debug_quorumregions; -static struct dentry *o2hb_debug_failedregions; static LIST_HEAD(o2hb_all_regions); @@ -1391,11 +1387,7 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { - debugfs_remove(o2hb_debug_failedregions); - debugfs_remove(o2hb_debug_quorumregions); - debugfs_remove(o2hb_debug_liveregions); - debugfs_remove(o2hb_debug_livenodes); - debugfs_remove(o2hb_debug_dir); + debugfs_remove_recursive(o2hb_debug_dir); kfree(o2hb_db_livenodes); kfree(o2hb_db_liveregions); kfree(o2hb_db_quorumregions); @@ -1419,79 +1411,37 @@ static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, &o2hb_debug_fops); } -static int o2hb_debug_init(void) +static void o2hb_debug_init(void) { - int ret = -ENOMEM; - 
o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { - mlog_errno(ret); - goto bail; - } - o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, - o2hb_debug_dir, - &o2hb_db_livenodes, - sizeof(*o2hb_db_livenodes), - O2HB_DB_TYPE_LIVENODES, - sizeof(o2hb_live_node_bitmap), - O2NM_MAX_NODES, - o2hb_live_node_bitmap); - if (!o2hb_debug_livenodes) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, o2hb_debug_dir, + &o2hb_db_livenodes, sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, o2hb_live_node_bitmap); - o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, - o2hb_debug_dir, - &o2hb_db_liveregions, - sizeof(*o2hb_db_liveregions), - O2HB_DB_TYPE_LIVEREGIONS, - sizeof(o2hb_live_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_live_region_bitmap); - if (!o2hb_debug_liveregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, o2hb_debug_dir, + &o2hb_db_liveregions, sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); - o2hb_debug_quorumregions = - o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, - o2hb_debug_dir, - &o2hb_db_quorumregions, - sizeof(*o2hb_db_quorumregions), - O2HB_DB_TYPE_QUORUMREGIONS, - sizeof(o2hb_quorum_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_quorum_region_bitmap); - if (!o2hb_debug_quorumregions) { - mlog_errno(ret); - goto bail; - } - - o2hb_debug_failedregions = - o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, - o2hb_debug_dir, - &o2hb_db_failedregions, - sizeof(*o2hb_db_failedregions), - O2HB_DB_TYPE_FAILEDREGIONS, - sizeof(o2hb_failed_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_failed_region_bitmap); - if (!o2hb_debug_failedregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + 
O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); - ret = 0; -bail: - if (ret) - o2hb_exit(); - - return ret; + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); } -int o2hb_init(void) +void o2hb_init(void) { int i; @@ -1511,7 +1461,7 @@ int o2hb_init(void) o2hb_dependent_users = 0; - return o2hb_debug_init(); + o2hb_debug_init(); } /* if we're already in a callback then we're already serialized by the sem */ diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 7f37540ac4ab..beed31ea86cf 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -63,7 +63,7 @@ void o2hb_unregister_callback(const char *region_uuid, void o2hb_fill_node_map(unsigned long *map, unsigned bytes); void o2hb_exit(void); -int o2hb_init(void); +void o2hb_init(void); int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); void o2hb_stop_all_regions(void); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 0784575f4c2a..02bf4a1774cc 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -38,10 +38,6 @@ #define SHOW_SOCK_STATS 1 static struct dentry *o2net_dentry; -static struct dentry *sc_dentry; -static struct dentry *nst_dentry; -static struct dentry *stats_dentry; -static struct dentry *nodes_dentry; static DEFINE_SPINLOCK(o2net_debug_lock); @@ -490,36 +486,23 @@ static const struct file_operations nodes_fops = { void o2net_debugfs_exit(void) { - debugfs_remove(nodes_dentry); - debugfs_remove(stats_dentry); - debugfs_remove(sc_dentry); - debugfs_remove(nst_dentry); - debugfs_remove(o2net_dentry); + debugfs_remove_recursive(o2net_dentry); } -int o2net_debugfs_init(void) +void 
o2net_debugfs_init(void) { umode_t mode = S_IFREG|S_IRUSR; o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); - if (o2net_dentry) - nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, - o2net_dentry, NULL, &nst_seq_fops); - if (nst_dentry) - sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, - o2net_dentry, NULL, &sc_seq_fops); - if (sc_dentry) - stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, - o2net_dentry, NULL, &stats_seq_fops); - if (stats_dentry) - nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, - o2net_dentry, NULL, &nodes_fops); - if (nodes_dentry) - return 0; - - o2net_debugfs_exit(); - mlog_errno(-ENOMEM); - return -ENOMEM; + + debugfs_create_file(NST_DEBUG_NAME, mode, o2net_dentry, NULL, + &nst_seq_fops); + debugfs_create_file(SC_DEBUG_NAME, mode, o2net_dentry, NULL, + &sc_seq_fops); + debugfs_create_file(STATS_DEBUG_NAME, mode, o2net_dentry, NULL, + &stats_seq_fops); + debugfs_create_file(NODES_DEBUG_NAME, mode, o2net_dentry, NULL, + &nodes_fops); } #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 2234f7fd1f7c..7a7640c59f3c 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -828,9 +828,7 @@ static int __init init_o2nm(void) { int ret = -1; - ret = o2hb_init(); - if (ret) - goto out; + o2hb_init(); ret = o2net_init(); if (ret) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index c07c9aac537a..48a3398f0bf5 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -2129,8 +2129,7 @@ int o2net_init(void) o2quo_init(); - if (o2net_debugfs_init()) - goto out; + o2net_debugfs_init(); o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index dd4242be3f1f..de87cbffd175 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -109,16 +109,15 @@ struct 
o2net_send_tracking; struct o2net_sock_container; #ifdef CONFIG_DEBUG_FS -int o2net_debugfs_init(void); +void o2net_debugfs_init(void); void o2net_debugfs_exit(void); void o2net_debug_add_nst(struct o2net_send_tracking *nst); void o2net_debug_del_nst(struct o2net_send_tracking *nst); void o2net_debug_add_sc(struct o2net_sock_container *sc); void o2net_debug_del_sc(struct o2net_sock_container *sc); #else -static inline int o2net_debugfs_init(void) +static inline void o2net_debugfs_init(void) { - return 0; } static inline void o2net_debugfs_exit(void) { diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c8af5bc9e980..a4b58ba99927 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -851,7 +851,7 @@ static const struct file_operations debug_state_fops = { /* end - debug state funcs */ /* files in subroot */ -int dlm_debug_init(struct dlm_ctxt *dlm) +void dlm_debug_init(struct dlm_ctxt *dlm) { struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; @@ -860,10 +860,6 @@ int dlm_debug_init(struct dlm_ctxt *dlm) S_IFREG|S_IRUSR, dlm->dlm_debugfs_subroot, dlm, &debug_state_fops); - if (!dc->debug_state_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } /* for dumping lockres */ dc->debug_lockres_dentry = @@ -871,20 +867,12 @@ int dlm_debug_init(struct dlm_ctxt *dlm) S_IFREG|S_IRUSR, dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops); - if (!dc->debug_lockres_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } /* for dumping mles */ dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR, dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops); - if (!dc->debug_mle_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } /* for dumping lockres on the purge list */ dc->debug_purgelist_dentry = @@ -892,15 +880,6 @@ int dlm_debug_init(struct dlm_ctxt *dlm) S_IFREG|S_IRUSR, dlm->dlm_debugfs_subroot, dlm, &debug_purgelist_fops); - if (!dc->debug_purgelist_dentry) { - mlog_errno(-ENOMEM); - goto bail; - } - - return 0; - -bail: - return 
-ENOMEM; } void dlm_debug_shutdown(struct dlm_ctxt *dlm) @@ -920,24 +899,16 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm) /* subroot - domain dir */ int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { - dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, - dlm_debugfs_root); - if (!dlm->dlm_debugfs_subroot) { - mlog_errno(-ENOMEM); - goto bail; - } - dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), GFP_KERNEL); if (!dlm->dlm_debug_ctxt) { mlog_errno(-ENOMEM); - goto bail; + return -ENOMEM; } + dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, + dlm_debugfs_root); return 0; -bail: - dlm_destroy_debugfs_subroot(dlm); - return -ENOMEM; } void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) @@ -946,14 +917,9 @@ void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) } /* debugfs root */ -int dlm_create_debugfs_root(void) +void dlm_create_debugfs_root(void) { dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL); - if (!dlm_debugfs_root) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - return 0; } void dlm_destroy_debugfs_root(void) diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 74d019694c7e..7d0c7c9013ce 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -28,20 +28,19 @@ struct debug_lockres { struct dlm_lock_resource *dl_res; }; -int dlm_debug_init(struct dlm_ctxt *dlm); +void dlm_debug_init(struct dlm_ctxt *dlm); void dlm_debug_shutdown(struct dlm_ctxt *dlm); int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); -int dlm_create_debugfs_root(void); +void dlm_create_debugfs_root(void); void dlm_destroy_debugfs_root(void); #else -static inline int dlm_debug_init(struct dlm_ctxt *dlm) +static inline void dlm_debug_init(struct dlm_ctxt *dlm) { - return 0; } static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) { @@ -53,9 +52,8 @@ static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) static inline void 
dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { } -static inline int dlm_create_debugfs_root(void) +static inline void dlm_create_debugfs_root(void) { - return 0; } static inline void dlm_destroy_debugfs_root(void) { diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9021e72e1f98..7338b5d4647c 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1881,11 +1881,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - status = dlm_debug_init(dlm); - if (status < 0) { - mlog_errno(status); - goto bail; - } + dlm_debug_init(dlm); snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); dlm->dlm_worker = alloc_workqueue(wq_name, WQ_MEM_RECLAIM, 0); @@ -2346,9 +2342,7 @@ static int __init dlm_init(void) goto error; } - status = dlm_create_debugfs_root(); - if (status) - goto error; + dlm_create_debugfs_root(); return 0; error: diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index cf90688ff2ed..dc987f56c2ea 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3278,9 +3278,8 @@ static const struct file_operations ocfs2_dlm_debug_fops = { .llseek = seq_lseek, }; -static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) +static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) { - int ret = 0; struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; dlm_debug->d_locking_state = debugfs_create_file("locking_state", @@ -3288,27 +3287,11 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); - if (!dlm_debug->d_locking_state) { - ret = -EINVAL; - mlog(ML_ERROR, - "Unable to create locking state debugfs file.\n"); - goto out; - } dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, &dlm_debug->d_filter_secs); - if (!dlm_debug->d_locking_filter) { - ret = -EINVAL; - mlog(ML_ERROR, - "Unable to create locking filter debugfs file.\n"); - goto out; - } - - ocfs2_get_dlm_debug(dlm_debug); -out: - return ret; } static 
void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) @@ -3332,11 +3315,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) goto local; } - status = ocfs2_dlm_init_debug(osb); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_dlm_init_debug(osb); /* launch downconvert thread */ osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s", diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a201f9780b35..8b2f39506648 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1079,33 +1079,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - if (!osb->osb_debug_root) { - status = -EINVAL; - mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); - goto read_super_error; - } osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, osb, &ocfs2_osb_debug_fops); - if (!osb->osb_ctxt) { - status = -EINVAL; - mlog_errno(status); - goto read_super_error; - } - if (ocfs2_meta_ecc(osb)) { - status = ocfs2_blockcheck_stats_debugfs_install( - &osb->osb_ecc_stats, - osb->osb_debug_root); - if (status) { - mlog(ML_ERROR, - "Unable to create blockcheck statistics " - "files\n"); - goto read_super_error; - } - } + if (ocfs2_meta_ecc(osb)) + ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, + osb->osb_debug_root); status = ocfs2_mount_volume(sb); if (status < 0) @@ -1592,11 +1574,6 @@ static int __init ocfs2_init(void) goto out2; ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); - if (!ocfs2_debugfs_root) { - status = -ENOMEM; - mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); - goto out3; - } ocfs2_set_locking_protocol(); -- cgit v1.2.3 From 4658d87cb38cb3ab3a234d1b8d63f65df4cce62b Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Thu, 11 Jul 2019 20:53:16 -0700 Subject: fs/ocfs2/dlmglue.c: unneeded variable: "status" fix below issue reported by coccicheck fs/ocfs2/dlmglue.c:4410:5-11: 
Unneeded variable: "status". Return "0" on line 4428 We cannot change the return type of ocfs2_downconvert_thread as it is registered as a callback of kthread_create. Link: http://lkml.kernel.org/r/20190702183237.GA13975@hari-Inspiron-1545 Signed-off-by: Hariprasad Kelam Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index dc987f56c2ea..14207234fa3d 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -4407,7 +4407,6 @@ static int ocfs2_downconvert_thread_should_wake(struct ocfs2_super *osb) static int ocfs2_downconvert_thread(void *arg) { - int status = 0; struct ocfs2_super *osb = arg; /* only quit once we've been asked to stop and there is no more @@ -4425,7 +4424,7 @@ static int ocfs2_downconvert_thread(void *arg) } osb->dc_task = NULL; - return status; + return 0; } void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb) -- cgit v1.2.3 From d8b2fa657deaa73ff70d40aea9a54997fc0c7fc9 Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Thu, 11 Jul 2019 20:53:19 -0700 Subject: ocfs2: use kmemdup rather than duplicating its implementation kmemdup is introduced to duplicate a region of memory in a neat way. Unlike kmalloc/kzalloc + memcpy, where the programmer needs to write the size twice (which sometimes leads to mistakes), kmemdup improves readability, leads to smaller code and also reduces the chances of mistakes. The suggestion is to use kmemdup rather than kmalloc/kzalloc + memcpy. 
[akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190703163147.881-1-huangfq.daxian@gmail.com Signed-off-by: Fuqian Huang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 10 +++++----- fs/ocfs2/localalloc.c | 6 ++---- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d1348fc4ca6d..0c335b51043d 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6191,17 +6191,17 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, if (le16_to_cpu(tl->tl_used)) { trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used)); - *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + /* + * Assuming the write-out below goes well, this copy will be + * passed back to recovery for processing. + */ + *tl_copy = kmemdup(tl_bh->b_data, tl_bh->b_size, GFP_KERNEL); if (!(*tl_copy)) { status = -ENOMEM; mlog_errno(status); goto bail; } - /* Assuming the write-out below goes well, this copy - * will be passed back to recovery for processing. */ - memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); - /* All we need to do to clear the truncate log is set * tl_used. 
*/ tl->tl_used = 0; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index f03674afbd30..158e5af767fd 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -424,12 +424,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) bh = osb->local_alloc_bh; alloc = (struct ocfs2_dinode *) bh->b_data; - alloc_copy = kmalloc(bh->b_size, GFP_NOFS); + alloc_copy = kmemdup(alloc, bh->b_size, GFP_NOFS); if (!alloc_copy) { status = -ENOMEM; goto out_commit; } - memcpy(alloc_copy, alloc, bh->b_size); status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), bh, OCFS2_JOURNAL_ACCESS_WRITE); @@ -1272,13 +1271,12 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, * local alloc shutdown won't try to double free main bitmap * bits. Make a copy so the sync function knows which bits to * free. */ - alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS); + alloc_copy = kmemdup(alloc, osb->local_alloc_bh->b_size, GFP_NOFS); if (!alloc_copy) { status = -ENOMEM; mlog_errno(status); goto bail; } - memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode), -- cgit v1.2.3 From 265de8ce3d5b3c70644a1a45457580bb07548b56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jul 2019 20:55:23 -0700 Subject: jffs2: pass the correct prototype to read_cache_page Fix the callback jffs2 passes to read_cache_page to actually have the proper type expected. Casting around function pointers can easily hide typing bugs, and defeats control flow protection. 
Link: http://lkml.kernel.org/r/20190520055731.24538-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Nick Desaulniers Cc: Sami Tolvanen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs2/file.c | 4 ++-- fs/jffs2/fs.c | 2 +- fs/jffs2/os-linux.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 7d8654a1472e..f8fb89b10227 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -109,9 +109,9 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) return ret; } -int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) +int jffs2_do_readpage_unlock(void *data, struct page *pg) { - int ret = jffs2_do_readpage_nolock(inode, pg); + int ret = jffs2_do_readpage_nolock(data, pg); unlock_page(pg); return ret; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 112d85849db1..8a20ddd25f2d 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -687,7 +687,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, struct page *pg; pg = read_cache_page(inode->i_mapping, offset >> PAGE_SHIFT, - (void *)jffs2_do_readpage_unlock, inode); + jffs2_do_readpage_unlock, inode); if (IS_ERR(pg)) return (void *)pg; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index a2dbbb3f4c74..bd3d5f0ddc34 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -155,7 +155,7 @@ extern const struct file_operations jffs2_file_operations; extern const struct inode_operations jffs2_file_inode_operations; extern const struct address_space_operations jffs2_file_address_operations; int jffs2_fsync(struct file *, loff_t, loff_t, int); -int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); +int jffs2_do_readpage_unlock(void *data, struct page *pg); /* ioctl.c */ long jffs2_ioctl(struct file *, unsigned int, unsigned long); -- cgit v1.2.3 From f053cbd4366051d7eb6ba1b8d529d20f719c2963 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: 
Thu, 11 Jul 2019 20:55:26 -0700 Subject: 9p: pass the correct prototype to read_cache_page Fix the callback 9p passes to read_cache_page to actually have the proper type expected. Casting around function pointers can easily hide typing bugs, and defeats control flow protection. Link: http://lkml.kernel.org/r/20190520055731.24538-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Kees Cook Cc: Sami Tolvanen Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_addr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index bc57ae9e2963..cce9ace651a2 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -35,8 +35,9 @@ * @page: structure to page * */ -static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) +static int v9fs_fid_readpage(void *data, struct page *page) { + struct p9_fid *fid = data; struct inode *inode = page->mapping->host; struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE}; struct iov_iter to; @@ -107,7 +108,8 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) return ret; - ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp); + ret = read_cache_pages(mapping, pages, v9fs_fid_readpage, + filp->private_data); p9_debug(P9_DEBUG_VFS, " = %d\n", ret); return ret; } -- cgit v1.2.3 From ec165450968b26298bd1c373de37b0ab6d826b33 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 20:55:52 -0700 Subject: memcg, fsnotify: no oom-kill for remote memcg charging Commit d46eb14b735b ("fs: fsnotify: account fsnotify metadata to kmemcg") added remote memcg charging for fanotify and inotify event objects. The aim was to charge the memory to the listener who is interested in the events but without triggering the OOM killer. Otherwise there would be security concerns for the listener. At the time, oom-kill trigger was not in the charging path. 
Parallel work added the oom-kill back to the charging path, i.e. commit 29ef680ae7c2 ("memcg, oom: move out_of_memory back to the charge path"). So to not trigger oom-killer in the remote memcg, explicitly add __GFP_RETRY_MAYFAIL to the fanotify and inotify event allocations. Link: http://lkml.kernel.org/r/20190514212259.156585-2-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Jan Kara Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Michal Hocko Cc: Amir Goldstein Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify.c | 5 ++++- fs/notify/inotify/inotify_fsnotify.c | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index b428c295d13f..5778d1347b35 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -288,10 +288,13 @@ struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, /* * For queues with unlimited length lost events are not expected and * can possibly have security implications. Avoid losing events when - * memory is short. + * memory is short. For the limited size queues, avoid OOM killer in the + * target monitoring memcg as it may have security repercussion. */ if (group->max_events == UINT_MAX) gfp |= __GFP_NOFAIL; + else + gfp |= __GFP_RETRY_MAYFAIL; /* Whoever is interested in the event, pays for the allocation. */ memalloc_use_memcg(group->memcg); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 2fda08b2b885..d510223d302c 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -90,9 +90,13 @@ int inotify_handle_event(struct fsnotify_group *group, i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); - /* Whoever is interested in the event, pays for the allocation. 
*/ + /* + * Whoever is interested in the event, pays for the allocation. Do not + * trigger OOM killer in the target monitoring memcg as it may have + * security repercussion. + */ memalloc_use_memcg(group->memcg); - event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT); + event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); memalloc_unuse_memcg(); if (unlikely(!event)) { -- cgit v1.2.3 From 8a713e7df3352b8d9392476e9cf29e4e185dac32 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 20:59:50 -0700 Subject: proc: use down_read_killable mmap_sem for /proc/pid/maps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. This function is also used for /proc/pid/smaps. Link: http://lkml.kernel.org/r/156007493160.3335.14447544314127417266.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 6 +++++- fs/proc/task_nommu.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 01d4eb0e6bd1..2bf210229daf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -166,7 +166,11 @@ static void *m_start(struct seq_file *m, loff_t *ppos) if (!mm || !mmget_not_zero(mm)) return NULL; - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) { + mmput(mm); + return ERR_PTR(-EINTR); + } + hold_task_mempolicy(priv); priv->tail_vma = get_gate_vma(mm); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 36bf0f2e102e..7907e6419e57 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -211,7 +211,11 @@ 
static void *m_start(struct seq_file *m, loff_t *pos) if (!mm || !mmget_not_zero(mm)) return NULL; - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) { + mmput(mm); + return ERR_PTR(-EINTR); + } + /* start from the Nth VMA */ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) if (n-- == 0) -- cgit v1.2.3 From a26a97815548574213fd37f29b4b78ccc6d9ed20 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 20:59:53 -0700 Subject: proc: use down_read_killable mmap_sem for /proc/pid/smaps_rollup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. Link: http://lkml.kernel.org/r/156007493429.3335.14666825072272692455.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2bf210229daf..781879a91e3b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -832,7 +832,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v) memset(&mss, 0, sizeof(mss)); - down_read(&mm->mmap_sem); + ret = down_read_killable(&mm->mmap_sem); + if (ret) + goto out_put_mm; + hold_task_mempolicy(priv); for (vma = priv->mm->mmap; vma; vma = vma->vm_next) { @@ -849,8 +852,9 @@ static int show_smaps_rollup(struct seq_file *m, void *v) release_task_mempolicy(priv); up_read(&mm->mmap_sem); - mmput(mm); +out_put_mm: + mmput(mm); out_put_task: put_task_struct(priv->task); priv->task = NULL; -- cgit v1.2.3 From ad80b932c57d85fd6377f97f359b025baf179a87 Mon Sep 17 00:00:00 2001 From: 
Konstantin Khlebnikov Date: Thu, 11 Jul 2019 20:59:56 -0700 Subject: proc: use down_read_killable mmap_sem for /proc/pid/pagemap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. Link: http://lkml.kernel.org/r/156007493638.3335.4872164955523928492.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 781879a91e3b..78bed6adc62d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1547,7 +1547,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; - down_read(&mm->mmap_sem); + ret = down_read_killable(&mm->mmap_sem); + if (ret) + goto out_free; ret = walk_page_range(start_vaddr, end, &pagemap_walk); up_read(&mm->mmap_sem); start_vaddr = end; -- cgit v1.2.3 From c46038017fbdcac627b670c9d4176f1d0c2f5fa3 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 21:00:00 -0700 Subject: proc: use down_read_killable mmap_sem for /proc/pid/clear_refs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. Replace the only unkillable mmap_sem lock in clear_refs_write(). 
Link: http://lkml.kernel.org/r/156007493826.3335.5424884725467456239.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 78bed6adc62d..7f84d1477b5b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1140,7 +1140,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, goto out_mm; } - down_read(&mm->mmap_sem); + if (down_read_killable(&mm->mmap_sem)) { + count = -EINTR; + goto out_mm; + } tlb_gather_mmu(&tlb, mm, 0, -1); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { -- cgit v1.2.3 From cd9e2bb8271c971d9f37c722be2616c7f8ba0664 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 11 Jul 2019 21:00:03 -0700 Subject: proc: use down_read_killable mmap_sem for /proc/pid/map_files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not remain stuck forever if something goes wrong. Using a killable lock permits cleanup of stuck tasks and simplifies investigation. It seems ->d_revalidate() could return any error (except ECHILD) to abort validation and pass error as result of lookup sequence. 
[akpm@linux-foundation.org: fix proc_map_files_lookup() return value, per Andrei] Link: http://lkml.kernel.org/r/156007493995.3335.9595044802115356911.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Roman Gushchin Reviewed-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Al Viro Cc: Matthew Wilcox Cc: Michal Koutný Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index c40fca98f2b7..534fb1ae498a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1962,9 +1962,12 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { - down_read(&mm->mmap_sem); - exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); - up_read(&mm->mmap_sem); + status = down_read_killable(&mm->mmap_sem); + if (!status) { + exact_vma_exists = !!find_exact_vma(mm, vm_start, + vm_end); + up_read(&mm->mmap_sem); + } } mmput(mm); @@ -2010,8 +2013,11 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) if (rc) goto out_mmput; + rc = down_read_killable(&mm->mmap_sem); + if (rc) + goto out_mmput; + rc = -ENOENT; - down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { *path = vma->vm_file->f_path; @@ -2107,7 +2113,11 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, if (!mm) goto out_put_task; - down_read(&mm->mmap_sem); + result = ERR_PTR(-EINTR); + if (down_read_killable(&mm->mmap_sem)) + goto out_put_mm; + + result = ERR_PTR(-ENOENT); vma = find_exact_vma(mm, vm_start, vm_end); if (!vma) goto out_no_vma; @@ -2118,6 +2128,7 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, out_no_vma: up_read(&mm->mmap_sem); +out_put_mm: mmput(mm); out_put_task: put_task_struct(task); @@ 
-2160,7 +2171,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) mm = get_task_mm(task); if (!mm) goto out_put_task; - down_read(&mm->mmap_sem); + + ret = down_read_killable(&mm->mmap_sem); + if (ret) { + mmput(mm); + goto out_put_task; + } nr_files = 0; -- cgit v1.2.3 From ee2ad71b0756e995fa4f6d922463e9bccd71b198 Mon Sep 17 00:00:00 2001 From: Luigi Semenzato Date: Thu, 11 Jul 2019 21:00:10 -0700 Subject: mm: smaps: split PSS into components Report separate components (anon, file, and shmem) for PSS in smaps_rollup. This helps understand and tune the memory manager behavior in consumer devices, particularly mobile devices. Many of them (e.g. chromebooks and Android-based devices) use zram for anon memory, and perform disk reads for discarded file pages. The difference in latency is large (e.g. reading a single page from SSD is 30 times slower than decompressing a zram page on one popular device), thus it is useful to know how much of the PSS is anon vs. file. All the information is already present in /proc/pid/smaps, but much more expensive to obtain because of the large size of that procfs entry. This patch also removes a small code duplication in smaps_account, which would have gotten worse otherwise. Also updated Documentation/filesystems/proc.txt (the smaps section was a bit stale, and I added a smaps_rollup section) and Documentation/ABI/testing/procfs-smaps_rollup. 
[semenzato@chromium.org: v5] Link: http://lkml.kernel.org/r/20190626234333.44608-1-semenzato@chromium.org Link: http://lkml.kernel.org/r/20190626180429.174569-1-semenzato@chromium.org Signed-off-by: Luigi Semenzato Acked-by: Yu Zhao Cc: Sonny Rao Cc: Yu Zhao Cc: Brian Geffon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/procfs-smaps_rollup | 14 +++- Documentation/filesystems/proc.txt | 41 +++++++++--- fs/proc/task_mmu.c | 92 ++++++++++++++++++--------- 3 files changed, 105 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup index 0a54ed0d63c9..274df44d8b1b 100644 --- a/Documentation/ABI/testing/procfs-smaps_rollup +++ b/Documentation/ABI/testing/procfs-smaps_rollup @@ -3,18 +3,28 @@ Date: August 2017 Contact: Daniel Colascione Description: This file provides pre-summed memory information for a - process. The format is identical to /proc/pid/smaps, + process. The format is almost identical to /proc/pid/smaps, except instead of an entry for each VMA in a process, smaps_rollup has a single entry (tagged "[rollup]") for which each field is the sum of the corresponding fields from all the maps in /proc/pid/smaps. - For more details, see the procfs man page. + Additionally, the fields Pss_Anon, Pss_File and Pss_Shmem + are not present in /proc/pid/smaps. These fields represent + the sum of the Pss field of each type (anon, file, shmem). + For more details, see Documentation/filesystems/proc.txt + and the procfs man page. 
Typical output looks like this: 00100000-ff709000 ---p 00000000 00:00 0 [rollup] + Size: 1192 kB + KernelPageSize: 4 kB + MMUPageSize: 4 kB Rss: 884 kB Pss: 385 kB + Pss_Anon: 301 kB + Pss_File: 80 kB + Pss_Shmem: 4 kB Shared_Clean: 696 kB Shared_Dirty: 0 kB Private_Clean: 120 kB diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a226061fa109..d750b6926899 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -154,9 +154,11 @@ Table 1-1: Process specific entries in /proc symbol the task is blocked in - or "0" if not blocked. pagemap Page table stack Report full stack trace, enable via CONFIG_STACKTRACE - smaps an extension based on maps, showing the memory consumption of + smaps An extension based on maps, showing the memory consumption of each mapping and flags associated with it - numa_maps an extension based on maps, showing the memory locality and + smaps_rollup Accumulated smaps stats for all mappings of the process. This + can be derived from smaps, but is faster and more convenient + numa_maps An extension based on maps, showing the memory locality and binding policy as well as mem usage (in pages) of each mapping. .............................................................................. @@ -366,7 +368,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7) exit_code the thread's exit_code in the form reported by the waitpid system call .............................................................................. -The /proc/PID/maps file containing the currently mapped memory regions and +The /proc/PID/maps file contains the currently mapped memory regions and their access permissions. The format is: @@ -417,11 +419,14 @@ is not associated with a file: or if empty, the mapping is anonymous. The /proc/PID/smaps is an extension based on maps, showing the memory -consumption for each of the process's mappings. 
For each of mappings there -is a series of lines such as the following: +consumption for each of the process's mappings. For each mapping (aka Virtual +Memory Area, or VMA) there is a series of lines such as the following: 08048000-080bc000 r-xp 00000000 03:02 13130 /bin/bash + Size: 1084 kB +KernelPageSize: 4 kB +MMUPageSize: 4 kB Rss: 892 kB Pss: 374 kB Shared_Clean: 892 kB @@ -443,11 +448,14 @@ Locked: 0 kB THPeligible: 0 VmFlags: rd ex mr mw me dw -the first of these lines shows the same information as is displayed for the -mapping in /proc/PID/maps. The remaining lines show the size of the mapping -(size), the amount of the mapping that is currently resident in RAM (RSS), the -process' proportional share of this mapping (PSS), the number of clean and -dirty private pages in the mapping. +The first of these lines shows the same information as is displayed for the +mapping in /proc/PID/maps. Following lines show the size of the mapping +(size); the size of each page allocated when backing a VMA (KernelPageSize), +which is usually the same as the size in the page table entries; the page size +used by the MMU when backing a VMA (in most cases, the same as KernelPageSize); +the amount of the mapping that is currently resident in RAM (RSS); the +process' proportional share of this mapping (PSS); and the number of clean and +dirty shared and private pages in the mapping. The "proportional set size" (PSS) of a process is the count of pages it has in memory, where each page is divided by the number of processes sharing it. @@ -532,6 +540,19 @@ guarantees: 2) If there is something at a given vaddr during the entirety of the life of the smaps/maps walk, there will be some output for it. +The /proc/PID/smaps_rollup file includes the same fields as /proc/PID/smaps, +but their values are the sums of the corresponding values for all mappings of +the process. 
Additionally, it contains these fields: + +Pss_Anon +Pss_File +Pss_Shmem + +They represent the proportional shares of anonymous, file, and shmem pages, as +described for smaps above. These fields are omitted in smaps since each +mapping identifies the type (anon, file, or shmem) of all pages it contains. +Thus all information in smaps_rollup can be derived from smaps, but at a +significantly higher cost. The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG bits on both physical and virtual pages associated with a process, and the diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7f84d1477b5b..dedca3da428a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -421,17 +421,53 @@ struct mem_size_stats { unsigned long shared_hugetlb; unsigned long private_hugetlb; u64 pss; + u64 pss_anon; + u64 pss_file; + u64 pss_shmem; u64 pss_locked; u64 swap_pss; bool check_shmem_swap; }; +static void smaps_page_accumulate(struct mem_size_stats *mss, + struct page *page, unsigned long size, unsigned long pss, + bool dirty, bool locked, bool private) +{ + mss->pss += pss; + + if (PageAnon(page)) + mss->pss_anon += pss; + else if (PageSwapBacked(page)) + mss->pss_shmem += pss; + else + mss->pss_file += pss; + + if (locked) + mss->pss_locked += pss; + + if (dirty || PageDirty(page)) { + if (private) + mss->private_dirty += size; + else + mss->shared_dirty += size; + } else { + if (private) + mss->private_clean += size; + else + mss->shared_clean += size; + } +} + static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked) { int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; + /* + * First accumulate quantities that depend only on |size| and the type + * of the compound page. 
+ */ if (PageAnon(page)) { mss->anonymous += size; if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) @@ -444,42 +480,25 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, mss->referenced += size; /* + * Then accumulate quantities that may depend on sharing, or that may + * differ page-by-page. + * * page_count(page) == 1 guarantees the page is mapped exactly once. * If any subpage of the compound page mapped with PTE it would elevate * page_count(). */ if (page_count(page) == 1) { - if (dirty || PageDirty(page)) - mss->private_dirty += size; - else - mss->private_clean += size; - mss->pss += (u64)size << PSS_SHIFT; - if (locked) - mss->pss_locked += (u64)size << PSS_SHIFT; + smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty, + locked, true); return; } - for (i = 0; i < nr; i++, page++) { int mapcount = page_mapcount(page); - unsigned long pss = (PAGE_SIZE << PSS_SHIFT); - - if (mapcount >= 2) { - if (dirty || PageDirty(page)) - mss->shared_dirty += PAGE_SIZE; - else - mss->shared_clean += PAGE_SIZE; - mss->pss += pss / mapcount; - if (locked) - mss->pss_locked += pss / mapcount; - } else { - if (dirty || PageDirty(page)) - mss->private_dirty += PAGE_SIZE; - else - mss->private_clean += PAGE_SIZE; - mss->pss += pss; - if (locked) - mss->pss_locked += pss; - } + unsigned long pss = PAGE_SIZE << PSS_SHIFT; + if (mapcount >= 2) + pss /= mapcount; + smaps_page_accumulate(mss, page, PAGE_SIZE, pss, dirty, locked, + mapcount < 2); } } @@ -758,10 +777,23 @@ static void smap_gather_stats(struct vm_area_struct *vma, seq_put_decimal_ull_width(m, str, (val) >> 10, 8) /* Show the contents common for smaps and smaps_rollup */ -static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss) +static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, + bool rollup_mode) { SEQ_PUT_DEC("Rss: ", mss->resident); SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + if (rollup_mode) { + /* + * These are 
meaningful only for smaps_rollup, otherwise two of + * them are zero, and the other one is the same as Pss. + */ + SEQ_PUT_DEC(" kB\nPss_Anon: ", + mss->pss_anon >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_File: ", + mss->pss_file >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_Shmem: ", + mss->pss_shmem >> PSS_SHIFT); + } SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); @@ -798,7 +830,7 @@ static int show_smap(struct seq_file *m, void *v) SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); seq_puts(m, " kB\n"); - __show_smap(m, &mss); + __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", transparent_hugepage_enabled(vma)); @@ -848,7 +880,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); - __show_smap(m, &mss); + __show_smap(m, &mss, true); release_task_mempolicy(priv); up_read(&mm->mmap_sem); -- cgit v1.2.3 From 97105f0ab7b877a8ece2005e214894e93793950c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 11 Jul 2019 21:00:13 -0700 Subject: mm: vmalloc: show number of vmalloc pages in /proc/meminfo Vmalloc() is getting more and more used these days (kernel stacks, bpf and percpu allocator are new top users), and the total % of memory consumed by vmalloc() can be pretty significant and changes dynamically. /proc/meminfo is the best place to display this information: its top goal is to show top consumers of the memory. Since the VmallocUsed field in /proc/meminfo is not in use for quite a long time (it has been defined to 0 by a5ad88ce8c7f ("mm: get rid of 'vmalloc_info' from /proc/meminfo")), let's reuse it for showing the actual physical memory consumption of vmalloc(). 
Link: http://lkml.kernel.org/r/20190417194002.12369-3-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Matthew Wilcox Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- include/linux/vmalloc.h | 2 ++ mm/vmalloc.c | 10 ++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 568d90e17c17..465ea0153b2a 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); - show_val_kb(m, "VmallocUsed: ", 0ul); + show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 51e131245379..9b21d0047710 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -72,10 +72,12 @@ extern void vm_unmap_aliases(void); #ifdef CONFIG_MMU extern void __init vmalloc_init(void); +extern unsigned long vmalloc_nr_pages(void); #else static inline void vmalloc_init(void) { } +static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif extern void *vmalloc(unsigned long size); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index edb212298c8a..4fa8d84599b0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -406,6 +406,13 @@ static void purge_vmap_area_lazy(void); static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); static unsigned long lazy_max_pages(void); +static atomic_long_t nr_vmalloc_pages; + +unsigned long vmalloc_nr_pages(void) +{ + return atomic_long_read(&nr_vmalloc_pages); +} + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -2237,6 +2244,7 @@ static void __vunmap(const void 
*addr, int deallocate_pages) BUG_ON(!page); __free_pages(page, 0); } + atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); kvfree(area->pages); } @@ -2414,12 +2422,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; } area->pages[i] = page; if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); } + atomic_long_add(area->nr_pages, &nr_vmalloc_pages); if (map_vm_area(area, prot, pages)) goto fail; -- cgit v1.2.3 From 6ba749ee78ef42ffdf4b95c042fc574a37d229d9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 21:00:26 -0700 Subject: mm, oom: remove redundant task_in_mem_cgroup() check oom_unkillable_task() can be called from three different contexts i.e. global OOM, memcg OOM and oom_score procfs interface. At the moment oom_unkillable_task() does a task_in_mem_cgroup() check on the given process. Since there is no reason to perform task_in_mem_cgroup() check for global OOM and oom_score procfs interface, those contexts provide NULL memcg and skips the task_in_mem_cgroup() check. However for memcg OOM context, the oom_unkillable_task() is always called from mem_cgroup_scan_tasks() and thus task_in_mem_cgroup() check becomes redundant and effectively dead code. So, just remove the task_in_mem_cgroup() check altogether. 
Link: http://lkml.kernel.org/r/20190624212631.87212-2-shakeelb@google.com Signed-off-by: Shakeel Butt Signed-off-by: Tetsuo Handa Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Jackson Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- include/linux/memcontrol.h | 7 ------- include/linux/oom.h | 2 +- mm/memcontrol.c | 26 -------------------------- mm/oom_kill.c | 19 +++++++------------ 5 files changed, 9 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 534fb1ae498a..64dadd469786 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -532,7 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; - points = oom_badness(task, NULL, NULL, totalpages) * + points = oom_badness(task, NULL, totalpages) * 1000 / totalpages; seq_printf(m, "%lu\n", points); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 68402842c337..44c41462be33 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -394,7 +394,6 @@ out: struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); -bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); @@ -875,12 +874,6 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return true; } -static inline bool task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) -{ - return true; -} - static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; diff --git a/include/linux/oom.h b/include/linux/oom.h index d07992009265..b75104690311 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h 
@@ -108,7 +108,7 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, - struct mem_cgroup *memcg, const nodemask_t *nodemask, + const nodemask_t *nodemask, unsigned long totalpages); extern bool out_of_memory(struct oom_control *oc); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 773ae5674e12..4f05735b02d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1259,32 +1259,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, *lru_size += nr_pages; } -bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) -{ - struct mem_cgroup *task_memcg; - struct task_struct *p; - bool ret; - - p = find_lock_task_mm(task); - if (p) { - task_memcg = get_mem_cgroup_from_mm(p->mm); - task_unlock(p); - } else { - /* - * All threads may have already detached their mm's, but the oom - * killer still needs to detect if they have already been oom - * killed to prevent needlessly killing additional tasks. - */ - rcu_read_lock(); - task_memcg = mem_cgroup_from_task(task); - css_get(&task_memcg->css); - rcu_read_unlock(); - } - ret = mem_cgroup_is_descendant(task_memcg, memcg); - css_put(&task_memcg->css); - return ret; -} - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 59326614508a..b353f468a36a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -153,17 +153,13 @@ static inline bool is_memcg_oom(struct oom_control *oc) /* return true if the task is not adequate as candidate victim task. 
*/ static bool oom_unkillable_task(struct task_struct *p, - struct mem_cgroup *memcg, const nodemask_t *nodemask) + const nodemask_t *nodemask) { if (is_global_init(p)) return true; if (p->flags & PF_KTHREAD) return true; - /* When mem_cgroup_out_of_memory() and p is not member of the group */ - if (memcg && !task_in_mem_cgroup(p, memcg)) - return true; - /* p may not have freeable memory in nodemask */ if (!has_intersects_mems_allowed(p, nodemask)) return true; @@ -194,20 +190,19 @@ static bool is_dump_unreclaim_slabs(void) * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation - * @memcg: task's memory controller, if constrained * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. 
*/ -unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, +unsigned long oom_badness(struct task_struct *p, const nodemask_t *nodemask, unsigned long totalpages) { long points; long adj; - if (oom_unkillable_task(p, memcg, nodemask)) + if (oom_unkillable_task(p, nodemask)) return 0; p = find_lock_task_mm(p); @@ -318,7 +313,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) struct oom_control *oc = arg; unsigned long points; - if (oom_unkillable_task(task, NULL, oc->nodemask)) + if (oom_unkillable_task(task, oc->nodemask)) goto next; /* @@ -342,7 +337,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto select; } - points = oom_badness(task, NULL, oc->nodemask, oc->totalpages); + points = oom_badness(task, oc->nodemask, oc->totalpages); if (!points || points < oc->chosen_points) goto next; @@ -387,7 +382,7 @@ static int dump_task(struct task_struct *p, void *arg) struct oom_control *oc = arg; struct task_struct *task; - if (oom_unkillable_task(p, NULL, oc->nodemask)) + if (oom_unkillable_task(p, oc->nodemask)) return 0; task = find_lock_task_mm(p); @@ -1084,7 +1079,7 @@ bool out_of_memory(struct oom_control *oc) check_panic_on_oom(oc); if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && - current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) && + current->mm && !oom_unkillable_task(current, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); oc->chosen = current; -- cgit v1.2.3 From ac311a14c682dcd8a120a6244d0542ec654e3d93 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 11 Jul 2019 21:00:31 -0700 Subject: oom: decouple mems_allowed from oom_unkillable_task Commit ef08e3b4981a ("[PATCH] cpusets: confine oom_killer to mem_exclusive cpuset") introduces a heuristic where a potential oom-killer victim is skipped if the intersection of the potential victim and the current (the process triggered the oom) is empty based on the reason that 
killing such victim most probably will not help the current allocating process. However the commit 7887a3da753e ("[PATCH] oom: cpuset hint") changed the heuristic to just decrease the oom_badness scores of such potential victim based on the reason that the cpuset of such processes might have changed and previously they may have allocated memory on mems where the current allocating process can allocate from. Unintentionally 7887a3da753e ("[PATCH] oom: cpuset hint") introduced a side effect as the oom_badness is also exposed to the user space through /proc/[pid]/oom_score, so, readers with different cpusets can read different oom_score of the same process. Later, commit 6cf86ac6f36b ("oom: filter tasks not sharing the same cpuset") fixed the side effect introduced by 7887a3da753e by moving the cpuset intersection back to only oom-killer context and out of oom_badness. However the combination of ab290adbaf8f ("oom: make oom_unkillable_task() helper function") and 26ebc984913b ("oom: /proc/<pid>/oom_score treat kernel thread honestly") unintentionally brought back the cpuset intersection check into the oom_badness calculation function.
Other than doing cpuset/mempolicy intersection from oom_badness, the memcg oom context is also doing cpuset/mempolicy intersection which is quite wrong and is caught by syzcaller with the following report: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 28426 Comm: syz-executor.5 Not tainted 5.2.0-rc3-next-20190607 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline] RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline] RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155 Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff RSP: 0018:ffff888000127490 EFLAGS: 00010a03 RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001 RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0 R10: ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007 R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6 FS: 00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000607304 CR3: 000000009237e000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 Call Trace: oom_evaluate_task+0x49/0x520 mm/oom_kill.c:321 mem_cgroup_scan_tasks+0xcc/0x180 mm/memcontrol.c:1169 select_bad_process mm/oom_kill.c:374 [inline] out_of_memory mm/oom_kill.c:1088 [inline] out_of_memory+0x6b2/0x1280 mm/oom_kill.c:1035 mem_cgroup_out_of_memory+0x1ca/0x230 mm/memcontrol.c:1573 mem_cgroup_oom 
mm/memcontrol.c:1905 [inline] try_charge+0xfbe/0x1480 mm/memcontrol.c:2468 mem_cgroup_try_charge+0x24d/0x5e0 mm/memcontrol.c:6073 mem_cgroup_try_charge_delay+0x1f/0xa0 mm/memcontrol.c:6088 do_huge_pmd_wp_page_fallback+0x24f/0x1680 mm/huge_memory.c:1201 do_huge_pmd_wp_page+0x7fc/0x2160 mm/huge_memory.c:1359 wp_huge_pmd mm/memory.c:3793 [inline] __handle_mm_fault+0x164c/0x3eb0 mm/memory.c:4006 handle_mm_fault+0x3b7/0xa90 mm/memory.c:4053 do_user_addr_fault arch/x86/mm/fault.c:1455 [inline] __do_page_fault+0x5ef/0xda0 arch/x86/mm/fault.c:1521 do_page_fault+0x71/0x57d arch/x86/mm/fault.c:1552 page_fault+0x1e/0x30 arch/x86/entry/entry_64.S:1156 RIP: 0033:0x400590 Code: 06 e9 49 01 00 00 48 8b 44 24 10 48 0b 44 24 28 75 1f 48 8b 14 24 48 8b 7c 24 20 be 04 00 00 00 e8 f5 56 00 00 48 8b 74 24 08 <89> 06 e9 1e 01 00 00 48 8b 44 24 08 48 8b 14 24 be 04 00 00 00 8b RSP: 002b:00007fff7bc49780 EFLAGS: 00010206 RAX: 0000000000000001 RBX: 0000000000760000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 000000002000cffc RDI: 0000000000000001 RBP: fffffffffffffffe R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000075 R11: 0000000000000246 R12: 0000000000760008 R13: 00000000004c55f2 R14: 0000000000000000 R15: 00007fff7bc499b0 Modules linked in: ---[ end trace a65689219582ffff ]--- RIP: 0010:__read_once_size include/linux/compiler.h:194 [inline] RIP: 0010:has_intersects_mems_allowed mm/oom_kill.c:84 [inline] RIP: 0010:oom_unkillable_task mm/oom_kill.c:168 [inline] RIP: 0010:oom_unkillable_task+0x180/0x400 mm/oom_kill.c:155 Code: c1 ea 03 80 3c 02 00 0f 85 80 02 00 00 4c 8b a3 10 07 00 00 48 b8 00 00 00 00 00 fc ff df 4d 8d 74 24 10 4c 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 67 02 00 00 49 8b 44 24 10 4c 8d a0 68 fa ff ff RSP: 0018:ffff888000127490 EFLAGS: 00010a03 RAX: dffffc0000000000 RBX: ffff8880a4cd5438 RCX: ffffffff818dae9c RDX: 100000000c3cc602 RSI: ffffffff818dac8d RDI: 0000000000000001 RBP: ffff8880001274d0 R08: ffff888000086180 R09: ffffed1015d26be0 R10: 
ffffed1015d26bdf R11: ffff8880ae935efb R12: 8000000061e63007 R13: 0000000000000000 R14: 8000000061e63017 R15: 1ffff11000024ea6 FS: 00005555561f5940(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2f823000 CR3: 000000009237e000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000600 The fix is to decouple the cpuset/mempolicy intersection check from oom_unkillable_task() and make sure cpuset/mempolicy intersection check is only done in the global oom context. [shakeelb@google.com: change function name and update comment] Link: http://lkml.kernel.org/r/20190628152421.198994-3-shakeelb@google.com Link: http://lkml.kernel.org/r/20190624212631.87212-3-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: syzbot+d0fc9d3c166bc5e4a94b@syzkaller.appspotmail.com Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: David Rientjes Cc: Johannes Weiner Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Jackson Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +-- include/linux/oom.h | 1 - mm/oom_kill.c | 57 ++++++++++++++++++++++++++++++----------------------- 3 files changed, 33 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 64dadd469786..77eb628ecc7f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -532,8 +532,7 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, unsigned long totalpages = totalram_pages() + total_swap_pages; unsigned long points = 0; - points = oom_badness(task, NULL, totalpages) * - 1000 / totalpages; + points = oom_badness(task, totalpages) * 1000 / totalpages; seq_printf(m, "%lu\n", points); return 0; diff --git a/include/linux/oom.h b/include/linux/oom.h index b75104690311..c696c265f019 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h 
@@ -108,7 +108,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, - const nodemask_t *nodemask, unsigned long totalpages); extern bool out_of_memory(struct oom_control *oc); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b353f468a36a..d1c9c4e66d59 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -64,21 +64,33 @@ int sysctl_oom_dump_tasks = 1; */ DEFINE_MUTEX(oom_lock); +static inline bool is_memcg_oom(struct oom_control *oc) +{ + return oc->memcg != NULL; +} + #ifdef CONFIG_NUMA /** - * has_intersects_mems_allowed() - check task eligiblity for kill + * oom_cpuset_eligible() - check task eligiblity for kill * @start: task struct of which task to consider * @mask: nodemask passed to page allocator for mempolicy ooms * * Task eligibility is determined by whether or not a candidate task, @tsk, * shares the same mempolicy nodes as current if it is bound by such a policy * and whether or not it has the same set of allowed cpuset nodes. + * + * This function is assuming oom-killer context and 'current' has triggered + * the oom-killer. 
*/ -static bool has_intersects_mems_allowed(struct task_struct *start, - const nodemask_t *mask) +static bool oom_cpuset_eligible(struct task_struct *start, + struct oom_control *oc) { struct task_struct *tsk; bool ret = false; + const nodemask_t *mask = oc->nodemask; + + if (is_memcg_oom(oc)) + return true; rcu_read_lock(); for_each_thread(start, tsk) { @@ -105,8 +117,7 @@ static bool has_intersects_mems_allowed(struct task_struct *start, return ret; } #else -static bool has_intersects_mems_allowed(struct task_struct *tsk, - const nodemask_t *mask) +static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc) { return true; } @@ -146,24 +157,13 @@ static inline bool is_sysrq_oom(struct oom_control *oc) return oc->order == -1; } -static inline bool is_memcg_oom(struct oom_control *oc) -{ - return oc->memcg != NULL; -} - /* return true if the task is not adequate as candidate victim task. */ -static bool oom_unkillable_task(struct task_struct *p, - const nodemask_t *nodemask) +static bool oom_unkillable_task(struct task_struct *p) { if (is_global_init(p)) return true; if (p->flags & PF_KTHREAD) return true; - - /* p may not have freeable memory in nodemask */ - if (!has_intersects_mems_allowed(p, nodemask)) - return true; - return false; } @@ -190,19 +190,17 @@ static bool is_dump_unreclaim_slabs(void) * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation - * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the * task consuming the most memory to avoid subsequent oom failures. 
*/ -unsigned long oom_badness(struct task_struct *p, - const nodemask_t *nodemask, unsigned long totalpages) +unsigned long oom_badness(struct task_struct *p, unsigned long totalpages) { long points; long adj; - if (oom_unkillable_task(p, nodemask)) + if (oom_unkillable_task(p)) return 0; p = find_lock_task_mm(p); @@ -313,7 +311,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) struct oom_control *oc = arg; unsigned long points; - if (oom_unkillable_task(task, oc->nodemask)) + if (oom_unkillable_task(task)) + goto next; + + /* p may not have freeable memory in nodemask */ + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc)) goto next; /* @@ -337,7 +339,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) goto select; } - points = oom_badness(task, oc->nodemask, oc->totalpages); + points = oom_badness(task, oc->totalpages); if (!points || points < oc->chosen_points) goto next; @@ -382,7 +384,11 @@ static int dump_task(struct task_struct *p, void *arg) struct oom_control *oc = arg; struct task_struct *task; - if (oom_unkillable_task(p, oc->nodemask)) + if (oom_unkillable_task(p)) + return 0; + + /* p may not have freeable memory in nodemask */ + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc)) return 0; task = find_lock_task_mm(p); @@ -1079,7 +1085,8 @@ bool out_of_memory(struct oom_control *oc) check_panic_on_oom(oc); if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && - current->mm && !oom_unkillable_task(current, oc->nodemask) && + current->mm && !oom_unkillable_task(current) && + oom_cpuset_eligible(current, oc) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); oc->chosen = current; -- cgit v1.2.3