From 6933599697c96c3213c95f5f1fc7cb6abfd08c54 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:43:43 -0800 Subject: inotify: hide internal kernel bits from fdinfo There was a report that my patch: inotify: actually check for invalid bits in sys_inotify_add_watch() broke CRIU. The reason is that CRIU looks up raw flags in /proc/$pid/fdinfo/* to figure out how to rebuild inotify watches and then passes those flags directly back in to the inotify API. One of those flags (FS_EVENT_ON_CHILD) is set in mark->mask, but is not part of the inotify API. It is used inside the kernel to _implement_ inotify but it is not and has never been part of the API. My patch above ensured that we only allow bits which are part of the API (IN_ALL_EVENTS). This broke CRIU. FS_EVENT_ON_CHILD is really internal to the kernel. It is set _anyway_ on all inotify marks. So, CRIU was really just trying to set a bit that was already set. This patch hides that bit from fdinfo. CRIU will not see the bit, not try to set it, and should work as before. We should not have been exposing this bit in the first place, so this is a good patch independent of the CRIU problem. Signed-off-by: Dave Hansen Reported-by: Andrey Wagin Acked-by: Andrey Vagin Acked-by: Cyrill Gorcunov Acked-by: Eric Paris Cc: Pavel Emelyanov Cc: John McCutchan Cc: Robert Love Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fdinfo.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 6b6f0d472ae8..fd98e5100cab 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(mark->inode); if (inode) { + /* + * IN_ALL_EVENTS represents all of the mask bits + * that we expose to userspace. There is at + * least one bit (FS_EVENT_ON_CHILD) which is + * used only internally to the kernel. + */ + u32 mask = mark->mask & IN_ALL_EVENTS; seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, - mark->mask, mark->ignored_mask); + mask, mark->ignored_mask); show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); -- cgit v1.2.3 From d30e2c05a1a231452c273e74851d6b70d516f7f2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 5 Nov 2015 18:43:46 -0800 Subject: inotify: actually check for invalid bits in sys_inotify_add_watch() The comment here says that it is checking for invalid bits. But, the mask is *actually* checking to ensure that _any_ valid bit is set, which is quite different. Without this check, an unexpected bit could get set on an inotify object. Since these bits are also interpreted by the fsnotify/dnotify code, there is the potential for an object to be mishandled inside the kernel. For instance, can we be sure that setting the dnotify flag FS_DN_RENAME on an inotify watch is harmless? Add the actual check which was intended. Retain the existing inotify bits are being added to the watch. Plus, this is existing behavior which would be nice to preserve. I did a quick sniff test that inotify functions and that my 'inotify-tools' package passes 'make check'. Signed-off-by: Dave Hansen Cc: John McCutchan Cc: Robert Love Cc: Eric Paris Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/inotify/inotify_user.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 5b1e2a497e51..b8d08d0d0a4d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, int ret; unsigned flags = 0; - /* don't allow invalid bits: we don't want flags set */ + /* + * We share a lot of code with fs/dnotify. We also share + * the bit layout between inotify's IN_* and the fsnotify + * FS_*. This check ensures that only the inotify IN_* + * bits get passed in and set in watches/events. + */ + if (unlikely(mask & ~ALL_INOTIFY_BITS)) + return -EINVAL; + /* + * Require at least one valid bit set in the mask. + * Without _something_ set, we would have no events to + * watch for. + */ if (unlikely(!(mask & ALL_INOTIFY_BITS))) return -EINVAL; -- cgit v1.2.3 From ce4f2fd7eacd84d78530fb97621621ae50d167f1 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Thu, 5 Nov 2015 18:43:49 -0800 Subject: logfs: fix build warning fs/logfs/dev_bdev.c: In function '__bdev_writeseg': include/linux/kernel.h:601:17: warning: comparison of distinct pointer types lacks a cast [enabled by default] (void) (&_min1 == &_min2); \ fs/logfs/dev_bdev.c:84:14: note: in expansion of macro 'min' max_pages = min(nr_pages, BIO_MAX_PAGES); fs/logfs/dev_bdev.c: In function 'do_erase': include/linux/kernel.h:601:17: warning: comparison of distinct pointer types lacks a cast [enabled by default] (void) (&_min1 == &_min2); \ fs/logfs/dev_bdev.c:174:14: note: in expansion of macro 'min' max_pages = min(nr_pages, BIO_MAX_PAGES); Lets use min_t and mention the type. Signed-off-by: Sudip Mukherjee Cc: Joern Engel Cc: Prasad Joshi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/logfs/dev_bdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index a7fdbd868474..a709d80c8ebc 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, BIO_MAX_PAGES); + max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); @@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, unsigned int max_pages; int i; - max_pages = min(nr_pages, BIO_MAX_PAGES); + max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOFS, max_pages); BUG_ON(!bio); -- cgit v1.2.3 From d162eaad7726e5f10a2b5813bdfac9d55c8eba69 Mon Sep 17 00:00:00 2001 From: "Norton.Zhu" Date: Thu, 5 Nov 2015 18:43:52 -0800 Subject: ocfs2_direct_IO_write() misses ocfs2_is_overwrite() error code If ocfs2_is_overwrite failed, ocfs2_direct_IO_write mays till return success to the caller. Signed-off-by: Norton.Zhu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 64b11d90eca6..f04914cc19a4 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -864,6 +864,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, is_overwrite = ocfs2_is_overwrite(osb, inode, offset); if (is_overwrite < 0) { mlog_errno(is_overwrite); + ret = is_overwrite; ocfs2_inode_unlock(inode, 1); goto clean_orphan; } -- cgit v1.2.3 From 4e357b932a665e62eb0642633057b8d7156ed8db Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Thu, 5 Nov 2015 18:43:55 -0800 Subject: ocfs2: fill in the unused portion of the block with zeros by dio_zero_block() A simplified test case is (this case from Ryan): 1) dd if=/dev/zero of=/mnt/hello bs=512 count=1 oflag=direct; 2) truncate /mnt/hello -s 2097152 file 'hello' is not exist before test. After this command, file 'hello' should be all zero. But 512~4096 is some random data. Setting bh state to new when get a new block, if so, direct_io_worker()->dio_zero_block() will fill-in the unused portion of the block with zero. Signed-off-by: Yiwen Jiang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f04914cc19a4..7f604727f487 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = -EIO; goto bail; } + set_buffer_new(bh_result); up_write(&OCFS2_I(inode)->ip_alloc_sem); } -- cgit v1.2.3 From 1d1aff8cf367d2216a678c722161784e207965c4 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:43:58 -0800 Subject: ocfs2: improve performance for localalloc Currently cluster allocation is always trying to find a victim chain (a chian has most space), and this may lead to poor performance because of discontiguous allocation in some scenarios. Our test case is block size 4k, cluster size 1M and mount option with localalloc=2048 (2G), since a gd is 32256M (about 31.5G) and a localalloc window is only 2G, creating 50G file will result in 2G from gd0, 2G from gd1, ... One way to improve performance is enlarge localalloc window size (max 31104M), but this will make end user feel that about 30G is suddenly "missing", and localalloc currently do not support steal, which means one node cannot use another node's localalloc even it is not used in fact. So using the last gd to record the allocation and continues with the gd if it has enough space for a localalloc window can make the allocation as more contiguous as possible. Our test result is below (evaluated in IOPS), which is using iometer running in VM, dynamic vhd virtual disk stored in ocfs2. IO model Original After Improved(%) 16K60%Write100%Random 703 876 24.59% 8K90%Write100%Random 735 827 12.59% 4K100%Write100%Random 859 915 6.52% 4K100%Read100%Random 2092 2600 24.30% Signed-off-by: Joseph Qi Tested-by: Norton Zhu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d83d2602cf2b..fc6d25f6d444 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { - hint = ocfs2_group_from_res(res); + if (ocfs2_is_cluster_bitmap(ac->ac_inode)) + hint = res->sr_bg_blkno; + else + hint = ocfs2_group_from_res(res); goto set_hint; } if (status < 0 && status != -ENOSPC) { -- cgit v1.2.3 From 30edc43c7ff0760f6896c37c06a84533546588fa Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:01 -0800 Subject: ocfs2: do not include dio entry in case of orphan scan dio entry will only do truncate in case of ORPHAN_NEED_TRUNCATE. So do not include it when doing normal orphan scan to reduce contention. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.c | 15 ++++++++++++--- fs/ocfs2/namei.c | 2 -- fs/ocfs2/namei.h | 3 +++ 3 files changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff82b28462a6..4bac6007837d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2021,6 +2021,7 @@ struct ocfs2_orphan_filldir_priv { struct dir_context ctx; struct inode *head; struct ocfs2_super *osb; + enum ocfs2_orphan_reco_type orphan_reco_type; }; static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, @@ -2036,6 +2037,12 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (name_len == 2 && !strncmp("..", name, 2)) return 0; + /* do not include dio entry in case of orphan scan */ + if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) && + (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN))) + return 0; + /* Skip bad inodes so that recovery can continue */ iter = ocfs2_iget(p->osb, ino, OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0); @@ -2060,14 +2067,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, static int ocfs2_queue_orphans(struct ocfs2_super *osb, int slot, - struct inode **head) + struct inode **head, + enum ocfs2_orphan_reco_type orphan_reco_type) { int status; struct inode *orphan_dir_inode = NULL; struct ocfs2_orphan_filldir_priv priv = { .ctx.actor = ocfs2_orphan_filldir, .osb = osb, - .head = *head + .head = *head, + .orphan_reco_type = orphan_reco_type }; orphan_dir_inode = ocfs2_get_system_file_inode(osb, @@ -2170,7 +2179,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, trace_ocfs2_recover_orphans(slot); ocfs2_mark_recovering_orphan_dir(osb, slot); - ret = ocfs2_queue_orphans(osb, slot, &inode); + ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type); ocfs2_clear_recovering_orphan_dir(osb, slot); /* Error here should be noted, but we want to continue with as diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b7dfac226b1e..1206392a071f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) -#define OCFS2_DIO_ORPHAN_PREFIX "dio-" -#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e173329eb830..1155918d6784 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -26,6 +26,9 @@ #ifndef OCFS2_NAMEI_H #define OCFS2_NAMEI_H +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 + extern const struct inode_operations ocfs2_dir_iops; struct dentry *ocfs2_get_parent(struct dentry *child); -- cgit v1.2.3 From 93d911fcce259a3f950ee20592beee31b855cd96 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:04 -0800 Subject: ocfs2: only take lock if dio entry when recover orphans We have no need to take inode mutex, rw and inode lock if it is not dio entry when recover orphans. Optimize it by adding a flag OCFS2_INODE_DIO_ORPHAN_ENTRY to ocfs2_inode_info to reduce contention. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.h | 2 ++ fs/ocfs2/journal.c | 86 +++++++++++++++++++++++++++++------------------------- 2 files changed, 49 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ca3431ee7f24..aac8b86f312e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -112,6 +112,8 @@ struct ocfs2_inode_info #define OCFS2_INODE_OPEN_DIRECT 0x00000020 /* Tell the inode wipe code it's not in orphan dir */ #define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040 +/* Entry in orphan dir with 'dio-' prefix */ +#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4bac6007837d..04d6303f39a7 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2049,6 +2049,10 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (IS_ERR(iter)) return 0; + if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX, + OCFS2_DIO_ORPHAN_PREFIX_LEN)) + OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY; + /* Skip inodes which are already added to recover list, since dio may * happen concurrently with unlink/rename */ if (OCFS2_I(iter)->ip_next_orphan) { @@ -2195,25 +2199,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; - mutex_lock(&inode->i_mutex); - ret = ocfs2_rw_lock(inode, 1); - if (ret < 0) { - mlog_errno(ret); - goto next; - } - /* - * We need to take and drop the inode lock to - * force read inode from disk. - */ - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret) { - mlog_errno(ret); - goto unlock_rw; - } + if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { + mutex_lock(&inode->i_mutex); + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto unlock_mutex; + } + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto unlock_rw; + } - di = (struct ocfs2_dinode *)di_bh->b_data; + di = (struct ocfs2_dinode *)di_bh->b_data; - if (inode->i_nlink == 0) { + if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto unlock_inode; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, + di_bh, 0, 0); + if (ret) + mlog_errno(ret); + } +unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; +unlock_rw: + ocfs2_rw_unlock(inode, 1); +unlock_mutex: + mutex_unlock(&inode->i_mutex); + + /* clear dio flag in ocfs2_inode_info */ + oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; + } else { spin_lock(&oi->ip_lock); /* Set the proper information to get us going into * ocfs2_delete_inode. */ @@ -2221,28 +2251,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, spin_unlock(&oi->ip_lock); } - if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && - (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { - ret = ocfs2_truncate_file(inode, di_bh, - i_size_read(inode)); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto unlock_inode; - } - - ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); - if (ret) - mlog_errno(ret); - } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ -unlock_inode: - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; -unlock_rw: - ocfs2_rw_unlock(inode, 1); -next: - mutex_unlock(&inode->i_mutex); iput(inode); inode = iter; } -- cgit v1.2.3 From 0986fe9b50f425ec81f25a1a85aaf3574b31d801 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:07 -0800 Subject: ocfs2: fix race between mount and delete node/cluster There is a race case between mount and delete node/cluster, which will lead o2hb_thread to malfunctioning dead loop. o2hb_thread { o2nm_depend_this_node(); <<<<<< race window, node may have already been deleted, and then enter the loop, o2hb thread will be malfunctioning because of no configured nodes found. while (!kthread_should_stop() && !reg->hr_unclean_stop && !reg->hr_aborted_start) { } So check the return value of o2nm_depend_this_node() is needed. If node has been deleted, do not enter the loop and let mount fail. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index fa15debcc02b..ddddef0021a0 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -219,7 +219,8 @@ struct o2hb_region { unsigned hr_unclean_stop:1, hr_aborted_start:1, hr_item_pinned:1, - hr_item_dropped:1; + hr_item_dropped:1, + hr_node_deleted:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data) set_user_nice(current, MIN_NICE); /* Pin node */ - o2nm_depend_this_node(); + ret = o2nm_depend_this_node(); + if (ret) { + mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret); + reg->hr_node_deleted = 1; + wake_up(&o2hb_steady_queue); + return 0; + } while (!kthread_should_stop() && !reg->hr_unclean_stop && !reg->hr_aborted_start) { @@ -1787,7 +1794,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, spin_unlock(&o2hb_live_lock); ret = wait_event_interruptible(o2hb_steady_queue, - atomic_read(®->hr_steady_iterations) == 0); + atomic_read(®->hr_steady_iterations) == 0 || + reg->hr_node_deleted); if (ret) { atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; @@ -1798,6 +1806,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out3; } + if (reg->hr_node_deleted) { + ret = -EINVAL; + goto out3; + } + /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; -- cgit v1.2.3 From b1529a41f777a48f95d4af29668b70ffe3360e1b Mon Sep 17 00:00:00 2001 From: alex chen Date: Thu, 5 Nov 2015 18:44:10 -0800 Subject: ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error In ocfs2_mknod_locked if '__ocfs2_mknod_locke d' returns an error, we should reclaim the inode successfully claimed above, otherwise, the inode never be reused. The case is described below: ocfs2_mknod ocfs2_mknod_locked ocfs2_claim_new_inode Successfully claim the inode __ocfs2_mknod_locked ocfs2_journal_access_di Failed because of -ENOMEM or other reasons, the inode lockres has not been initialized yet. iput(inode) ocfs2_evict_inode ocfs2_delete_inode ocfs2_inode_lock ocfs2_inode_lock_full_nested __ocfs2_cluster_lock Return -EINVAL because of the inode lockres has not been initialized. So the following operations are not performed ocfs2_wipe_inode ocfs2_remove_inode ocfs2_free_dinode ocfs2_free_suballoc_bits Signed-off-by: Alex Chen Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 1206392a071f..3b48ac25d8a7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -655,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, return status; } - return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); + if (status < 0) { + u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); + int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, + inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); + if (tmp) + mlog_errno(tmp); + } + + return status; } static int ocfs2_mkdir(struct inode *dir, -- cgit v1.2.3 From 5afc44e2e9678c0808211f1662732b368cc25f76 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:13 -0800 Subject: ocfs2: add uuid to ocfs2 thread name for problem analysis A node can mount multiple ocfs2 volumes. And if thread names are same for each volume/domain, it will bring inconvenience when analyzing problems because we have to identify which volume/domain the messages belong to. Since thread name will be printed to messages, so add volume uuid or dlm name to thread name can benefit problem analysis. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Gang He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 4 +++- fs/ocfs2/dlm/dlmrecovery.c | 2 +- fs/ocfs2/dlm/dlmthread.c | 3 ++- fs/ocfs2/dlmglue.c | 3 ++- fs/ocfs2/journal.c | 4 ++-- 5 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6918f30d02cd..2ee7fe747cea 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) int status; unsigned int backoff; unsigned int total_backoff = 0; + char wq_name[O2NM_MAX_NAME_LEN]; BUG_ON(!dlm); @@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name); + dlm->dlm_worker = create_singlethread_workqueue(wq_name); if (!dlm->dlm_worker) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 58eaa5c0d387..9e4f862d20fe 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) mlog(0, "starting dlm recovery thread...\n"); dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, - "dlm_reco_thread"); + "dlm_reco-%s", dlm->name); if (IS_ERR(dlm->dlm_reco_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); dlm->dlm_reco_thread_task = NULL; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 2e5e6d5fffe8..c5f6c241ecd7 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm) { mlog(0, "Starting dlm_thread...\n"); - dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s", + dlm->name); if (IS_ERR(dlm->dlm_thread_task)) { mlog_errno(PTR_ERR(dlm->dlm_thread_task)); dlm->dlm_thread_task = NULL; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1c91103c1333..20276e340339 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2998,7 +2998,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) } /* launch downconvert thread */ - osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc"); + osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s", + osb->uuid_str); if (IS_ERR(osb->dc_task)) { status = PTR_ERR(osb->dc_task); osb->dc_task = NULL; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 04d6303f39a7..13534f4fe5b5 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1090,7 +1090,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) /* Launch the commit thread */ if (!local) { osb->commit_task = kthread_run(ocfs2_commit_thread, osb, - "ocfs2cmt"); + "ocfs2cmt-%s", osb->uuid_str); if (IS_ERR(osb->commit_task)) { status = PTR_ERR(osb->commit_task); osb->commit_task = NULL; @@ -1507,7 +1507,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) goto out; osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, - "ocfs2rec"); + "ocfs2rec-%s", osb->uuid_str); if (IS_ERR(osb->recovery_thread_task)) { mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); osb->recovery_thread_task = NULL; -- cgit v1.2.3 From 262d8a8779e24596ec3452ebb7d0a989de91089a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 5 Nov 2015 18:44:16 -0800 Subject: ocfs2: clean up unused variable in ocfs2_duplicate_clusters_by_page() readahead_pages in ocfs2_duplicate_clusters_by_page is defined but not used, so clean it up. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e5d57cd32505..252119860e6c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle, u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; - unsigned int from, to, readahead_pages; + unsigned int from, to; loff_t offset, end, map_end; struct address_space *mapping = inode->i_mapping; trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster, new_cluster, new_len); - readahead_pages = - (ocfs2_cow_contig_clusters(sb) << - OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); /* -- cgit v1.2.3 From b64787401fd85b66403dd05159a749e333059c0a Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Thu, 5 Nov 2015 18:44:21 -0800 Subject: 9p: do not overwrite return code when locking fails If the remote locking fail, we run a local vfs unlock that should work and return success to userland when we didn't actually lock at all. We need to tell the application that tried to lock that it didn't get it, not that all went well. Signed-off-by: Dominique Martinet Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3abc447783aa..6b747394f6f5 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -231,7 +231,8 @@ out_unlock: if (res < 0 && fl->fl_type != F_UNLCK) { fl_type = fl->fl_type; fl->fl_type = F_UNLCK; - res = posix_lock_file_wait(filp, fl); + /* Even if this fails we want to return the remote error */ + posix_lock_file_wait(filp, fl); fl->fl_type = fl_type; } out: -- cgit v1.2.3 From 25ee01a2fca02dfb5a3ce316e77910c468108199 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:11 -0800 Subject: mm: hugetlb: proc: add hugetlb-related fields to /proc/PID/smaps Currently /proc/PID/smaps provides no usage info for vma(VM_HUGETLB), which is inconvenient when we want to know per-task or per-vma base hugetlb usage. To solve this, this patch adds new fields for hugetlb usage like below: Size: 20480 kB Rss: 0 kB Pss: 0 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 0 kB Anonymous: 0 kB AnonHugePages: 0 kB Shared_Hugetlb: 18432 kB Private_Hugetlb: 2048 kB Swap: 0 kB KernelPageSize: 2048 kB MMUPageSize: 2048 kB Locked: 0 kB VmFlags: rd wr mr mw me de ht [hughd@google.com: fix Private_Hugetlb alignment ] Signed-off-by: Naoya Horiguchi Acked-by: Joern Engel Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Mike Kravetz Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 8 ++++++++ fs/proc/task_mmu.c | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 3a9d65c912e7..a7d6c06f36c4 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -424,6 +424,9 @@ Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB +AnonHugePages: 0 kB +Shared_Hugetlb: 0 kB +Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB KernelPageSize: 4 kB @@ -452,6 +455,11 @@ and a page is modified, the file page is replaced by a private anonymous copy. "Swap" shows how much would-be-anonymous memory is also used, but out on swap. "SwapPss" shows proportional swap share of this mapping. +"AnonHugePages" shows the ammount of memory backed by transparent hugepage. +"Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by +hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical +reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. + "VmFlags" field deserves a separate description. This member represents the kernel flags associated with the particular virtual memory area in two letter encoded manner. The codes are the following: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b029d426c558..5988b83836fb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -446,6 +446,8 @@ struct mem_size_stats { unsigned long anonymous; unsigned long anonymous_thp; unsigned long swap; + unsigned long shared_hugetlb; + unsigned long private_hugetlb; u64 pss; u64 swap_pss; }; @@ -625,12 +627,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) seq_putc(m, '\n'); } +#ifdef CONFIG_HUGETLB_PAGE +static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + struct page *page = NULL; + + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); + + if (is_migration_entry(swpent)) + page = migration_entry_to_page(swpent); + } + if (page) { + int mapcount = page_mapcount(page); + + if (mapcount >= 2) + mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); + else + mss->private_hugetlb += huge_page_size(hstate_vma(vma)); + } + return 0; +} +#endif /* HUGETLB_PAGE */ + static int show_smap(struct seq_file *m, void *v, int is_pid) { struct vm_area_struct *vma = v; struct mem_size_stats mss; struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range, +#ifdef CONFIG_HUGETLB_PAGE + .hugetlb_entry = smaps_hugetlb_range, +#endif .mm = vma->vm_mm, .private = &mss, }; @@ -652,6 +686,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" + "Shared_Hugetlb: %8lu kB\n" + "Private_Hugetlb: %7lu kB\n" "Swap: %8lu kB\n" "SwapPss: %8lu kB\n" "KernelPageSize: %8lu kB\n" @@ -667,6 +703,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.referenced >> 10, mss.anonymous >> 10, mss.anonymous_thp >> 10, + mss.shared_hugetlb >> 10, + mss.private_hugetlb >> 10, mss.swap >> 10, (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), vma_kernel_pagesize(vma) >> 10, -- cgit v1.2.3 From 5d317b2b6536592a9b51fe65faed43d65ca9158e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 5 Nov 2015 18:47:14 -0800 Subject: mm: hugetlb: proc: add HugetlbPages field to /proc/PID/status Currently there's no easy way to get per-process usage of hugetlb pages, which is inconvenient because userspace applications which use hugetlb typically want to control their processes on the basis of how much memory (including hugetlb) they use. So this patch simply provides easy access to the info via /proc/PID/status. Signed-off-by: Naoya Horiguchi Acked-by: Joern Engel Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 2 ++ fs/proc/task_mmu.c | 1 + include/linux/hugetlb.h | 19 +++++++++++++++++++ include/linux/mm_types.h | 3 +++ mm/hugetlb.c | 9 +++++++++ mm/rmap.c | 4 +++- 6 files changed, 37 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a7d6c06f36c4..12ac0e4455d4 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -175,6 +175,7 @@ read the file /proc/PID/status: VmLib: 1412 kB VmPTE: 20 kb VmSwap: 0 kB + HugetlbPages: 0 kB Threads: 1 SigQ: 0/28578 SigPnd: 0000000000000000 @@ -238,6 +239,7 @@ Table 1-2: Contents of the status files (as of 4.1) VmPTE size of page table entries VmPMD size of second level page tables VmSwap size of swap usage (the number of referred swapents) + HugetlbPages size of hugetlb memory portions Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5988b83836fb..288185e24762 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -70,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) ptes >> 10, pmds >> 10, swap << (PAGE_SHIFT-10)); + hugetlb_report_usage(m, mm); } unsigned long task_vsize(struct mm_struct *mm) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5e35379f58a5..685c262e0be8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -483,6 +483,17 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, #define hugepages_supported() (HPAGE_SHIFT != 0) #endif +void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm); + +static inline void hugetlb_count_add(long l, struct mm_struct *mm) +{ + atomic_long_add(l, &mm->hugetlb_usage); +} + +static inline void hugetlb_count_sub(long l, struct mm_struct *mm) +{ + atomic_long_sub(l, &mm->hugetlb_usage); +} #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page(v, a, r) NULL @@ -519,6 +530,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, { return &mm->page_table_lock; } + +static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) +{ +} + +static inline void hugetlb_count_sub(long l, struct mm_struct *mm) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3d6baa7d4534..0a85da25a822 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -486,6 +486,9 @@ struct mm_struct { /* address of the bounds directory */ void __user *bd_addr; #endif +#ifdef CONFIG_HUGETLB_PAGE + atomic_long_t hugetlb_usage; +#endif }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9cc773483624..abfbe8ca3323 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2790,6 +2790,12 @@ void hugetlb_show_meminfo(void) 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } +void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "HugetlbPages:\t%8lu kB\n", + atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); +} + /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { @@ -3025,6 +3031,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, get_page(ptepage); page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); + hugetlb_count_add(pages_per_huge_page(h), dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -3105,6 +3112,7 @@ again: if (huge_pte_dirty(pte)) set_page_dirty(page); + hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { @@ -3509,6 +3517,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); + hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); diff --git a/mm/rmap.c b/mm/rmap.c index f5b5c1f3dcd7..d40e7aefb888 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1352,7 +1352,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { - if (!PageHuge(page)) { + if (PageHuge(page)) { + hugetlb_count_sub(1 << compound_order(page), mm); + } else { if (PageAnon(page)) dec_mm_counter(mm, MM_ANONPAGES); else -- cgit v1.2.3 From aa750fd71c242dba02ee2034e15fbd7d0cdb2461 Mon Sep 17 00:00:00 2001 From: Junichi Nomura Date: Thu, 5 Nov 2015 18:47:23 -0800 Subject: mm/filemap.c: make global sync not clear error status of individual inodes filemap_fdatawait() is a function to wait for on-going writeback to complete but also consume and clear error status of the mapping set during writeback. The latter functionality is critical for applications to detect writeback error with system calls like fsync(2)/fdatasync(2). However filemap_fdatawait() is also used by sync(2) or FIFREEZE ioctl, which don't check error status of individual mappings. As a result, fsync() may not be able to detect writeback error if events happen in the following order: Application System admin ---------------------------------------------------------- write data on page cache Run sync command writeback completes with error filemap_fdatawait() clears error fsync returns success (but the data is not on disk) This patch adds filemap_fdatawait_keep_errors() for call sites where writeback error is not handled so that they don't clear error status. Signed-off-by: Jun'ichi Nomura Acked-by: Andi Kleen Reviewed-by: Tejun Heo Cc: Fengguang Wu Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 7 +++++- fs/sync.c | 7 +++++- include/linux/fs.h | 1 + mm/filemap.c | 67 +++++++++++++++++++++++++++++++++++++++++++----------- 4 files changed, 67 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7378169e90be..206a68b1db1a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2149,7 +2149,12 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); old_inode = inode; - filemap_fdatawait(mapping); + /* + * We keep the error status of individual mapping so that + * applications can catch the writeback error using fsync(2). + * See filemap_fdatawait_keep_errors() for details. + */ + filemap_fdatawait_keep_errors(mapping); cond_resched(); diff --git a/fs/sync.c b/fs/sync.c index fbc98ee62044..4ec430ae2b0d 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) static void fdatawait_one_bdev(struct block_device *bdev, void *arg) { - filemap_fdatawait(bdev->bd_inode->i_mapping); + /* + * We keep the error status of individual mapping so that + * applications can catch the writeback error using fsync(2). + * See filemap_fdatawait_keep_errors() for details. + */ + filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 72d8a844c692..9355f377fd46 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2422,6 +2422,7 @@ extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); +extern void filemap_fdatawait_keep_errors(struct address_space *); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); diff --git a/mm/filemap.c b/mm/filemap.c index 1fe962b49f31..884766da1165 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -331,23 +331,14 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); -/** - * filemap_fdatawait_range - wait for writeback to complete - * @mapping: address space structure to wait for - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Walk the list of under-writeback pages of the given address space - * in the given range and wait for all of them. - */ -int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, - loff_t end_byte) +static int __filemap_fdatawait_range(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; - int ret2, ret = 0; + int ret = 0; if (end_byte < start_byte) goto out; @@ -374,6 +365,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, cond_resched(); } out: + return ret; +} + +/** + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. Check error status of + * the address space and return it. + * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. + */ +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) +{ + int ret, ret2; + + ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); ret2 = filemap_check_errors(mapping); if (!ret) ret = ret2; @@ -382,12 +396,39 @@ out: } EXPORT_SYMBOL(filemap_fdatawait_range); +/** + * filemap_fdatawait_keep_errors - wait for writeback without clearing errors + * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. Unlike filemap_fdatawait(), this function + * does not clear error status of the address space. + * + * Use this function if callers don't handle errors themselves. Expected + * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), + * fsfreeze(8) + */ +void filemap_fdatawait_keep_errors(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return; + + __filemap_fdatawait_range(mapping, 0, i_size - 1); +} + /** * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space - * and wait for all of them. + * and wait for all of them. Check error status of the address space + * and return it. + * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. */ int filemap_fdatawait(struct address_space *mapping) { -- cgit v1.2.3 From 326c2597a3b93b8a89e0c3a2b41fd9971d095a97 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 5 Nov 2015 18:49:21 -0800 Subject: mm: clear pte in clear_soft_dirty() As mentioned in the commit 56eecdb912b5 ("mm: Use ptep/pmdp_set_numa() for updating _PAGE_NUMA bit"), architectures like ppc64 don't do tlb flush in set_pte/pmd functions. So when dealing with existing pte in clear_soft_dirty, the pte must be cleared before being modified. Signed-off-by: Laurent Dufour Reviewed-by: Aneesh Kumar K.V Cc: Pavel Emelyanov Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 288185e24762..ef44bb474ea2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -792,19 +792,20 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, pte_t ptent = *pte; if (pte_present(ptent)) { + ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); ptent = pte_wrprotect(ptent); ptent = pte_clear_soft_dirty(ptent); + ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); } - - set_pte_at(vma->vm_mm, addr, pte, ptent); } static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); -- cgit v1.2.3 From 5d3875a01eeafef7ef0b73a99462cb9bd55e8485 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 5 Nov 2015 18:49:24 -0800 Subject: mm: clear_soft_dirty_pmd() requires THP Don't build clear_soft_dirty_pmd() if transparent huge pages are not enabled. Signed-off-by: Laurent Dufour Reviewed-by: Aneesh Kumar K.V Cc: Pavel Emelyanov Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ef44bb474ea2..187b3b5f242e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -801,7 +801,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, set_pte_at(vma->vm_mm, addr, pte, ptent); } } +#else +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +} +#endif +#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -815,14 +822,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } - #else - -static inline void clear_soft_dirty(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) -{ -} - static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { -- cgit v1.2.3 From b72bdfa73603f2f81fbac651b9ae36807e877752 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 5 Nov 2015 18:50:32 -0800 Subject: mm, oom: add comment for why oom_adj exists /proc/pid/oom_adj exists solely to avoid breaking existing userspace binaries that write to the tunable. Add a comment in the only possible location within the kernel tree to describe the situation and motivation for keeping it around. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 29595af32866..bd3e9e68125b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1032,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, return simple_read_from_buffer(buf, count, ppos, buffer, len); } +/* + * /proc/pid/oom_adj exists solely for backwards compatibility with previous + * kernels. The effective policy is defined by oom_score_adj, which has a + * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. + * Values written to oom_adj are simply mapped linearly to oom_score_adj. + * Processes that become oom disabled via oom_adj will still be oom disabled + * with this implementation. + * + * oom_adj cannot be removed since existing userspace binaries use it. + */ static ssize_t oom_adj_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { -- cgit v1.2.3