From b0c013e2928d3696ceb6401311dbc1d7fcccd6dd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:30:41 -0400 Subject: ext4: add a new ioctl EXT4_IOC_CLEAR_ES_CACHE The new ioctl EXT4_IOC_CLEAR_ES_CACHE will force an inode's extent status cache to be cleared out. This is intended for use for debugging. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf660aa7a9e0..b22f24f1d365 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -649,6 +649,8 @@ enum { #define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR -- cgit v1.2.3 From 1ad3ea6e0a694b0486eb2cbe60378ad0fbf23642 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:31:41 -0400 Subject: ext4: add a new ioctl EXT4_IOC_GETSTATE The new ioctl EXT4_IOC_GETSTATE returns some of the dynamic state of an ext4 inode for debugging purposes. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 11 +++++++++++ fs/ext4/ioctl.c | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b22f24f1d365..ee296797bcd2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -651,6 +651,7 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -664,6 +665,16 @@ enum { #define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 15b1047878ab..ffb7bde4900d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1123,6 +1123,22 @@ resizefs_out: return 0; } + case EXT4_IOC_GETSTATE: + { + __u32 state = 0; + + if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) + state |= EXT4_STATE_FLAG_EXT_PRECACHED; + if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + state |= EXT4_STATE_FLAG_NEW; + if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) + state |= EXT4_STATE_FLAG_NEWENTRY; + if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) + state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE; + + return put_user(state, (__u32 __user *) arg); + } + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1242,6 +1258,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: case EXT4_IOC_CLEAR_ES_CACHE: + case EXT4_IOC_GETSTATE: break; default: return -ENOIOCTLCMD; -- cgit v1.2.3 From bb5835edcdf8bf78bbe51cff13e332c439bc0567 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 11 Aug 2019 16:32:41 -0400 Subject: ext4: add new ioctl EXT4_IOC_GET_ES_CACHE For debugging reasons, it's useful to know the contents of the extent cache. Since the extent cache contains much of what is in the fiemap ioctl, use an fiemap-style interface to return this information. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 10 ++++++ fs/ext4/extents.c | 94 +++++++++++++++++++++++++++++++++++++++++++----- fs/ext4/extents_status.c | 10 ++++++ fs/ext4/extents_status.h | 1 + fs/ext4/inode.c | 6 ++-- fs/ext4/ioctl.c | 72 +++++++++++++++++++++++++++++++++++++ 6 files changed, 182 insertions(+), 11 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ee296797bcd2..e2d8ad27f4d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -652,6 +652,7 @@ enum { /* ioctl codes 19--39 are reserved for fscrypt */ #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) #define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR @@ -692,6 +693,12 @@ enum { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. + */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF @@ -3258,6 +3265,9 @@ extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +extern int ext4_get_es_cache(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 92266a2da7d6..0620d495fd8a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2315,6 +2315,52 @@ static int ext4_fill_fiemap_extents(struct inode *inode, return err; } +static int ext4_fill_es_cache_info(struct inode *inode, + ext4_lblk_t block, ext4_lblk_t num, + struct fiemap_extent_info *fieinfo) +{ + ext4_lblk_t next, end = block + num - 1; + struct extent_status es; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; + unsigned int flags; + int err; + + while (block <= end) { + next = 0; + flags = 0; + if (!ext4_es_lookup_extent(inode, block, &next, &es)) + break; + if (ext4_es_is_unwritten(&es)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + if (ext4_es_is_delayed(&es)) + flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + if (ext4_es_is_hole(&es)) + flags |= EXT4_FIEMAP_EXTENT_HOLE; + if (next == 0) + flags |= FIEMAP_EXTENT_LAST; + if (flags & (FIEMAP_EXTENT_DELALLOC| + EXT4_FIEMAP_EXTENT_HOLE)) + es.es_pblk = 0; + else + es.es_pblk = ext4_es_pblock(&es); + err = fiemap_fill_next_extent(fieinfo, + (__u64)es.es_lblk << blksize_bits, + (__u64)es.es_pblk << blksize_bits, + (__u64)es.es_len << blksize_bits, + flags); + if (next == 0) + break; + block = next; + if (err < 0) + return err; + if (err == 1) + return 0; + } + return 0; +} + + /* * ext4_ext_determine_hole - determine hole around given block * @inode: inode we lookup in @@ -5017,8 +5063,6 @@ static int ext4_find_delayed_extent(struct inode *inode, return next_del; } -/* fiemap flags we can handle specified here */ -#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) static int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) @@ -5055,10 +5099,16 @@ static int ext4_xattr_fiemap(struct inode *inode, return (error < 0 ? error : 0); } -int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) +static int _ext4_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, + int (*fill)(struct inode *, ext4_lblk_t, + ext4_lblk_t, + struct fiemap_extent_info *)) { ext4_lblk_t start_blk; + u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR; + int error = 0; if (ext4_has_inline_data(inode)) { @@ -5075,14 +5125,18 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_ext_precache(inode); if (error) return error; + fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } /* fallback to generic here if not in extents fmt */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && + fill == ext4_fill_fiemap_extents) return generic_block_fiemap(inode, fieinfo, start, len, ext4_get_block); - if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + if (fill == ext4_fill_es_cache_info) + ext4_fiemap_flags &= FIEMAP_FLAG_XATTR; + if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) return -EBADR; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { @@ -5101,12 +5155,36 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information * and pushing extents back to the user. */ - error = ext4_fill_fiemap_extents(inode, start_blk, - len_blks, fieinfo); + error = fill(inode, start_blk, len_blks, fieinfo); } return error; } +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + return _ext4_fiemap(inode, fieinfo, start, len, + ext4_fill_fiemap_extents); +} + +int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + if (ext4_has_inline_data(inode)) { + int has_inline; + + down_read(&EXT4_I(inode)->xattr_sem); + has_inline = ext4_has_inline_data(inode); + up_read(&EXT4_I(inode)->xattr_sem); + if (has_inline) + return 0; + } + + return _ext4_fiemap(inode, fieinfo, start, len, + ext4_fill_es_cache_info); +} + + /* * ext4_access_path: * Function to access the path buffer for marking it dirty. diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 02cc8eb3eb0e..a959adc59bcd 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -899,6 +899,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, * Return: 1 on found, 0 on not */ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, struct extent_status *es) { struct ext4_es_tree *tree; @@ -948,6 +949,15 @@ out: if (!ext4_es_is_referenced(es1)) ext4_es_set_referenced(es1); stats->es_stats_cache_hits++; + if (next_lblk) { + node = rb_next(&es1->rb_node); + if (node) { + es1 = rb_entry(node, struct extent_status, + rb_node); + *next_lblk = es1->es_lblk; + } else + *next_lblk = 0; + } } else { stats->es_stats_cache_misses++; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index e16785f431e7..eb56a1289031 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -140,6 +140,7 @@ extern void ext4_es_find_extent_range(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t *next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a6523516d681..4b92c7603907 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -527,7 +527,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, return -EFSCORRUPTED; /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -695,7 +695,7 @@ found: * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && - ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) goto out_sem; } @@ -1868,7 +1868,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ - if (ext4_es_lookup_extent(inode, iblock, &es)) { + if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ffb7bde4900d..d6242b7b8718 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -745,6 +745,74 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); } +/* copied from fs/ioctl.c */ +static int fiemap_check_ranges(struct super_block *sb, + u64 start, u64 len, u64 *new_len) +{ + u64 maxbytes = (u64) sb->s_maxbytes; + + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (len > maxbytes || (maxbytes - len) < start) + *new_len = maxbytes - start; + + return 0; +} + +/* So that the fiemap access checks can't overflow on 32 bit machines. */ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + +static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap __user *ufiemap = (struct fiemap __user *) arg; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + u64 len; + int error; + + if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, + &len); + if (error) + return error; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = ufiemap->fm_extents; + + if (fiemap.fm_extent_count != 0 && + !access_ok(fieinfo.fi_extents_start, + fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) + return -EFAULT; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1139,6 +1207,9 @@ resizefs_out: return put_user(state, (__u32 __user *) arg); } + case EXT4_IOC_GET_ES_CACHE: + return ext4_ioctl_get_es_cache(filp, arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -1259,6 +1330,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_GETFSMAP: case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: + case EXT4_IOC_GET_ES_CACHE: break; default: return -ENOIOCTLCMD; -- cgit v1.2.3 From cd2d99229dc96219547e6349841e1aad851c6acc Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 12 Aug 2019 13:44:49 -0400 Subject: ext4: drop legacy pre-1970 encoding workaround Originally, support for expanded timestamps had a bug in that pre-1970 times were erroneously encoded as being in the the 24th century. This was fixed in commit a4dad1ae24f8 ("ext4: Fix handling of extended tv_sec") which landed in 4.4. Starting with 4.4, pre-1970 timestamps were correctly encoded, but for backwards compatibility those incorrectly encoded timestamps were mapped back to the pre-1970 dates. Given that backwards compatibility workaround has been around for 4 years, and given that running e2fsck from e2fsprogs 1.43.2 and later will offer to fix these timestamps (which has been released for 3 years), it's past time to drop the legacy workaround from the kernel. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e2d8ad27f4d1..17cc2dc13174 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -828,21 +828,8 @@ static inline __le32 ext4_encode_extra_time(struct timespec64 *time) static inline void ext4_decode_extra_time(struct timespec64 *time, __le32 extra) { - if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) { - -#if 1 - /* Handle legacy encoding of pre-1970 dates with epoch - * bits 1,1. (This backwards compatibility may be removed - * at the discretion of the ext4 developers.) - */ - u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; - if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) - extra_bits = 0; - time->tv_sec += extra_bits << 32; -#else + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; -#endif - } time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; } -- cgit v1.2.3 From 7963e5ac901251c7a3b36fe7c987623a3f309393 Mon Sep 17 00:00:00 2001 From: ZhangXiaoxu Date: Thu, 22 Aug 2019 23:00:32 -0400 Subject: ext4: treat buffers with write errors as containing valid data I got some errors when I repair an ext4 volume which stacked by an iscsi target: Entry 'test60' in / (2) has deleted/unused inode 73750. Clear? It can be reproduced when the network not good enough. When I debug this I found ext4 will read entry buffer from disk and the buffer is marked with write_io_error. If the buffer is marked with write_io_error, it means it already wroten to journal, and not checked out to disk. IOW, the journal is newer than the data in disk. If this journal record 'delete test60', it means the 'test60' still on the disk metadata. In this case, if we read the buffer from disk successfully and create file continue, the new journal record will overwrite the journal which record 'delete test60', then the entry corruptioned. So, use the buffer rather than read from disk if the buffer is marked with write_io_error. Signed-off-by: Zhang Xiaoxu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 13 +++++++++++++ fs/ext4/inode.c | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 17cc2dc13174..2348be3d66b7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3344,6 +3344,19 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) extern const struct iomap_ops ext4_iomap_ops; +static inline int ext4_buffer_uptodate(struct buffer_head *bh) +{ + /* + * If the buffer has the write error flag, we have failed + * to write out data in the block. In this case, we don't + * have to read the block because we may read the old data + * successfully. + */ + if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + set_buffer_uptodate(bh); + return buffer_uptodate(bh); +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4b92c7603907..9db896fc6af8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1024,7 +1024,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; - if (!bh || buffer_uptodate(bh)) + if (!bh || ext4_buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); @@ -1051,7 +1051,7 @@ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ - if (bhs[i] && !buffer_uptodate(bhs[i])) + if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bhs[i]); -- cgit v1.2.3 From 8fcc3a580651cceb94a9f48e1914491400d5146b Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 22 Aug 2019 23:22:14 -0400 Subject: ext4: rework reserved cluster accounting when invalidating pages The goal of this patch is to remove two references to the buffer delay bit in ext4_da_page_release_reservation() as part of a larger effort to remove all such references from ext4. These two references are principally used to reduce the reserved block/cluster count when pages are invalidated as a result of truncating, punching holes, or collapsing a block range in a file. The entire function is removed and replaced with code in ext4_es_remove_extent() that reduces the reserved count as a side effect of removing a block range from delayed and not unwritten extents in the extent status tree as is done when truncating, punching holes, or collapsing ranges. The code is written to minimize the number of searches descending from rb tree roots for scalability. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 + fs/ext4/extents_status.c | 446 ++++++++++++++++++++++++++++++++++++----------- fs/ext4/extents_status.h | 2 - fs/ext4/inode.c | 63 +------ 4 files changed, 353 insertions(+), 161 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2348be3d66b7..0664c43cc9dc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -284,6 +284,9 @@ struct ext4_io_submit { ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) +/* Fill in the low bits to get the last block of the cluster */ +#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ + ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index a959adc59bcd..5efbb116fba0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -146,7 +146,7 @@ static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end); + ext4_lblk_t end, int *reserved); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); @@ -836,7 +836,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end); + err = __es_remove_extent(inode, lblk, end, NULL); if (err != 0) goto error; retry: @@ -968,8 +968,322 @@ out: return found; } +struct rsvd_count { + int ndelonly; + bool first_do_lblk_found; + ext4_lblk_t first_do_lblk; + ext4_lblk_t last_do_lblk; + struct extent_status *left_es; + bool partial; + ext4_lblk_t lclu; +}; + +/* + * init_rsvd - initialize reserved count data before removing block range + * in file from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @es - pointer to first extent in range + * @rc - pointer to reserved count data + * + * Assumes es is not NULL + */ +static void init_rsvd(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct rb_node *node; + + rc->ndelonly = 0; + + /* + * for bigalloc, note the first delonly block in the range has not + * been found, record the extent containing the block to the left of + * the region to be removed, if any, and note that there's no partial + * cluster to track + */ + if (sbi->s_cluster_ratio > 1) { + rc->first_do_lblk_found = false; + if (lblk > es->es_lblk) { + rc->left_es = es; + } else { + node = rb_prev(&es->rb_node); + rc->left_es = node ? rb_entry(node, + struct extent_status, + rb_node) : NULL; + } + rc->partial = false; + } +} + +/* + * count_rsvd - count the clusters containing delayed and not unwritten + * (delonly) blocks in a range within an extent and add to + * the running tally in rsvd_count + * + * @inode - file containing extent + * @lblk - first block in range + * @len - length of range in blocks + * @es - pointer to extent containing clusters to be counted + * @rc - pointer to reserved count data + * + * Tracks partial clusters found at the beginning and end of extents so + * they aren't overcounted when they span adjacent extents + */ +static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, + struct extent_status *es, struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t i, end, nclu; + + if (!ext4_es_is_delonly(es)) + return; + + WARN_ON(len <= 0); + + if (sbi->s_cluster_ratio == 1) { + rc->ndelonly += (int) len; + return; + } + + /* bigalloc */ + + i = (lblk < es->es_lblk) ? es->es_lblk : lblk; + end = lblk + (ext4_lblk_t) len - 1; + end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; + + /* record the first block of the first delonly extent seen */ + if (rc->first_do_lblk_found == false) { + rc->first_do_lblk = i; + rc->first_do_lblk_found = true; + } + + /* update the last lblk in the region seen so far */ + rc->last_do_lblk = end; + + /* + * if we're tracking a partial cluster and the current extent + * doesn't start with it, count it and stop tracking + */ + if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) { + rc->ndelonly++; + rc->partial = false; + } + + /* + * if the first cluster doesn't start on a cluster boundary but + * ends on one, count it + */ + if (EXT4_LBLK_COFF(sbi, i) != 0) { + if (end >= EXT4_LBLK_CFILL(sbi, i)) { + rc->ndelonly++; + rc->partial = false; + i = EXT4_LBLK_CFILL(sbi, i) + 1; + } + } + + /* + * if the current cluster starts on a cluster boundary, count the + * number of whole delonly clusters in the extent + */ + if ((i + sbi->s_cluster_ratio - 1) <= end) { + nclu = (end - i + 1) >> sbi->s_cluster_bits; + rc->ndelonly += nclu; + i += nclu << sbi->s_cluster_bits; + } + + /* + * start tracking a partial cluster if there's a partial at the end + * of the current extent and we're not already tracking one + */ + if (!rc->partial && i <= end) { + rc->partial = true; + rc->lclu = EXT4_B2C(sbi, i); + } +} + +/* + * __pr_tree_search - search for a pending cluster reservation + * + * @root - root of pending reservation tree + * @lclu - logical cluster to search for + * + * Returns the pending reservation for the cluster identified by @lclu + * if found. If not, returns a reservation for the next cluster if any, + * and if not, returns NULL. + */ +static struct pending_reservation *__pr_tree_search(struct rb_root *root, + ext4_lblk_t lclu) +{ + struct rb_node *node = root->rb_node; + struct pending_reservation *pr = NULL; + + while (node) { + pr = rb_entry(node, struct pending_reservation, rb_node); + if (lclu < pr->lclu) + node = node->rb_left; + else if (lclu > pr->lclu) + node = node->rb_right; + else + return pr; + } + if (pr && lclu < pr->lclu) + return pr; + if (pr && lclu > pr->lclu) { + node = rb_next(&pr->rb_node); + return node ? rb_entry(node, struct pending_reservation, + rb_node) : NULL; + } + return NULL; +} + +/* + * get_rsvd - calculates and returns the number of cluster reservations to be + * released when removing a block range from the extent status tree + * and releases any pending reservations within the range + * + * @inode - file containing block range + * @end - last block in range + * @right_es - pointer to extent containing next block beyond end or NULL + * @rc - pointer to reserved count data + * + * The number of reservations to be released is equal to the number of + * clusters containing delayed and not unwritten (delonly) blocks within + * the range, minus the number of clusters still containing delonly blocks + * at the ends of the range, and minus the number of pending reservations + * within the range. + */ +static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, + struct extent_status *right_es, + struct rsvd_count *rc) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct pending_reservation *pr; + struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; + struct rb_node *node; + ext4_lblk_t first_lclu, last_lclu; + bool left_delonly, right_delonly, count_pending; + struct extent_status *es; + + if (sbi->s_cluster_ratio > 1) { + /* count any remaining partial cluster */ + if (rc->partial) + rc->ndelonly++; + + if (rc->ndelonly == 0) + return 0; + + first_lclu = EXT4_B2C(sbi, rc->first_do_lblk); + last_lclu = EXT4_B2C(sbi, rc->last_do_lblk); + + /* + * decrease the delonly count by the number of clusters at the + * ends of the range that still contain delonly blocks - + * these clusters still need to be reserved + */ + left_delonly = right_delonly = false; + + es = rc->left_es; + while (es && ext4_es_end(es) >= + EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) { + if (ext4_es_is_delonly(es)) { + rc->ndelonly--; + left_delonly = true; + break; + } + node = rb_prev(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, rb_node); + } + if (right_es && (!left_delonly || first_lclu != last_lclu)) { + if (end < ext4_es_end(right_es)) { + es = right_es; + } else { + node = rb_next(&right_es->rb_node); + es = node ? rb_entry(node, struct extent_status, + rb_node) : NULL; + } + while (es && es->es_lblk <= + EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) { + if (ext4_es_is_delonly(es)) { + rc->ndelonly--; + right_delonly = true; + break; + } + node = rb_next(&es->rb_node); + if (!node) + break; + es = rb_entry(node, struct extent_status, + rb_node); + } + } + + /* + * Determine the block range that should be searched for + * pending reservations, if any. Clusters on the ends of the + * original removed range containing delonly blocks are + * excluded. They've already been accounted for and it's not + * possible to determine if an associated pending reservation + * should be released with the information available in the + * extents status tree. + */ + if (first_lclu == last_lclu) { + if (left_delonly | right_delonly) + count_pending = false; + else + count_pending = true; + } else { + if (left_delonly) + first_lclu++; + if (right_delonly) + last_lclu--; + if (first_lclu <= last_lclu) + count_pending = true; + else + count_pending = false; + } + + /* + * a pending reservation found between first_lclu and last_lclu + * represents an allocated cluster that contained at least one + * delonly block, so the delonly total must be reduced by one + * for each pending reservation found and released + */ + if (count_pending) { + pr = __pr_tree_search(&tree->root, first_lclu); + while (pr && pr->lclu <= last_lclu) { + rc->ndelonly--; + node = rb_next(&pr->rb_node); + rb_erase(&pr->rb_node, &tree->root); + kmem_cache_free(ext4_pending_cachep, pr); + if (!node) + break; + pr = rb_entry(node, struct pending_reservation, + rb_node); + } + } + } + return rc->ndelonly; +} + + +/* + * __es_remove_extent - removes block range from extent status tree + * + * @inode - file containing range + * @lblk - first block in range + * @end - last block in range + * @reserved - number of cluster reservations released + * + * If @reserved is not NULL and delayed allocation is enabled, counts + * block/cluster reservations freed by removing range and if bigalloc + * enabled cancels pending reservations as needed. Returns 0 on success, + * error code on failure. + */ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end) + ext4_lblk_t end, int *reserved) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node *node; @@ -978,9 +1292,14 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len1, len2; ext4_fsblk_t block; int err; + bool count_reserved = true; + struct rsvd_count rc; + if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) + count_reserved = false; retry: err = 0; + es = __es_tree_search(&tree->root, lblk); if (!es) goto out; @@ -989,6 +1308,8 @@ retry: /* Simply invalidate cache_es. */ tree->cache_es = NULL; + if (count_reserved) + init_rsvd(inode, lblk, es, &rc); orig_es.es_lblk = es->es_lblk; orig_es.es_len = es->es_len; @@ -1030,10 +1351,16 @@ retry: ext4_es_store_pblock(es, block); } } + if (count_reserved) + count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, + &orig_es, &rc); goto out; } if (len1 > 0) { + if (count_reserved) + count_rsvd(inode, lblk, orig_es.es_len - len1, + &orig_es, &rc); node = rb_next(&es->rb_node); if (node) es = rb_entry(node, struct extent_status, rb_node); @@ -1042,6 +1369,8 @@ retry: } while (es && ext4_es_end(es) <= end) { + if (count_reserved) + count_rsvd(inode, es->es_lblk, es->es_len, es, &rc); node = rb_next(&es->rb_node); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); @@ -1056,6 +1385,9 @@ retry: ext4_lblk_t orig_len = es->es_len; len1 = ext4_es_end(es) - end; + if (count_reserved) + count_rsvd(inode, es->es_lblk, orig_len - len1, + es, &rc); es->es_lblk = end + 1; es->es_len = len1; if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { @@ -1064,20 +1396,28 @@ retry: } } + if (count_reserved) + *reserved = get_rsvd(inode, end, es, &rc); out: return err; } /* - * ext4_es_remove_extent() removes a space from a extent status tree. + * ext4_es_remove_extent - removes block range from extent status tree * - * Return 0 on success, error code on failure. + * @inode - file containing range + * @lblk - first block in range + * @len - number of blocks to remove + * + * Reduces block/cluster reservation count and for bigalloc cancels pending + * reservations as needed. Returns 0 on success, error code on failure. */ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { ext4_lblk_t end; int err = 0; + int reserved = 0; trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", @@ -1095,9 +1435,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end); + err = __es_remove_extent(inode, lblk, end, &reserved); write_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_print_tree(inode); + ext4_da_release_space(inode, reserved); return err; } @@ -1327,6 +1668,7 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); if (!es) goto out_wrap; + while (*nr_to_scan > 0) { if (es->es_lblk > end) { ei->i_es_shrink_lblk = end + 1; @@ -1628,7 +1970,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, lblk); + err = __es_remove_extent(inode, lblk, lblk, NULL); if (err != 0) goto error; retry: @@ -1817,93 +2159,3 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, __remove_pending(inode, last); } } - -/* - * ext4_es_remove_blks - remove block range from extents status tree and - * reduce reservation count or cancel pending - * reservation as needed - * - * @inode - file containing range - * @lblk - first block in range - * @len - number of blocks to remove - * - */ -void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) -{ - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned int clu_size, reserved = 0; - ext4_lblk_t last_lclu, first, length, remainder, last; - bool delonly; - int err = 0; - struct pending_reservation *pr; - struct ext4_pending_tree *tree; - - /* - * Process cluster by cluster for bigalloc - there may be up to - * two clusters in a 4k page with a 1k block size and two blocks - * per cluster. Also necessary for systems with larger page sizes - * and potentially larger block sizes. - */ - clu_size = sbi->s_cluster_ratio; - last_lclu = EXT4_B2C(sbi, lblk + len - 1); - - write_lock(&EXT4_I(inode)->i_es_lock); - - for (first = lblk, remainder = len; - remainder > 0; - first += length, remainder -= length) { - - if (EXT4_B2C(sbi, first) == last_lclu) - length = remainder; - else - length = clu_size - EXT4_LBLK_COFF(sbi, first); - - /* - * The BH_Delay flag, which triggers calls to this function, - * and the contents of the extents status tree can be - * inconsistent due to writepages activity. So, note whether - * the blocks to be removed actually belong to an extent with - * delayed only status. - */ - delonly = __es_scan_clu(inode, &ext4_es_is_delonly, first); - - /* - * because of the writepages effect, written and unwritten - * blocks could be removed here - */ - last = first + length - 1; - err = __es_remove_extent(inode, first, last); - if (err) - ext4_warning(inode->i_sb, - "%s: couldn't remove page (err = %d)", - __func__, err); - - /* non-bigalloc case: simply count the cluster for release */ - if (sbi->s_cluster_ratio == 1 && delonly) { - reserved++; - continue; - } - - /* - * bigalloc case: if all delayed allocated only blocks have - * just been removed from a cluster, either cancel a pending - * reservation if it exists or count a cluster for release - */ - if (delonly && - !__es_scan_clu(inode, &ext4_es_is_delonly, first)) { - pr = __get_pending(inode, EXT4_B2C(sbi, first)); - if (pr != NULL) { - tree = &EXT4_I(inode)->i_pending_tree; - rb_erase(&pr->rb_node, &tree->root); - kmem_cache_free(ext4_pending_cachep, pr); - } else { - reserved++; - } - } - } - - write_unlock(&EXT4_I(inode)->i_es_lock); - - ext4_da_release_space(inode, reserved); -} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index eb56a1289031..5e5c4a40d863 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -247,8 +247,6 @@ extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9db896fc6af8..2b1c58da8d1e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1646,49 +1646,6 @@ void ext4_da_release_space(struct inode *inode, int to_free) dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } -static void ext4_da_page_release_reservation(struct page *page, - unsigned int offset, - unsigned int length) -{ - int contiguous_blks = 0; - struct buffer_head *head, *bh; - unsigned int curr_off = 0; - struct inode *inode = page->mapping->host; - unsigned int stop = offset + length; - ext4_fsblk_t lblk; - - BUG_ON(stop > PAGE_SIZE || stop < length); - - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - - if (next_off > stop) - break; - - if ((offset <= curr_off) && (buffer_delay(bh))) { - contiguous_blks++; - clear_buffer_delay(bh); - } else if (contiguous_blks) { - lblk = page->index << - (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - contiguous_blks = 0; - } - curr_off = next_off; - } while ((bh = bh->b_this_page) != head); - - if (contiguous_blks) { - lblk = page->index << (PAGE_SHIFT - inode->i_blkbits); - lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; - ext4_es_remove_blks(inode, lblk, contiguous_blks); - } - -} - /* * Delayed allocation stuff */ @@ -3227,24 +3184,6 @@ static int ext4_da_write_end(struct file *file, return ret ? ret : copied; } -static void ext4_da_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - /* - * Drop reserved blocks - */ - BUG_ON(!PageLocked(page)); - if (!page_has_buffers(page)) - goto out; - - ext4_da_page_release_reservation(page, offset, length); - -out: - ext4_invalidatepage(page, offset, length); - - return; -} - /* * Force all delayed allocation blocks to be allocated for a given inode. */ @@ -3985,7 +3924,7 @@ static const struct address_space_operations ext4_da_aops = { .write_end = ext4_da_write_end, .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, - .invalidatepage = ext4_da_invalidatepage, + .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, .direct_IO = ext4_direct_IO, .migratepage = buffer_migrate_page, -- cgit v1.2.3 From 7727ae52975d4f4ef7ff69ed8e6e25f6a4168158 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Wed, 28 Aug 2019 11:13:24 -0400 Subject: ext4: fix potential use after free after remounting with noblock_validity Remount process will release system zone which was allocated before if "noblock_validity" is specified. If we mount an ext4 file system to two mountpoints with default mount options, and then remount one of them with "noblock_validity", it may trigger a use after free problem when someone accessing the other one. # mount /dev/sda foo # mount /dev/sda bar User access mountpoint "foo" | Remount mountpoint "bar" | ext4_map_blocks() | ext4_remount() check_block_validity() | ext4_setup_system_zone() ext4_data_block_valid() | ext4_release_system_zone() | free system_blks rb nodes access system_blks rb nodes | trigger use after free | This problem can also be reproduced by one mountpint, At the same time, add_system_zone() can get called during remount as well so there can be racing ext4_data_block_valid() reading the rbtree at the same time. This patch add RCU to protect system zone from releasing or building when doing a remount which inverse current "noblock_validity" mount option. It assign the rbtree after the whole tree was complete and do actual freeing after rcu grace period, avoid any intermediate state. Reported-by: syzbot+1e470567330b7ad711d5@syzkaller.appspotmail.com Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/block_validity.c | 189 ++++++++++++++++++++++++++++++++++------------- fs/ext4/ext4.h | 10 ++- 2 files changed, 147 insertions(+), 52 deletions(-) (limited to 'fs/ext4/ext4.h') diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 8e83741b02e0..d4d4fdfac1a6 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -38,6 +38,7 @@ int __init ext4_init_system_zone(void) void ext4_exit_system_zone(void) { + rcu_barrier(); kmem_cache_destroy(ext4_system_zone_cachep); } @@ -49,17 +50,26 @@ static inline int can_merge(struct ext4_system_zone *entry1, return 0; } +static void release_system_zone(struct ext4_system_blocks *system_blks) +{ + struct ext4_system_zone *entry, *n; + + rbtree_postorder_for_each_entry_safe(entry, n, + &system_blks->root, node) + kmem_cache_free(ext4_system_zone_cachep, entry); +} + /* * Mark a range of blocks as belonging to the "system zone" --- that * is, filesystem metadata blocks which should never be used by * inodes. */ -static int add_system_zone(struct ext4_sb_info *sbi, +static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_system_zone *new_entry = NULL, *entry; - struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL; while (*n) { @@ -91,7 +101,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, new_node = &new_entry->node; rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &sbi->system_blks); + rb_insert_color(new_node, &system_blks->root); } /* Can we merge to the left? */ @@ -101,7 +111,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, if (can_merge(entry, new_entry)) { new_entry->start_blk = entry->start_blk; new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); + rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } @@ -112,7 +122,7 @@ static int add_system_zone(struct ext4_sb_info *sbi, entry = rb_entry(node, struct ext4_system_zone, node); if (can_merge(new_entry, entry)) { new_entry->count += entry->count; - rb_erase(node, &sbi->system_blks); + rb_erase(node, &system_blks->root); kmem_cache_free(ext4_system_zone_cachep, entry); } } @@ -126,7 +136,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) int first = 1; printk(KERN_INFO "System zones: "); - node = rb_first(&sbi->system_blks); + node = rb_first(&sbi->system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", @@ -137,7 +147,47 @@ static void debug_print_tree(struct ext4_sb_info *sbi) printk(KERN_CONT "\n"); } -static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. + */ +static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, + struct ext4_system_blocks *system_blks, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + + if (system_blks == NULL) + return 1; + + n = system_blks->root.rb_node; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + } + } + return 1; +} + +static int ext4_protect_reserved_inode(struct super_block *sb, + struct ext4_system_blocks *system_blks, + u32 ino) { struct inode *inode; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -163,14 +213,15 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) if (n == 0) { i++; } else { - if (!ext4_data_block_valid(sbi, map.m_pblk, n)) { + if (!ext4_data_block_valid_rcu(sbi, system_blks, + map.m_pblk, n)) { ext4_error(sb, "blocks %llu-%llu from inode %u " "overlap system zone", map.m_pblk, map.m_pblk + map.m_len - 1, ino); err = -EFSCORRUPTED; break; } - err = add_system_zone(sbi, map.m_pblk, n); + err = add_system_zone(system_blks, map.m_pblk, n); if (err < 0) break; i += n; @@ -180,94 +231,130 @@ static int ext4_protect_reserved_inode(struct super_block *sb, u32 ino) return err; } +static void ext4_destroy_system_zone(struct rcu_head *rcu) +{ + struct ext4_system_blocks *system_blks; + + system_blks = container_of(rcu, struct ext4_system_blocks, rcu); + release_system_zone(system_blks); + kfree(system_blks); +} + +/* + * Build system zone rbtree which is used for block validity checking. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_data_block_valid() calls reading system_blks rbtree + * protected only by RCU. That's why we first build the rbtree and then + * swap it in place. + */ int ext4_setup_system_zone(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; int flex_size = ext4_flex_bg_size(sbi); int ret; if (!test_opt(sb, BLOCK_VALIDITY)) { - if (sbi->system_blks.rb_node) + if (sbi->system_blks) ext4_release_system_zone(sb); return 0; } - if (sbi->system_blks.rb_node) + if (sbi->system_blks) return 0; + system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); + if (!system_blks) + return -ENOMEM; + for (i=0; i < ngroups; i++) { cond_resched(); if (ext4_bg_has_super(sb, i) && ((i < 5) || ((i % flex_size) == 0))) - add_system_zone(sbi, ext4_group_first_block_no(sb, i), + add_system_zone(system_blks, + ext4_group_first_block_no(sb, i), ext4_bg_num_gdb(sb, i) + 1); gdp = ext4_get_group_desc(sb, i, NULL); - ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + ret = add_system_zone(system_blks, + ext4_block_bitmap(sb, gdp), 1); if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + goto err; + ret = add_system_zone(system_blks, + ext4_inode_bitmap(sb, gdp), 1); if (ret) - return ret; - ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + goto err; + ret = add_system_zone(system_blks, + ext4_inode_table(sb, gdp), sbi->s_itb_per_group); if (ret) - return ret; + goto err; } if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) { - ret = ext4_protect_reserved_inode(sb, + ret = ext4_protect_reserved_inode(sb, system_blks, le32_to_cpu(sbi->s_es->s_journal_inum)); if (ret) - return ret; + goto err; } + /* + * System blks rbtree complete, announce it once to prevent racing + * with ext4_data_block_valid() accessing the rbtree at the same + * time. + */ + rcu_assign_pointer(sbi->system_blks, system_blks); + if (test_opt(sb, DEBUG)) debug_print_tree(sbi); return 0; +err: + release_system_zone(system_blks); + kfree(system_blks); + return ret; } -/* Called when the filesystem is unmounted */ +/* + * Called when the filesystem is unmounted or when remounting it with + * noblock_validity specified. + * + * The update of system_blks pointer in this function is protected by + * sb->s_umount semaphore. However we have to be careful as we can be + * racing with ext4_data_block_valid() calls reading system_blks rbtree + * protected only by RCU. So we first clear the system_blks pointer and + * then free the rbtree only after RCU grace period expires. + */ void ext4_release_system_zone(struct super_block *sb) { - struct ext4_system_zone *entry, *n; + struct ext4_system_blocks *system_blks; - rbtree_postorder_for_each_entry_safe(entry, n, - &EXT4_SB(sb)->system_blks, node) - kmem_cache_free(ext4_system_zone_cachep, entry); + system_blks = rcu_dereference_protected(EXT4_SB(sb)->system_blks, + lockdep_is_held(&sb->s_umount)); + rcu_assign_pointer(EXT4_SB(sb)->system_blks, NULL); - EXT4_SB(sb)->system_blks = RB_ROOT; + if (system_blks) + call_rcu(&system_blks->rcu, ext4_destroy_system_zone); } -/* - * Returns 1 if the passed-in block region (start_blk, - * start_blk+count) is valid; 0 if some part of the block region - * overlaps with filesystem metadata blocks. - */ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, unsigned int count) { - struct ext4_system_zone *entry; - struct rb_node *n = sbi->system_blks.rb_node; + struct ext4_system_blocks *system_blks; + int ret; - if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || - (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - while (n) { - entry = rb_entry(n, struct ext4_system_zone, node); - if (start_blk + count - 1 < entry->start_blk) - n = n->rb_left; - else if (start_blk >= (entry->start_blk + entry->count)) - n = n->rb_right; - else { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - } - return 1; + /* + * Lock the system zone to prevent it being released concurrently + * when doing a remount which inverse current "[no]block_validity" + * mount option. + */ + rcu_read_lock(); + system_blks = rcu_dereference(sbi->system_blks); + ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk, + count); + rcu_read_unlock(); + return ret; } int ext4_check_blockref(const char *function, unsigned int line, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0664c43cc9dc..c35bb8d734df 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -184,6 +184,14 @@ struct ext4_map_blocks { unsigned int m_flags; }; +/* + * Block validity checking, system zone rbtree. + */ +struct ext4_system_blocks { + struct rb_root root; + struct rcu_head rcu; +}; + /* * Flags for ext4_io_end->flags */ @@ -1431,7 +1439,7 @@ struct ext4_sb_info { int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - struct rb_root system_blks; + struct ext4_system_blocks __rcu *system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ -- cgit v1.2.3