From e142d05263a4beedefd331d445c394f4397e9f03 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 8 Mar 2016 22:44:50 -0500 Subject: ext4: use i_mutex to serialize unaligned AIO DIO Currently we've used hashed aio_mutex to serialize unaligned AIO DIO. However the code cleanups that happened after 2011 when the lock was introduced made aio_mutex acquired at almost the same places where we already have exclusion using i_mutex. So just use i_mutex for the exclusion of unaligned AIO DIO. The change moves waiting for pending unwritten extent conversion under i_mutex. That makes special handling of O_APPEND writes unnecessary and also avoids possible livelocking of unaligned AIO DIO with aligned one (nothing was preventing contiguous stream of aligned AIO DIOs to let unaligned AIO DIO wait forever). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 474f1a4d2ca8..4a1153561580 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(iocb->ki_filp); - struct mutex *aio_mutex = NULL; struct blk_plug plug; int o_direct = iocb->ki_flags & IOCB_DIRECT; + int unaligned_aio = 0; int overwrite = 0; ssize_t ret; + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + /* - * Unaligned direct AIO must be serialized; see comment above - * In the case of O_APPEND, assume that we must always serialize + * Unaligned direct AIO must be serialized among each other as zeroing + * of partial blocks of two competing unaligned AIOs can result in data + * corruption. */ - if (o_direct && - ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb) && - (iocb->ki_flags & IOCB_APPEND || - ext4_unaligned_aio(inode, from, iocb->ki_pos))) { - aio_mutex = ext4_aio_mutex(inode); - mutex_lock(aio_mutex); + ext4_unaligned_aio(inode, from, iocb->ki_pos)) { + unaligned_aio = 1; ext4_unwritten_wait(inode); } - inode_lock(inode); - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto out; - /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. @@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !aio_mutex && + if (ext4_should_dioread_nolock(inode) && !unaligned_aio && !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; @@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct) blk_finish_plug(&plug); - if (aio_mutex) - mutex_unlock(aio_mutex); return ret; out: inode_unlock(inode); - if (aio_mutex) - mutex_unlock(aio_mutex); return ret; } -- cgit v1.2.3 From 2d90c160e5f1d784e180f1e1458d56eee4d7f4f4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 9 Mar 2016 23:11:13 -0500 Subject: ext4: more efficient SEEK_DATA implementation Using SEEK_DATA in a huge sparse file can easily lead to sotflockups as ext4_seek_data() iterates hole block-by-block. Fix the problem by using returned hole size from ext4_map_blocks() and thus skip the hole in one go. Update also SEEK_HOLE implementation to follow the same pattern as SEEK_DATA to make future maintenance easier. Furthermore we add cond_resched() to both ext4_seek_data() and ext4_seek_hole() to avoid softlockups in case evil user creates huge fragmented file and we have to go through lots of extents. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++ fs/ext4/file.c | 97 +++++++++++++++++++++------------------------------------ fs/ext4/inode.c | 67 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 61 deletions(-) (limited to 'fs/ext4/file.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 70b8e0409566..5623eec7fd22 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2546,6 +2546,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, + struct extent_status *result); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4a1153561580..e93a7efaf78f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -426,7 +426,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) */ static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, - struct ext4_map_blocks *map, + ext4_lblk_t end_blk, loff_t *offset) { struct pagevec pvec; @@ -441,7 +441,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, blkbits = inode->i_sb->s_blocksize_bits; startoff = *offset; lastoff = startoff; - endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; + endoff = (loff_t)end_blk << blkbits; index = startoff >> PAGE_CACHE_SHIFT; end = endoff >> PAGE_CACHE_SHIFT; @@ -559,12 +559,11 @@ out: static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t dataoff, isize; int blkbits; - int ret = 0; + int ret; inode_lock(inode); @@ -581,41 +580,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - if (last != start) - dataoff = (loff_t)last << blkbits; - break; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret <= 0) { + /* No extent found -> no data */ + if (ret == 0) + ret = -ENXIO; + inode_unlock(inode); + return ret; } - /* - * If there is a delay extent at this offset, - * it will be as a data. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - if (last != start) - dataoff = (loff_t)last << blkbits; + last = es.es_lblk; + if (last != start) + dataoff = (loff_t)last << blkbits; + if (!ext4_es_is_unwritten(&es)) break; - } /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, - &map, &dataoff); - if (unwritten) - break; - } - - last++; + if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, + es.es_lblk + es.es_len, &dataoff)) + break; + last += es.es_len; dataoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); inode_unlock(inode); @@ -632,12 +622,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) { struct inode *inode = file->f_mapping->host; - struct ext4_map_blocks map; struct extent_status es; ext4_lblk_t start, last, end; loff_t holeoff, isize; int blkbits; - int ret = 0; + int ret; inode_lock(inode); @@ -654,44 +643,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) holeoff = offset; do { - map.m_lblk = last; - map.m_len = end - last + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; + ret = ext4_get_next_extent(inode, last, end - last + 1, &es); + if (ret < 0) { + inode_unlock(inode); + return ret; } - - /* - * If there is a delay extent at this offset, - * we will skip this extent. - */ - ext4_es_find_delayed_extent_range(inode, last, last, &es); - if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { - last = es.es_lblk + es.es_len; - holeoff = (loff_t)last << blkbits; - continue; + /* Found a hole? */ + if (ret == 0 || es.es_lblk > last) { + if (last != start) + holeoff = (loff_t)last << blkbits; + break; } - /* * If there is a unwritten extent at this offset, * it will be as a data or a hole according to page * cache that has data or not. */ - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int unwritten; - unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, - &map, &holeoff); - if (!unwritten) { - last += ret; - holeoff = (loff_t)last << blkbits; - continue; - } - } + if (ext4_es_is_unwritten(&es) && + ext4_find_unwritten_pgoff(inode, SEEK_HOLE, + last + es.es_len, &holeoff)) + break; - /* find a hole */ - break; + last += es.es_len; + holeoff = (loff_t)last << blkbits; + cond_resched(); } while (last <= end); inode_unlock(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fddc6ddc53a8..ce2c4c62386f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5596,3 +5596,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return err; } + +/* + * Find the first extent at or after @lblk in an inode that is not a hole. + * Search for @map_len blocks at most. The extent is returned in @result. + * + * The function returns 1 if we found an extent. The function returns 0 in + * case there is no extent at or after @lblk and in that case also sets + * @result->es_len to 0. In case of error, the error code is returned. + */ +int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, + unsigned int map_len, struct extent_status *result) +{ + struct ext4_map_blocks map; + struct extent_status es = {}; + int ret; + + map.m_lblk = lblk; + map.m_len = map_len; + + /* + * For non-extent based files this loop may iterate several times since + * we do not determine full hole size. + */ + while (map.m_len > 0) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + /* There's extent covering m_lblk? Just return it. */ + if (ret > 0) { + int status; + + ext4_es_store_pblock(result, map.m_pblk); + result->es_lblk = map.m_lblk; + result->es_len = map.m_len; + if (map.m_flags & EXT4_MAP_UNWRITTEN) + status = EXTENT_STATUS_UNWRITTEN; + else + status = EXTENT_STATUS_WRITTEN; + ext4_es_store_status(result, status); + return 1; + } + ext4_es_find_delayed_extent_range(inode, map.m_lblk, + map.m_lblk + map.m_len - 1, + &es); + /* Is delalloc data before next block in extent tree? */ + if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { + ext4_lblk_t offset = 0; + + if (es.es_lblk < lblk) + offset = lblk - es.es_lblk; + result->es_lblk = es.es_lblk + offset; + ext4_es_store_pblock(result, + ext4_es_pblock(&es) + offset); + result->es_len = es.es_len - offset; + ext4_es_store_status(result, ext4_es_status(&es)); + + return 1; + } + /* There's a hole at m_lblk, advance us after it */ + map.m_lblk += map.m_len; + map_len -= map.m_len; + map.m_len = map_len; + cond_resched(); + } + result->es_len = 0; + return 0; +} -- cgit v1.2.3