diff options
Diffstat (limited to 'fs')
206 files changed, 6755 insertions, 5063 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index baf8f91776f4..439ed81e755a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1537,7 +1537,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, { const struct cred *cred; unsigned int i, len; - + unsigned int state; + /* first copy the parameters from user space */ memset(psinfo, 0, sizeof(struct elf_prpsinfo)); @@ -1559,7 +1560,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); - i = p->state ? ffz(~p->state) + 1 : 0; + state = READ_ONCE(p->__state); + i = state ? ffz(~state) + 1 : 0; psinfo->pr_state = i; psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; psinfo->pr_zomb = psinfo->pr_sname == 'Z'; @@ -1571,7 +1573,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); - + return 0; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 39fa1b0307e1..cf4028487dcc 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1324,6 +1324,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, { const struct cred *cred; unsigned int i, len; + unsigned int state; /* first copy the parameters from user space */ memset(psinfo, 0, sizeof(struct elf_prpsinfo)); @@ -1346,7 +1347,8 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); - i = p->state ? ffz(~p->state) + 1 : 0; + state = READ_ONCE(p->__state); + i = state ? ffz(~state) + 1 : 0; psinfo->pr_state = i; psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; psinfo->pr_zomb = psinfo->pr_sname == 'Z'; diff --git a/fs/block_dev.c b/fs/block_dev.c index eb34f5c357cf..7e83c3e71504 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -895,7 +895,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); - mutex_init(&bdev->bd_mutex); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); bdev->bd_disk = disk; @@ -1154,7 +1153,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) struct bd_holder_disk *holder; int ret = 0; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); @@ -1199,7 +1198,7 @@ out_del: out_free: kfree(holder); out_unlock: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); @@ -1218,7 +1217,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); @@ -1230,138 +1229,97 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif -static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); +static void blkdev_flush_mapping(struct block_device *bdev) +{ + WARN_ON_ONCE(bdev->bd_holders); + sync_blockdev(bdev); + kill_bdev(bdev); + bdev_write_inode(bdev); +} -int bdev_disk_changed(struct block_device *bdev, bool invalidate) +static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; int ret = 0; - lockdep_assert_held(&bdev->bd_mutex); - - if (!(disk->flags & GENHD_FL_UP)) - return -ENXIO; - -rescan: - if (bdev->bd_part_count) - return -EBUSY; - sync_blockdev(bdev); - invalidate_bdev(bdev); - blk_drop_partitions(disk); - - clear_bit(GD_NEED_PART_SCAN, &disk->state); - - /* - * Historically we only set the capacity to zero for devices that - * support partitions (independ of actually having partitions created). - * Doing that is rather inconsistent, but changing it broke legacy - * udisks polling for legacy ide-cdrom devices. Use the crude check - * below to get the sane behavior for most device while not breaking - * userspace for this particular setup. - */ - if (invalidate) { - if (disk_part_scan_enabled(disk) || - !(disk->flags & GENHD_FL_REMOVABLE)) - set_capacity(disk, 0); + if (disk->fops->open) { + ret = disk->fops->open(bdev, mode); + if (ret) { + /* avoid ghost partitions on a removed medium */ + if (ret == -ENOMEDIUM && + test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(disk, true); + return ret; + } } - if (get_capacity(disk)) { - ret = blk_add_partitions(disk, bdev); - if (ret == -EAGAIN) - goto rescan; - } else if (invalidate) { - /* - * Tell userspace that the media / partition table may have - * changed. - */ - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + if (!bdev->bd_openers) { + set_init_blocksize(bdev); + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(disk, false); + bdev->bd_openers++; + return 0;; +} - return ret; +static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) +{ + if (!--bdev->bd_openers) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk, mode); } -/* - * Only exported for loop and dasd for historic reasons. Don't use in new - * code! - */ -EXPORT_SYMBOL_GPL(bdev_disk_changed); -/* - * bd_mutex locking: - * - * mutex_lock(part->bd_mutex) - * mutex_lock_nested(whole->bd_mutex, 1) - */ -static int __blkdev_get(struct block_device *bdev, fmode_t mode) +static int blkdev_get_part(struct block_device *part, fmode_t mode) { - struct gendisk *disk = bdev->bd_disk; - int ret = 0; + struct gendisk *disk = part->bd_disk; + struct block_device *whole; + int ret; - if (!(disk->flags & GENHD_FL_UP)) - return -ENXIO; + if (part->bd_openers) + goto done; - if (!bdev->bd_openers) { - if (!bdev_is_partition(bdev)) { - ret = 0; - if (disk->fops->open) - ret = disk->fops->open(bdev, mode); + whole = bdgrab(disk->part0); + ret = blkdev_get_whole(whole, mode); + if (ret) + goto out_put_whole; - if (!ret) - set_init_blocksize(bdev); + ret = -ENXIO; + if (!bdev_nr_sectors(part)) + goto out_blkdev_put; - /* - * If the device is invalidated, rescan partition - * if open succeeded or failed with -ENOMEDIUM. - * The latter is necessary to prevent ghost - * partitions on a removed medium. - */ - if (test_bit(GD_NEED_PART_SCAN, &disk->state) && - (!ret || ret == -ENOMEDIUM)) - bdev_disk_changed(bdev, ret == -ENOMEDIUM); + disk->open_partitions++; + set_init_blocksize(part); + if (part->bd_bdi == &noop_backing_dev_info) + part->bd_bdi = bdi_get(disk->queue->backing_dev_info); +done: + part->bd_openers++; + return 0; - if (ret) - return ret; - } else { - struct block_device *whole = bdgrab(disk->part0); - - mutex_lock_nested(&whole->bd_mutex, 1); - ret = __blkdev_get(whole, mode); - if (ret) { - mutex_unlock(&whole->bd_mutex); - bdput(whole); - return ret; - } - whole->bd_part_count++; - mutex_unlock(&whole->bd_mutex); +out_blkdev_put: + blkdev_put_whole(whole, mode); +out_put_whole: + bdput(whole); + return ret; +} - if (!bdev_nr_sectors(bdev)) { - __blkdev_put(whole, mode, 1); - bdput(whole); - return -ENXIO; - } - set_init_blocksize(bdev); - } +static void blkdev_put_part(struct block_device *part, fmode_t mode) +{ + struct block_device *whole = bdev_whole(part); - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); - } else { - if (!bdev_is_partition(bdev)) { - if (bdev->bd_disk->fops->open) - ret = bdev->bd_disk->fops->open(bdev, mode); - /* the same as first opener case, read comment there */ - if (test_bit(GD_NEED_PART_SCAN, &disk->state) && - (!ret || ret == -ENOMEDIUM)) - bdev_disk_changed(bdev, ret == -ENOMEDIUM); - if (ret) - return ret; - } - } - bdev->bd_openers++; - return 0; + if (--part->bd_openers) + return; + blkdev_flush_mapping(part); + whole->bd_disk->open_partitions--; + blkdev_put_whole(whole, mode); + bdput(whole); } struct block_device *blkdev_get_no_open(dev_t dev) @@ -1447,8 +1405,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) disk_block_events(disk); - mutex_lock(&bdev->bd_mutex); - ret =__blkdev_get(bdev, mode); + mutex_lock(&disk->open_mutex); + ret = -ENXIO; + if (!(disk->flags & GENHD_FL_UP)) + goto abort_claiming; + if (bdev_is_partition(bdev)) + ret = blkdev_get_part(bdev, mode); + else + ret = blkdev_get_whole(bdev, mode); if (ret) goto abort_claiming; if (mode & FMODE_EXCL) { @@ -1467,7 +1431,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) unblock_events = false; } } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); if (unblock_events) disk_unblock_events(disk); @@ -1476,7 +1440,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) abort_claiming: if (mode & FMODE_EXCL) bd_abort_claiming(bdev, holder); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); @@ -1551,10 +1515,9 @@ static int blkdev_open(struct inode * inode, struct file * filp) return 0; } -static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +void blkdev_put(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; - struct block_device *victim = NULL; /* * Sync early if it looks like we're the last one. If someone else @@ -1566,41 +1529,14 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (bdev->bd_openers == 1) sync_blockdev(bdev); - mutex_lock_nested(&bdev->bd_mutex, for_part); - if (for_part) - bdev->bd_part_count--; - - if (!--bdev->bd_openers) { - WARN_ON_ONCE(bdev->bd_holders); - sync_blockdev(bdev); - kill_bdev(bdev); - bdev_write_inode(bdev); - if (bdev_is_partition(bdev)) - victim = bdev_whole(bdev); - } - - if (!bdev_is_partition(bdev) && disk->fops->release) - disk->fops->release(disk, mode); - mutex_unlock(&bdev->bd_mutex); - if (victim) { - __blkdev_put(victim, mode, 1); - bdput(victim); - } -} - -void blkdev_put(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - - mutex_lock(&bdev->bd_mutex); - + mutex_lock(&disk->open_mutex); if (mode & FMODE_EXCL) { struct block_device *whole = bdev_whole(bdev); bool bdev_free; /* * Release a claim on the device. The holder fields - * are protected with bdev_lock. bd_mutex is to + * are protected with bdev_lock. open_mutex is to * synchronize disk_holder unlinking. */ spin_lock(&bdev_lock); @@ -1631,9 +1567,13 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * from userland - e.g. eject(1). */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - mutex_unlock(&bdev->bd_mutex); - __blkdev_put(bdev, mode, 0); + if (bdev_is_partition(bdev)) + blkdev_put_part(bdev, mode); + else + blkdev_put_whole(bdev, mode); + mutex_unlock(&disk->open_mutex); + blkdev_put_no_open(bdev); } EXPORT_SYMBOL(blkdev_put); @@ -1733,20 +1673,6 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) } EXPORT_SYMBOL_GPL(blkdev_read_iter); -/* - * Try to release a page associated with block device when the system - * is under memory pressure. - */ -static int blkdev_releasepage(struct page *page, gfp_t wait) -{ - struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; - - if (super && super->s_op->bdev_try_to_free_page) - return super->s_op->bdev_try_to_free_page(super, page, wait); - - return try_to_free_buffers(page); -} - static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1761,7 +1687,6 @@ static const struct address_space_operations def_blk_aops = { .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = blkdev_writepages, - .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, .migratepage = buffer_migrate_page_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, @@ -1941,10 +1866,10 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) old_inode = inode; bdev = I_BDEV(inode); - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (bdev->bd_openers) func(bdev, arg); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 68b95ad82126..520a0f6a7d9e 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -18,6 +18,8 @@ config BTRFS_FS select RAID6_PQ select XOR_BLOCKS select SRCU + depends on !PPC_256K_PAGES # powerpc + depends on !PAGE_SIZE_256KB # hexagon help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 117d423fdb93..7a8a2fc19533 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2675,7 +2675,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, * * @ref_key: The same as @ref_key in handle_direct_tree_backref() * @tree_key: The first key of this tree block. - * @path: A clean (released) path, to avoid allocating path everytime + * @path: A clean (released) path, to avoid allocating path every time * the function get called. */ static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6d5c4e45cfef..38b127b9edfc 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1399,7 +1399,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_space_info_update_bytes_pinned(fs_info, space_info, -block_group->pinned); space_info->bytes_readonly += block_group->pinned; - __btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned); block_group->pinned = 0; spin_unlock(&block_group->lock); @@ -1491,7 +1490,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; - int ret; + LIST_HEAD(again_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1502,6 +1501,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) mutex_lock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->unused_bgs_lock); while (!list_empty(&fs_info->reclaim_bgs)) { + int ret = 0; + bg = list_first_entry(&fs_info->reclaim_bgs, struct btrfs_block_group, bg_list); @@ -1547,9 +1548,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) bg->start); next: - btrfs_put_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); + if (ret == -EAGAIN && list_empty(&bg->bg_list)) + list_add_tail(&bg->bg_list, &again_list); + else + btrfs_put_block_group(bg); } + list_splice_tail(&again_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); @@ -2505,7 +2510,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group, struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; - u64 num_pages = 0; + u64 cache_size = 0; int retries = 0; int ret = 0; @@ -2617,20 +2622,20 @@ again: * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = div_u64(block_group->length, SZ_256M); - if (!num_pages) - num_pages = 1; + cache_size = div_u64(block_group->length, SZ_256M); + if (!cache_size) + cache_size = 1; - num_pages *= 16; - num_pages *= PAGE_SIZE; + cache_size *= 16; + cache_size *= fs_info->sectorsize; ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0, - num_pages); + cache_size); if (ret) goto out_put; - ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, - num_pages, num_pages, + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size, + cache_size, cache_size, &alloc_hint); /* * Our cache requires contiguous chunks so that we don't modify a bunch @@ -3062,8 +3067,6 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - __btrfs_mod_total_bytes_pinned(cache->space_info, - num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1346d698463a..9a023ae0f98b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -149,7 +149,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, const u32 csum_size = fs_info->csum_size; const u32 sectorsize = fs_info->sectorsize; struct page *page; - unsigned long i; + unsigned int i; char *kaddr; u8 csum[BTRFS_CSUM_SIZE]; struct compressed_bio *cb = bio->bi_private; @@ -208,7 +208,7 @@ static void end_compressed_bio_read(struct bio *bio) struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; - unsigned long index; + unsigned int index; unsigned int mirror = btrfs_io_bio(bio)->mirror_num; int ret = 0; @@ -334,7 +334,7 @@ static void end_compressed_bio_write(struct bio *bio) struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; - unsigned long index; + unsigned int index; if (bio->bi_status) cb->errors = 1; @@ -349,12 +349,10 @@ static void end_compressed_bio_write(struct bio *bio) * call back into the FS and do all the end_io operations */ inode = cb->inode; - cb->compressed_pages[0]->mapping = cb->inode->i_mapping; btrfs_record_physical_zoned(inode, cb->start, bio); - btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], + btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, bio->bi_status == BLK_STS_OK); - cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); /* note, our inode could be gone now */ @@ -387,10 +385,10 @@ out: * the end io hooks. */ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned long len, u64 disk_start, - unsigned long compressed_len, + unsigned int len, u64 disk_start, + unsigned int compressed_len, struct page **compressed_pages, - unsigned long nr_pages, + unsigned int nr_pages, unsigned int write_flags, struct cgroup_subsys_state *blkcg_css) { @@ -427,24 +425,16 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bio->bi_end_io = end_compressed_bio_write; if (use_append) { - struct extent_map *em; - struct map_lookup *map; - struct block_device *bdev; + struct btrfs_device *device; - em = btrfs_get_chunk_map(fs_info, disk_start, PAGE_SIZE); - if (IS_ERR(em)) { + device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE); + if (IS_ERR(device)) { kfree(cb); bio_put(bio); return BLK_STS_NOTSUPP; } - map = em->map_lookup; - /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); - bdev = map->stripes[0].dev->bdev; - - bio_set_dev(bio, bdev); - free_extent_map(em); + bio_set_dev(bio, device->bdev); } if (blkcg_css) { @@ -515,7 +505,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, } if (bytes_left < PAGE_SIZE) { btrfs_info(fs_info, - "bytes left %lu compress len %lu nr %lu", + "bytes left %lu compress len %u nr %u", bytes_left, cb->compressed_len, cb->nr_pages); } bytes_left -= PAGE_SIZE; @@ -677,9 +667,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree; struct compressed_bio *cb; - unsigned long compressed_len; - unsigned long nr_pages; - unsigned long pg_index; + unsigned int compressed_len; + unsigned int nr_pages; + unsigned int pg_index; struct page *page; struct bio *comp_bio; u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; @@ -1202,9 +1192,6 @@ static unsigned int btrfs_compress_set_level(int type, unsigned level) * * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes - * - * @max_out tells us the max number of bytes that we're allowed to - * stuff into pages */ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, @@ -1225,20 +1212,6 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, return ret; } -/* - * pages_in is an array of pages with compressed data. - * - * disk_start is the starting logical offset of this array in the file - * - * orig_bio contains the pages from the file that we want to decompress into - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. - */ static int btrfs_decompress_bio(struct compressed_bio *cb) { struct list_head *workspace; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 8001b700ea3a..c359f20920d0 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -31,6 +31,9 @@ struct compressed_bio { /* number of bios pending for this compressed extent */ refcount_t pending_bios; + /* Number of compressed pages in the array */ + unsigned int nr_pages; + /* the pages with the compressed data on them */ struct page **compressed_pages; @@ -40,20 +43,17 @@ struct compressed_bio { /* starting offset in the inode for our pages */ u64 start; - /* number of bytes in the inode we're working on */ - unsigned long len; - - /* number of bytes on disk */ - unsigned long compressed_len; + /* Number of bytes in the inode we're working on */ + unsigned int len; - /* the compression algorithm for this bio */ - int compress_type; + /* Number of bytes on disk */ + unsigned int compressed_len; - /* number of compressed pages in the array */ - unsigned long nr_pages; + /* The compression algorithm for this bio */ + u8 compress_type; /* IO errors */ - int errors; + u8 errors; int mirror_num; /* for reads, this is the bio we are copying the data into */ @@ -91,10 +91,10 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, struct bio *bio); blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, - unsigned long len, u64 disk_start, - unsigned long compressed_len, + unsigned int len, u64 disk_start, + unsigned int compressed_len, struct page **compressed_pages, - unsigned long nr_pages, + unsigned int nr_pages, unsigned int write_flags, struct cgroup_subsys_state *blkcg_css); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a484fb72a01f..4bc3ca2cbd7d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -596,7 +596,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, fs_info->generation); if (!should_cow_block(trans, root, buf)) { - trans->dirty = true; *cow_ret = buf; return 0; } @@ -1788,10 +1787,8 @@ again: * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) { - trans->dirty = true; + if (!should_cow_block(trans, root, b)) goto cow_done; - } /* * must have write locks on this node and the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9fb76829a281..e5e53e592d4f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -561,10 +561,16 @@ enum { /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. - * Set and cleared while holding fs_info::balance_mutex. */ BTRFS_FS_BALANCE_RUNNING, + /* + * Indicate that relocation of a chunk has started, it's set per chunk + * and is toggled between chunks. + * Set, tested and cleared while holding fs_info::send_reloc_lock. + */ + BTRFS_FS_RELOC_RUNNING, + /* Indicate that the cleaner thread is awake and doing something. */ BTRFS_FS_CLEANER_RUNNING, @@ -817,8 +823,6 @@ struct btrfs_fs_info { struct kobject *space_info_kobj; struct kobject *qgroups_kobj; - u64 total_pinned; - /* used to keep from writing metadata until there is a nice batch */ struct percpu_counter dirty_metadata_bytes; struct percpu_counter delalloc_bytes; @@ -871,6 +875,9 @@ struct btrfs_fs_info { struct btrfs_balance_control *balance_ctl; wait_queue_head_t balance_wait_q; + /* Cancellation requests for chunk relocation */ + atomic_t reloc_cancel_req; + u32 data_chunk_allocations; u32 metadata_ratio; @@ -986,14 +993,15 @@ struct btrfs_fs_info { struct crypto_shash *csum_shash; + spinlock_t send_reloc_lock; /* * Number of send operations in progress. - * Updated while holding fs_info::balance_mutex. + * Updated while holding fs_info::send_reloc_lock. */ int send_in_progress; - /* Type of exclusive operation running */ - unsigned long exclusive_operation; + /* Type of exclusive operation running, protected by super_lock */ + enum btrfs_exclusive_operation exclusive_operation; /* * Zone size > 0 when in ZONED mode, otherwise it's used for a check @@ -1375,38 +1383,39 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) * * Note: don't forget to add new options to btrfs_show_options() */ -#define BTRFS_MOUNT_NODATASUM (1 << 0) -#define BTRFS_MOUNT_NODATACOW (1 << 1) -#define BTRFS_MOUNT_NOBARRIER (1 << 2) -#define BTRFS_MOUNT_SSD (1 << 3) -#define BTRFS_MOUNT_DEGRADED (1 << 4) -#define BTRFS_MOUNT_COMPRESS (1 << 5) -#define BTRFS_MOUNT_NOTREELOG (1 << 6) -#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) -#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) -#define BTRFS_MOUNT_NOSSD (1 << 9) -#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10) -#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) -#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) -#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) -#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) -#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) -#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) -/* bit 17 is free */ -#define BTRFS_MOUNT_USEBACKUPROOT (1 << 18) -#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19) -#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20) -#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) -#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) -#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) -#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) -#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) -#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) -#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) -#define BTRFS_MOUNT_REF_VERIFY (1 << 28) -#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) -#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30) -#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31) +enum { + BTRFS_MOUNT_NODATASUM = (1UL << 0), + BTRFS_MOUNT_NODATACOW = (1UL << 1), + BTRFS_MOUNT_NOBARRIER = (1UL << 2), + BTRFS_MOUNT_SSD = (1UL << 3), + BTRFS_MOUNT_DEGRADED = (1UL << 4), + BTRFS_MOUNT_COMPRESS = (1UL << 5), + BTRFS_MOUNT_NOTREELOG = (1UL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), + BTRFS_MOUNT_NOSSD = (1UL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), + BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), + BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), + BTRFS_MOUNT_REF_VERIFY = (1UL << 27), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), +}; #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) @@ -2216,11 +2225,13 @@ BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, static inline bool btrfs_root_readonly(const struct btrfs_root *root) { + /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } static inline bool btrfs_root_dead(const struct btrfs_root *root) { + /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } @@ -2746,9 +2757,9 @@ enum btrfs_reserve_flush_enum { /* * Flush space by above mentioned methods and by: * - Running delayed iputs - * - Commiting transaction + * - Committing transaction * - * Can be interruped by fatal signal. + * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_DATA, BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, @@ -2758,7 +2769,7 @@ enum btrfs_reserve_flush_enum { * Pretty much the same as FLUSH_ALL, but can also steal space from * global rsv. * - * Can be interruped by fatal signal. + * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_ALL_STEAL, }; @@ -2774,7 +2785,6 @@ enum btrfs_flush_state { ALLOC_CHUNK_FORCE = 8, RUN_DELAYED_IPUTS = 9, COMMIT_TRANS = 10, - FORCE_COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, @@ -3100,8 +3110,8 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end); +unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, + struct page *page, u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3125,7 +3135,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, u64 new_size, - u32 min_type); + u32 min_type, u64 *extents_found); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -3146,9 +3156,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, unsigned long bio_flags); -bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, - unsigned int size); -void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end); +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); @@ -3187,7 +3195,8 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); -void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; @@ -3222,6 +3231,9 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ @@ -3786,4 +3798,14 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zoned != 0; } +/* + * We use page status Private2 to indicate there is an ordered extent with + * unfinished IO. + * + * Rename the Private2 accessors to Ordered, to improve readability. + */ +#define PageOrdered(page) PagePrivate2(page) +#define SetPageOrdered(page) SetPagePrivate2(page) +#define ClearPageOrdered(page) ClearPagePrivate2(page) + #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 56642ca7af10..2059d1504149 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -89,7 +89,7 @@ * ->outstanding_extents += 1 (current value is 1) * * -> set_delalloc - * ->outstanding_extents += 1 (currrent value is 2) + * ->outstanding_extents += 1 (current value is 2) * * -> btrfs_delalloc_release_extents() * ->outstanding_extents -= 1 (current value is 1) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1a88f6214ebc..257c1e18abd4 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -681,7 +681,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, { struct btrfs_delayed_item *curr, *next; int free_space; - int total_data_size = 0, total_size = 0; + int total_size = 0; struct extent_buffer *leaf; char *data_ptr; struct btrfs_key *keys; @@ -706,7 +706,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, */ while (total_size + next->data_len + sizeof(struct btrfs_item) <= free_space) { - total_data_size += next->data_len; total_size += next->data_len + sizeof(struct btrfs_item); list_add_tail(&next->tree_list, &head); nitems++; @@ -974,14 +973,16 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) { - struct btrfs_delayed_root *delayed_root; - ASSERT(delayed_node->root); - clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); - delayed_node->count--; + if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { + struct btrfs_delayed_root *delayed_root; - delayed_root = delayed_node->root->fs_info->delayed_root; - finish_one_item(delayed_root); + ASSERT(delayed_node->root); + delayed_node->count--; + + delayed_root = delayed_node->root->fs_info->delayed_root; + finish_one_item(delayed_root); + } } static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, @@ -1009,12 +1010,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, nofs_flag = memalloc_nofs_save(); ret = btrfs_lookup_inode(trans, root, path, &key, mod); memalloc_nofs_restore(nofs_flag); - if (ret > 0) { - btrfs_release_path(path); - return -ENOENT; - } else if (ret < 0) { - return ret; - } + if (ret > 0) + ret = -ENOENT; + if (ret < 0) + goto out; leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], @@ -1024,7 +1023,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) - goto no_iref; + goto out; path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) @@ -1046,12 +1045,19 @@ again: btrfs_del_item(trans, root, path); out: btrfs_release_delayed_iref(node); -no_iref: btrfs_release_path(path); err_out: btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0)); btrfs_release_delayed_inode(node); + /* + * If we fail to update the delayed inode we need to abort the + * transaction, because we could leave the inode with the improper + * counts behind. + */ + if (ret && ret != -ENOENT) + btrfs_abort_transaction(trans, ret); + return ret; search: @@ -1898,8 +1904,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_item(prev_item); } - if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) - btrfs_release_delayed_iref(delayed_node); + btrfs_release_delayed_iref(delayed_node); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c92d9d4f5f46..06bc842ecdb3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -641,7 +641,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs; struct btrfs_fs_info *fs_info = trans->fs_info; - u64 flags = btrfs_ref_head_to_space_flags(existing); int old_ref_mod; BUG_ON(existing->is_data != update->is_data); @@ -711,26 +710,6 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, } } - /* - * This handles the following conditions: - * - * 1. We had a ref mod of 0 or more and went negative, indicating that - * we may be freeing space, so add our space to the - * total_bytes_pinned counter. - * 2. We were negative and went to 0 or positive, so no longer can say - * that the space would be pinned, decrement our counter from the - * total_bytes_pinned counter. - * 3. We are now at 0 and have ->must_insert_reserved set, which means - * this was a new allocation and then we dropped it, and thus must - * add our space to the total_bytes_pinned counter. - */ - if (existing->total_ref_mod < 0 && old_ref_mod >= 0) - btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); - else if (existing->total_ref_mod >= 0 && old_ref_mod < 0) - btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes); - else if (existing->total_ref_mod == 0 && existing->must_insert_reserved) - btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes); - spin_unlock(&existing->lock); } @@ -835,17 +814,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { - u64 flags = btrfs_ref_head_to_space_flags(head_ref); - if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; trans->delayed_ref_updates += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } - if (head_ref->ref_mod < 0) - btrfs_mod_total_bytes_pinned(trans->fs_info, flags, - head_ref->num_bytes); delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d05f73530af7..d029be40ea6f 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -37,7 +37,7 @@ * - Write duplication * * All new writes will be written to both target and source devices, so even - * if replace gets canceled, sources device still contans up-to-date data. + * if replace gets canceled, sources device still contains up-to-date data. * * Location: handle_ops_on_dev_replace() from __btrfs_map_block() * Start: btrfs_dev_replace_start() diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 306ff20af70f..e1b7bd927d69 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -624,7 +624,7 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the - * order of operations is changed. In the normal sychronous discard path, the + * order of operations is changed. In the normal synchronous discard path, the * block groups are trimmed via a single large trim in transaction commit. This * is ultimately what we are trying to avoid with asynchronous discard. Thus, * it must be done before going down the unused_bgs path. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8d386a5587ee..b117dd3b8172 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -241,7 +241,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; - bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; @@ -249,9 +248,6 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, if (atomic) return -EAGAIN; - if (need_lock) - btrfs_tree_read_lock(eb); - lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); if (extent_buffer_uptodate(eb) && @@ -264,22 +260,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; - - /* - * Things reading via commit roots that don't have normal protection, - * like send, can have a really old block in cache that may point at a - * block that has been freed and re-allocated. So don't clear uptodate - * if we find an eb that is under IO (dirty/writeback) because we could - * end up reading in the stale data and then writing it back out and - * making everybody very sad. - */ - if (!extent_buffer_under_io(eb)) - clear_extent_buffer_uptodate(eb); + clear_extent_buffer_uptodate(eb); out: unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, &cached_state); - if (need_lock) - btrfs_tree_read_unlock(eb); return ret; } @@ -584,6 +568,7 @@ static int validate_extent_buffer(struct extent_buffer *eb) const u32 csum_size = fs_info->csum_size; u8 found_level; u8 result[BTRFS_CSUM_SIZE]; + const u8 *header_csum; int ret = 0; found_start = btrfs_header_bytenr(eb); @@ -608,15 +593,14 @@ static int validate_extent_buffer(struct extent_buffer *eb) } csum_tree_block(eb, result); + header_csum = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum)); - if (memcmp_extent_buffer(eb, result, 0, csum_size)) { - u8 val[BTRFS_CSUM_SIZE] = { 0 }; - - read_extent_buffer(eb, &val, 0, csum_size); + if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, - "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", - fs_info->sb->s_id, eb->start, - CSUM_FMT_VALUE(csum_size, val), + "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", + eb->start, + CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); ret = -EUCLEAN; @@ -917,23 +901,22 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, return btree_csum_one_bio(bio); } -static int check_async_write(struct btrfs_fs_info *fs_info, +static bool should_async_write(struct btrfs_fs_info *fs_info, struct btrfs_inode *bi) { if (btrfs_is_zoned(fs_info)) - return 0; + return false; if (atomic_read(&bi->sync_writers)) - return 0; + return false; if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) - return 0; - return 1; + return false; + return true; } blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { @@ -946,7 +929,7 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, if (ret) goto out_w_error; ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else if (!async) { + } else if (!should_async_write(fs_info, BTRFS_I(inode))) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; @@ -2252,6 +2235,7 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) atomic_set(&fs_info->balance_cancel_req, 0); fs_info->balance_ctl = NULL; init_waitqueue_head(&fs_info->balance_wait_q); + atomic_set(&fs_info->reloc_cancel_req, 0); } static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) @@ -2999,6 +2983,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; + spin_lock_init(&fs_info->send_reloc_lock); fs_info->send_in_progress = 0; fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; @@ -3471,7 +3456,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * At this point we know all the devices that make this filesystem, * including the seed devices but we don't know yet if the replace * target is required. So free devices that are not part of this - * filesystem but skip the replace traget device which is checked + * filesystem but skip the replace target device which is checked * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); @@ -3598,8 +3583,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(fs_info, fs_devices, btrfs_test_opt(fs_info, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? - 1 : 0, + CHECK_INTEGRITY_DATA) ? 1 : 0, fs_info->check_integrity_print_mask); if (ret) btrfs_warn(fs_info, @@ -4696,9 +4680,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, cache->space_info->bytes_reserved -= head->num_bytes; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - percpu_counter_add_batch( - &cache->space_info->total_bytes_pinned, - head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); btrfs_put_block_group(cache); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d5c35e4cb76..d296483d148f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1425,7 +1425,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * bytenr of the parent block. Since new extents are always * created with indirect references, this will only be the case * when relocating a shared extent. In that case, root_objectid - * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must + * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must * be 0 * * @root_objectid: The id of the root where this modification has originated, @@ -1804,19 +1804,6 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); } - /* - * We were dropping refs, or had a new ref and dropped it, and thus must - * adjust down our total_bytes_pinned, the space may or may not have - * been pinned and so is accounted for properly in the pinned space by - * now. - */ - if (head->total_ref_mod < 0 || - (head->total_ref_mod == 0 && head->must_insert_reserved)) { - u64 flags = btrfs_ref_head_to_space_flags(head); - - btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); - } - btrfs_delayed_refs_rsv_release(fs_info, nr_items); } @@ -2551,7 +2538,6 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes); set_extent_dirty(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; @@ -2762,7 +2748,6 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache->pinned -= len; btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); space_info->max_extent_size = 0; - __btrfs_mod_total_bytes_pinned(space_info, -len); if (cache->ro) { space_info->bytes_readonly += len; readonly = true; @@ -4784,7 +4769,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index dee2dafbc872..9e81d25dea70 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -136,7 +136,7 @@ struct tree_entry { }; struct extent_page_data { - struct bio *bio; + struct btrfs_bio_ctrl bio_ctrl; /* tells writepage not to lock the state bits for this range * it still does the unlocking */ @@ -185,10 +185,12 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, /* Cleanup unsubmitted bios */ static void end_write_bio(struct extent_page_data *epd, int ret) { - if (epd->bio) { - epd->bio->bi_status = errno_to_blk_status(ret); - bio_endio(epd->bio); - epd->bio = NULL; + struct bio *bio = epd->bio_ctrl.bio; + + if (bio) { + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + epd->bio_ctrl.bio = NULL; } } @@ -201,9 +203,10 @@ static void end_write_bio(struct extent_page_data *epd, int ret) static int __must_check flush_write_bio(struct extent_page_data *epd) { int ret = 0; + struct bio *bio = epd->bio_ctrl.bio; - if (epd->bio) { - ret = submit_one_bio(epd->bio, 0, 0); + if (bio) { + ret = submit_one_bio(bio, 0, 0); /* * Clean up of epd->bio is handled by its endio function. * And endio is either triggered by successful bio execution @@ -211,7 +214,7 @@ static int __must_check flush_write_bio(struct extent_page_data *epd) * So at this point, no matter what happened, we don't need * to clean up epd->bio. */ - epd->bio = NULL; + epd->bio_ctrl.bio = NULL; } return ret; } @@ -1805,10 +1808,130 @@ out: return found; } +/* + * Process one page for __process_pages_contig(). + * + * Return >0 if we hit @page == @locked_page. + * Return 0 if we updated the page status. + * Return -EGAIN if the we need to try again. + * (For PAGE_LOCK case but got dirty page or page not belong to mapping) + */ +static int process_one_page(struct btrfs_fs_info *fs_info, + struct address_space *mapping, + struct page *page, struct page *locked_page, + unsigned long page_ops, u64 start, u64 end) +{ + u32 len; + + ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); + len = end + 1 - start; + + if (page_ops & PAGE_SET_ORDERED) + btrfs_page_clamp_set_ordered(fs_info, page, start, len); + if (page_ops & PAGE_SET_ERROR) + btrfs_page_clamp_set_error(fs_info, page, start, len); + if (page_ops & PAGE_START_WRITEBACK) { + btrfs_page_clamp_clear_dirty(fs_info, page, start, len); + btrfs_page_clamp_set_writeback(fs_info, page, start, len); + } + if (page_ops & PAGE_END_WRITEBACK) + btrfs_page_clamp_clear_writeback(fs_info, page, start, len); + + if (page == locked_page) + return 1; + + if (page_ops & PAGE_LOCK) { + int ret; + + ret = btrfs_page_start_writer_lock(fs_info, page, start, len); + if (ret) + return ret; + if (!PageDirty(page) || page->mapping != mapping) { + btrfs_page_end_writer_lock(fs_info, page, start, len); + return -EAGAIN; + } + } + if (page_ops & PAGE_UNLOCK) + btrfs_page_end_writer_lock(fs_info, page, start, len); + return 0; +} + static int __process_pages_contig(struct address_space *mapping, struct page *locked_page, - pgoff_t start_index, pgoff_t end_index, - unsigned long page_ops, pgoff_t *index_ret); + u64 start, u64 end, unsigned long page_ops, + u64 *processed_end) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; + unsigned long nr_pages = end_index - start_index + 1; + unsigned long pages_processed = 0; + struct page *pages[16]; + int err = 0; + int i; + + if (page_ops & PAGE_LOCK) { + ASSERT(page_ops == PAGE_LOCK); + ASSERT(processed_end && *processed_end == start); + } + + if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + mapping_set_error(mapping, -EIO); + + while (nr_pages > 0) { + int found_pages; + + found_pages = find_get_pages_contig(mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (found_pages == 0) { + /* + * Only if we're going to lock these pages, we can find + * nothing at @index. + */ + ASSERT(page_ops & PAGE_LOCK); + err = -EAGAIN; + goto out; + } + + for (i = 0; i < found_pages; i++) { + int process_ret; + + process_ret = process_one_page(fs_info, mapping, + pages[i], locked_page, page_ops, + start, end); + if (process_ret < 0) { + for (; i < found_pages; i++) + put_page(pages[i]); + err = -EAGAIN; + goto out; + } + put_page(pages[i]); + pages_processed++; + } + nr_pages -= found_pages; + index += found_pages; + cond_resched(); + } +out: + if (err && processed_end) { + /* + * Update @processed_end. I know this is awful since it has + * two different return value patterns (inclusive vs exclusive). + * + * But the exclusive pattern is necessary if @start is 0, or we + * underflow and check against processed_end won't work as + * expected. + */ + if (pages_processed) + *processed_end = min(end, + ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); + else + *processed_end = start; + } + return err; +} static noinline void __unlock_for_delalloc(struct inode *inode, struct page *locked_page, @@ -1821,7 +1944,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode, if (index == locked_page->index && end_index == index) return; - __process_pages_contig(inode->i_mapping, locked_page, index, end_index, + __process_pages_contig(inode->i_mapping, locked_page, start, end, PAGE_UNLOCK, NULL); } @@ -1831,19 +1954,19 @@ static noinline int lock_delalloc_pages(struct inode *inode, u64 delalloc_end) { unsigned long index = delalloc_start >> PAGE_SHIFT; - unsigned long index_ret = index; unsigned long end_index = delalloc_end >> PAGE_SHIFT; + u64 processed_end = delalloc_start; int ret; ASSERT(locked_page); if (index == locked_page->index && index == end_index) return 0; - ret = __process_pages_contig(inode->i_mapping, locked_page, index, - end_index, PAGE_LOCK, &index_ret); - if (ret == -EAGAIN) + ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, + delalloc_end, PAGE_LOCK, &processed_end); + if (ret == -EAGAIN && processed_end > delalloc_start) __unlock_for_delalloc(inode, locked_page, delalloc_start, - (u64)index_ret << PAGE_SHIFT); + processed_end); return ret; } @@ -1936,84 +2059,6 @@ out_failed: return found; } -static int __process_pages_contig(struct address_space *mapping, - struct page *locked_page, - pgoff_t start_index, pgoff_t end_index, - unsigned long page_ops, pgoff_t *index_ret) -{ - unsigned long nr_pages = end_index - start_index + 1; - unsigned long pages_processed = 0; - pgoff_t index = start_index; - struct page *pages[16]; - unsigned ret; - int err = 0; - int i; - - if (page_ops & PAGE_LOCK) { - ASSERT(page_ops == PAGE_LOCK); - ASSERT(index_ret && *index_ret == start_index); - } - - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) - mapping_set_error(mapping, -EIO); - - while (nr_pages > 0) { - ret = find_get_pages_contig(mapping, index, - min_t(unsigned long, - nr_pages, ARRAY_SIZE(pages)), pages); - if (ret == 0) { - /* - * Only if we're going to lock these pages, - * can we find nothing at @index. - */ - ASSERT(page_ops & PAGE_LOCK); - err = -EAGAIN; - goto out; - } - - for (i = 0; i < ret; i++) { - if (page_ops & PAGE_SET_PRIVATE2) - SetPagePrivate2(pages[i]); - - if (locked_page && pages[i] == locked_page) { - put_page(pages[i]); - pages_processed++; - continue; - } - if (page_ops & PAGE_START_WRITEBACK) { - clear_page_dirty_for_io(pages[i]); - set_page_writeback(pages[i]); - } - if (page_ops & PAGE_SET_ERROR) - SetPageError(pages[i]); - if (page_ops & PAGE_END_WRITEBACK) - end_page_writeback(pages[i]); - if (page_ops & PAGE_UNLOCK) - unlock_page(pages[i]); - if (page_ops & PAGE_LOCK) { - lock_page(pages[i]); - if (!PageDirty(pages[i]) || - pages[i]->mapping != mapping) { - unlock_page(pages[i]); - for (; i < ret; i++) - put_page(pages[i]); - err = -EAGAIN; - goto out; - } - } - put_page(pages[i]); - pages_processed++; - } - nr_pages -= ret; - index += ret; - cond_resched(); - } -out: - if (err && index_ret) - *index_ret = start_index + pages_processed - 1; - return err; -} - void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 clear_bits, unsigned long page_ops) @@ -2021,8 +2066,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start >> PAGE_SHIFT, end >> PAGE_SHIFT, - page_ops, NULL); + start, end, page_ops, NULL); } /* @@ -2381,13 +2425,6 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, BUG_ON(!failrec->this_mirror); - if (failrec->in_validation) { - /* there was no real error, just free the record */ - btrfs_debug(fs_info, - "clean_io_failure: freeing dummy error at %llu", - failrec->start); - goto out; - } if (sb_rdonly(fs_info->sb)) goto out; @@ -2449,7 +2486,7 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start, u64 end) + u64 start) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct io_failure_record *failrec; @@ -2457,15 +2494,15 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + const u32 sectorsize = fs_info->sectorsize; int ret; u64 logical; failrec = get_state_failrec(failure_tree, start); if (!IS_ERR(failrec)) { btrfs_debug(fs_info, - "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", - failrec->logical, failrec->start, failrec->len, - failrec->in_validation); + "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", + failrec->logical, failrec->start, failrec->len); /* * when data can be on disk more than twice, add to failrec here * (e.g. with a list for failed_mirror) to make @@ -2480,10 +2517,9 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return ERR_PTR(-ENOMEM); failrec->start = start; - failrec->len = end - start + 1; + failrec->len = sectorsize; failrec->this_mirror = 0; failrec->bio_flags = 0; - failrec->in_validation = 0; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); @@ -2519,12 +2555,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode free_extent_map(em); /* Set the bits in the private failure tree */ - ret = set_extent_bits(failure_tree, start, end, + ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, EXTENT_LOCKED | EXTENT_DIRTY); if (ret >= 0) { ret = set_state_failrec(failure_tree, start, failrec); /* Set the bits in the inode's tree */ - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); + ret = set_extent_bits(tree, start, start + sectorsize - 1, + EXTENT_DAMAGED); } else if (ret < 0) { kfree(failrec); return ERR_PTR(ret); @@ -2533,7 +2570,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, +static bool btrfs_check_repairable(struct inode *inode, struct io_failure_record *failrec, int failed_mirror) { @@ -2553,39 +2590,22 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, return false; } + /* The failure record should only contain one sector */ + ASSERT(failrec->len == fs_info->sectorsize); + /* - * there are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk + * There are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + * + * Since we're only doing repair for one sector, we only need to get + * a good copy of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. */ - if (needs_validation) { - /* - * to fulfill b), we need to know the exact failing sectors, as - * we don't want to rewrite any more than the failed ones. thus, - * we need separate read requests for the failed bio - * - * if the following BUG_ON triggers, our validation request got - * merged. we need separate requests for our algorithm to work. - */ - BUG_ON(failrec->in_validation); - failrec->in_validation = 1; - failrec->this_mirror = failed_mirror; - } else { - /* - * we're ready to fulfill a) and b) alongside. get a good copy - * of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. - */ - if (failrec->in_validation) { - BUG_ON(failrec->this_mirror != failed_mirror); - failrec->in_validation = 0; - failrec->this_mirror = 0; - } - failrec->failed_mirror = failed_mirror; + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - } if (failrec->this_mirror > num_copies) { btrfs_debug(fs_info, @@ -2597,53 +2617,11 @@ static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, return true; } -static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio) -{ - u64 len = 0; - const u32 blocksize = inode->i_sb->s_blocksize; - - /* - * If bi_status is BLK_STS_OK, then this was a checksum error, not an - * I/O error. In this case, we already know exactly which sector was - * bad, so we don't need to validate. - */ - if (bio->bi_status == BLK_STS_OK) - return false; - - /* - * We need to validate each sector individually if the failed I/O was - * for multiple sectors. - * - * There are a few possible bios that can end up here: - * 1. A buffered read bio, which is not cloned. - * 2. A direct I/O read bio, which is cloned. - * 3. A (buffered or direct) repair bio, which is not cloned. - * - * For cloned bios (case 2), we can get the size from - * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get - * it from the bvecs. - */ - if (bio_flagged(bio, BIO_CLONED)) { - if (btrfs_io_bio(bio)->iter.bi_size > blocksize) - return true; - } else { - struct bio_vec *bvec; - int i; - - bio_for_each_bvec_all(bvec, bio, i) { - len += bvec->bv_len; - if (len > blocksize) - return true; - } - } - return false; -} - -blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - submit_bio_hook_t *submit_bio_hook) +int btrfs_repair_one_sector(struct inode *inode, + struct bio *failed_bio, u32 bio_offset, + struct page *page, unsigned int pgoff, + u64 start, int failed_mirror, + submit_bio_hook_t *submit_bio_hook) { struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2651,7 +2629,6 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); const int icsum = bio_offset >> fs_info->sectorsize_bits; - bool need_validation; struct bio *repair_bio; struct btrfs_io_bio *repair_io_bio; blk_status_t status; @@ -2661,23 +2638,19 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start, end); + failrec = btrfs_get_io_failure_record(inode, start); if (IS_ERR(failrec)) - return errno_to_blk_status(PTR_ERR(failrec)); + return PTR_ERR(failrec); - need_validation = btrfs_io_needs_validation(inode, failed_bio); - if (!btrfs_check_repairable(inode, need_validation, failrec, - failed_mirror)) { + if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { free_io_failure(failure_tree, tree, failrec); - return BLK_STS_IOERR; + return -EIO; } repair_bio = btrfs_io_bio_alloc(1); repair_io_bio = btrfs_io_bio(repair_bio); repair_bio->bi_opf = REQ_OP_READ; - if (need_validation) - repair_bio->bi_opf |= REQ_FAILFAST_DEV; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; repair_bio->bi_private = failed_bio->bi_private; @@ -2695,8 +2668,8 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, repair_io_bio->iter = repair_bio->bi_iter; btrfs_debug(btrfs_sb(inode->i_sb), -"repair read error: submitting new read to mirror %d, in_validation=%d", - failrec->this_mirror, failrec->in_validation); + "repair read error: submitting new read to mirror %d", + failrec->this_mirror); status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags); @@ -2704,17 +2677,114 @@ blk_status_t btrfs_submit_read_repair(struct inode *inode, free_io_failure(failure_tree, tree, failrec); bio_put(repair_bio); } - return status; + return blk_status_to_errno(status); +} + +static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + + ASSERT(page_offset(page) <= start && + start + len <= page_offset(page) + PAGE_SIZE); + + if (uptodate) { + btrfs_page_set_uptodate(fs_info, page, start, len); + } else { + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); + } + + if (fs_info->sectorsize == PAGE_SIZE) + unlock_page(page); + else + btrfs_subpage_end_reader(fs_info, page, start, len); +} + +static blk_status_t submit_read_repair(struct inode *inode, + struct bio *failed_bio, u32 bio_offset, + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + unsigned int error_bitmap, + submit_bio_hook_t *submit_bio_hook) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; + int error = 0; + int i; + + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + + /* We're here because we had some read errors or csum mismatch */ + ASSERT(error_bitmap); + + /* + * We only get called on buffered IO, thus page must be mapped and bio + * must not be cloned. + */ + ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); + + /* Iterate through all the sectors in the range */ + for (i = 0; i < nr_bits; i++) { + const unsigned int offset = i * sectorsize; + struct extent_state *cached = NULL; + bool uptodate = false; + int ret; + + if (!(error_bitmap & (1U << i))) { + /* + * This sector has no error, just end the page read + * and unlock the range. + */ + uptodate = true; + goto next; + } + + ret = btrfs_repair_one_sector(inode, failed_bio, + bio_offset + offset, + page, pgoff + offset, start + offset, + failed_mirror, submit_bio_hook); + if (!ret) { + /* + * We have submitted the read repair, the page release + * will be handled by the endio function of the + * submitted repair bio. + * Thus we don't need to do any thing here. + */ + continue; + } + /* + * Repair failed, just record the error but still continue. + * Or the remaining sectors will not be properly unlocked. + */ + if (!error) + error = ret; +next: + end_page_read(page, uptodate, start + offset, sectorsize); + if (uptodate) + set_extent_uptodate(&BTRFS_I(inode)->io_tree, + start + offset, + start + offset + sectorsize - 1, + &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, + start + offset, + start + offset + sectorsize - 1, + &cached); + } + return errno_to_blk_status(error); } /* lots and lots of room for performance fixes in the end_bio funcs */ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { + struct btrfs_inode *inode; int uptodate = (err == 0); int ret = 0; - btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); + ASSERT(page && page->mapping); + inode = BTRFS_I(page->mapping->host); + btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); if (!uptodate) { ClearPageUptodate(page); @@ -2747,25 +2817,20 @@ static void end_bio_extent_writepage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; - /* We always issue full-page reads, but if some block - * in a page fails to read, blk_update_request() will - * advance bv_offset and adjust bv_len to compensate. - * Print a warning for nonzero offsets, and an error - * if they don't add up to a full page. */ - if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { - if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) - btrfs_err(fs_info, - "partial page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else - btrfs_info(fs_info, - "incomplete page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - } + /* Our read/write should always be sector aligned. */ + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + btrfs_err(fs_info, + "partial page write in btrfs with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); + else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) + btrfs_info(fs_info, + "incomplete page write with offset %u and length %u", + bvec->bv_offset, bvec->bv_len); - start = page_offset(page); - end = start + bvec->bv_offset + bvec->bv_len - 1; + start = page_offset(page) + bvec->bv_offset; + end = start + bvec->bv_len - 1; if (first_bvec) { btrfs_record_physical_zoned(inode, start, bio); @@ -2773,7 +2838,8 @@ static void end_bio_extent_writepage(struct bio *bio) } end_extent_writepage(page, error, start, end); - end_page_writeback(page); + + btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); } bio_put(bio); @@ -2862,30 +2928,6 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); } -static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); - - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); - - if (uptodate) { - btrfs_page_set_uptodate(fs_info, page, start, len); - } else { - btrfs_page_clear_uptodate(fs_info, page, start, len); - btrfs_page_set_error(fs_info, page, start, len); - } - - if (fs_info->sectorsize == PAGE_SIZE) - unlock_page(page); - else if (is_data_inode(page->mapping->host)) - /* - * For subpage data, unlock the page if we're the last reader. - * For subpage metadata, page lock is not utilized for read. - */ - btrfs_subpage_end_reader(fs_info, page, start, len); -} - /* * Find extent buffer for a givne bytenr. * @@ -2929,7 +2971,6 @@ static struct extent_buffer *find_extent_buffer_readpage( static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - int uptodate = !bio->bi_status; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; @@ -2944,10 +2985,12 @@ static void end_bio_extent_readpage(struct bio *bio) ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { + bool uptodate = !bio->bi_status; struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + unsigned int error_bitmap = (unsigned int)-1; u64 start; u64 end; u32 len; @@ -2982,14 +3025,16 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - if (is_data_inode(inode)) - ret = btrfs_verify_data_csum(io_bio, + if (is_data_inode(inode)) { + error_bitmap = btrfs_verify_data_csum(io_bio, bio_offset, page, start, end); - else + ret = error_bitmap; + } else { ret = btrfs_validate_metadata_buffer(io_bio, page, start, end, mirror); + } if (ret) - uptodate = 0; + uptodate = false; else clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree, tree, start, @@ -3001,27 +3046,18 @@ static void end_bio_extent_readpage(struct bio *bio) goto readpage_ok; if (is_data_inode(inode)) { - /* - * The generic bio_readpage_error handles errors the - * following way: If possible, new read requests are - * created and submitted and will end up in - * end_bio_extent_readpage as well (if we're lucky, - * not in the !uptodate case). In that case it returns - * 0 and we just go on with the next page in our bio. - * If it can't handle the error it will return -EIO and - * we remain responsible for that page. + * btrfs_submit_read_repair() will handle all the good + * and bad sectors, we just continue to the next bvec. */ - if (!btrfs_submit_read_repair(inode, bio, bio_offset, - page, - start - page_offset(page), - start, end, mirror, - btrfs_submit_data_bio)) { - uptodate = !bio->bi_status; - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - continue; - } + submit_read_repair(inode, bio, bio_offset, page, + start - page_offset(page), start, + end, mirror, error_bitmap, + btrfs_submit_data_bio); + + ASSERT(bio_offset + len > bio_offset); + bio_offset += len; + continue; } else { struct extent_buffer *eb; @@ -3151,42 +3187,99 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * * Return true if successfully page added. Otherwise, return false. */ -static bool btrfs_bio_add_page(struct bio *bio, struct page *page, +static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + struct page *page, u64 disk_bytenr, unsigned int size, unsigned int pg_offset, - unsigned long prev_bio_flags, unsigned long bio_flags) { + struct bio *bio = bio_ctrl->bio; + u32 bio_size = bio->bi_iter.bi_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig; int ret; - if (prev_bio_flags != bio_flags) + ASSERT(bio); + /* The limit should be calculated when bio_ctrl->bio is allocated */ + ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); + if (bio_ctrl->bio_flags != bio_flags) return false; - if (prev_bio_flags & EXTENT_BIO_COMPRESSED) + if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; if (!contig) return false; - if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) + if (bio_size + size > bio_ctrl->len_to_oe_boundary || + bio_size + size > bio_ctrl->len_to_stripe_boundary) return false; - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct page *first_page = bio_first_bvec_all(bio)->bv_page; - - if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size)) - return false; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) ret = bio_add_zone_append_page(bio, page, size, pg_offset); - } else { + else ret = bio_add_page(bio, page, size, pg_offset); - } return ret == size; } +static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_io_geometry geom; + struct btrfs_ordered_extent *ordered; + struct extent_map *em; + u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); + int ret; + + /* + * Pages for compressed extent are never submitted to disk directly, + * thus it has no real boundary, just set them to U32_MAX. + * + * The split happens for real compressed bio, which happens in + * btrfs_submit_compressed_read/write(). + */ + if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { + bio_ctrl->len_to_oe_boundary = U32_MAX; + bio_ctrl->len_to_stripe_boundary = U32_MAX; + return 0; + } + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); + if (IS_ERR(em)) + return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), + logical, &geom); + free_extent_map(em); + if (ret < 0) { + return ret; + } + if (geom.len > U32_MAX) + bio_ctrl->len_to_stripe_boundary = U32_MAX; + else + bio_ctrl->len_to_stripe_boundary = (u32)geom.len; + + if (!btrfs_is_zoned(fs_info) || + bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { + bio_ctrl->len_to_oe_boundary = U32_MAX; + return 0; + } + + ASSERT(fs_info->max_zone_append_size > 0); + /* Ordered extent not yet created, so we're good */ + ordered = btrfs_lookup_ordered_extent(inode, logical); + if (!ordered) { + bio_ctrl->len_to_oe_boundary = U32_MAX; + return 0; + } + + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->disk_bytenr + ordered->disk_num_bytes - logical); + btrfs_put_ordered_extent(ordered); + return 0; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting @@ -3203,12 +3296,11 @@ static bool btrfs_bio_add_page(struct bio *bio, struct page *page, */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, + struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, - struct bio **bio_ret, bio_end_io_t end_io_func, int mirror_num, - unsigned long prev_bio_flags, unsigned long bio_flags, bool force_bio_submit) { @@ -3219,19 +3311,19 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree = &inode->io_tree; struct btrfs_fs_info *fs_info = inode->root->fs_info; - ASSERT(bio_ret); + ASSERT(bio_ctrl); - if (*bio_ret) { - bio = *bio_ret; + ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && + pg_offset + size <= PAGE_SIZE); + if (bio_ctrl->bio) { + bio = bio_ctrl->bio; if (force_bio_submit || - !btrfs_bio_add_page(bio, page, disk_bytenr, io_size, - pg_offset, prev_bio_flags, bio_flags)) { - ret = submit_one_bio(bio, mirror_num, prev_bio_flags); - if (ret < 0) { - *bio_ret = NULL; + !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size, + pg_offset, bio_flags)) { + ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags); + bio_ctrl->bio = NULL; + if (ret < 0) return ret; - } - bio = NULL; } else { if (wbc) wbc_account_cgroup_owner(wbc, page, io_size); @@ -3254,22 +3346,18 @@ static int submit_extent_page(unsigned int opf, wbc_account_cgroup_owner(wbc, page, io_size); } if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_device *device; - em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size); - if (IS_ERR(em)) - return PTR_ERR(em); + device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size); + if (IS_ERR(device)) + return PTR_ERR(device); - map = em->map_lookup; - /* We only support single profile for now */ - ASSERT(map->num_stripes == 1); - btrfs_io_bio(bio)->device = map->stripes[0].dev; - - free_extent_map(em); + btrfs_io_bio(bio)->device = device; } - *bio_ret = bio; + bio_ctrl->bio = bio; + bio_ctrl->bio_flags = bio_flags; + ret = calc_bio_boundaries(bio_ctrl, inode); return ret; } @@ -3382,7 +3470,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * return 0 on success, otherwise return error */ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, + struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; @@ -3558,15 +3646,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, disk_bytenr, iosize, - pg_offset, bio, + bio_ctrl, page, disk_bytenr, iosize, + pg_offset, end_bio_extent_readpage, 0, - *bio_flags, this_bio_flag, force_bio_submit); if (!ret) { nr++; - *bio_flags = this_bio_flag; } else { unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, false, cur, iosize); @@ -3580,11 +3666,10 @@ out: } static inline void contiguous_readpages(struct page *pages[], int nr_pages, - u64 start, u64 end, - struct extent_map **em_cached, - struct bio **bio, - unsigned long *bio_flags, - u64 *prev_em_start) + u64 start, u64 end, + struct extent_map **em_cached, + struct btrfs_bio_ctrl *bio_ctrl, + u64 *prev_em_start) { struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; @@ -3592,7 +3677,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + btrfs_do_readpage(pages[index], em_cached, bio_ctrl, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } @@ -3680,6 +3765,54 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, } /* + * Find the first byte we need to write. + * + * For subpage, one page can contain several sectors, and + * __extent_writepage_io() will just grab all extent maps in the page + * range and try to submit all non-inline/non-compressed extents. + * + * This is a big problem for subpage, we shouldn't re-submit already written + * data at all. + * This function will lookup subpage dirty bit to find which range we really + * need to submit. + * + * Return the next dirty range in [@start, @end). + * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. + */ +static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, + struct page *page, u64 *start, u64 *end) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u64 orig_start = *start; + /* Declare as unsigned long so we can use bitmap ops */ + unsigned long dirty_bitmap; + unsigned long flags; + int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits; + int range_start_bit = nbits; + int range_end_bit; + + /* + * For regular sector size == page size case, since one page only + * contains one sector, we return the page offset directly. + */ + if (fs_info->sectorsize == PAGE_SIZE) { + *start = page_offset(page); + *end = page_offset(page) + PAGE_SIZE; + return; + } + + /* We should have the page locked, but just in case */ + spin_lock_irqsave(&subpage->lock, flags); + dirty_bitmap = subpage->dirty_bitmap; + spin_unlock_irqrestore(&subpage->lock, flags); + + bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit, + BTRFS_SUBPAGE_BITMAP_SIZE); + *start = page_offset(page) + range_start_bit * fs_info->sectorsize; + *end = page_offset(page) + range_end_bit * fs_info->sectorsize; +} + +/* * helper for __extent_writepage. This calls the writepage start hooks, * and does the loop to map the page into extents and bios. * @@ -3696,7 +3829,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_io_tree *tree = &inode->io_tree; u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; u64 cur = start; @@ -3727,15 +3859,26 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, while (cur <= end) { u64 disk_bytenr; u64 em_end; + u64 dirty_range_start = cur; + u64 dirty_range_end; u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(page, cur, end, 1); + btrfs_writepage_endio_finish_ordered(inode, page, cur, + end, 1); break; } + + find_next_dirty_byte(fs_info, page, &dirty_range_start, + &dirty_range_end); + if (cur < dirty_range_start) { + cur = dirty_range_start; + continue; + } + em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); if (IS_ERR_OR_NULL(em)) { - SetPageError(page); + btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); break; } @@ -3750,8 +3893,11 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); disk_bytenr = em->block_start + extent_offset; - /* Note that em_end from extent_map_end() is exclusive */ - iosize = min(em_end, end + 1) - cur; + /* + * Note that em_end from extent_map_end() and dirty_range_end from + * find_next_dirty_byte() are all exclusive + */ + iosize = min(min(em_end, end + 1), dirty_range_end) - cur; if (btrfs_use_zone_append(inode, em->block_start)) opf = REQ_OP_ZONE_APPEND; @@ -3768,28 +3914,38 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (compressed) nr++; else - btrfs_writepage_endio_finish_ordered(page, cur, - cur + iosize - 1, 1); + btrfs_writepage_endio_finish_ordered(inode, + page, cur, cur + iosize - 1, 1); cur += iosize; continue; } - btrfs_set_range_writeback(tree, cur, cur + iosize - 1); + btrfs_set_range_writeback(inode, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(inode->root->fs_info, "page %lu not writeback, cur %llu end %llu", page->index, cur, end); } - ret = submit_extent_page(opf | write_flags, wbc, page, + /* + * Although the PageDirty bit is cleared before entering this + * function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * page for range already written to disk. + */ + btrfs_page_clear_dirty(fs_info, page, cur, iosize); + + ret = submit_extent_page(opf | write_flags, wbc, + &epd->bio_ctrl, page, disk_bytenr, iosize, - cur - page_offset(page), &epd->bio, + cur - page_offset(page), end_bio_extent_writepage, - 0, 0, 0, false); + 0, 0, false); if (ret) { - SetPageError(page); + btrfs_page_set_error(fs_info, page, cur, iosize); if (PageWriteback(page)) - end_page_writeback(page); + btrfs_page_clear_writeback(fs_info, page, cur, + iosize); } cur += iosize; @@ -4098,12 +4254,15 @@ static struct extent_buffer *find_extent_buffer_nolock( * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() * after all extent buffers in the page has finished their writeback. */ -static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, - struct bio *bio) +static void end_bio_subpage_eb_writepage(struct bio *bio) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct bvec_iter_all iter_all; + fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); + ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -4154,16 +4313,11 @@ static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, static void end_bio_extent_buffer_writepage(struct bio *bio) { - struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct extent_buffer *eb; int done; struct bvec_iter_all iter_all; - fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) - return end_bio_subpage_eb_writepage(fs_info, bio); - ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -4189,12 +4343,34 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) bio_put(bio); } +static void prepare_eb_write(struct extent_buffer *eb) +{ + u32 nritems; + unsigned long start; + unsigned long end; + + clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); + atomic_set(&eb->io_pages, num_extent_pages(eb)); + + /* Set btree blocks beyond nritems with 0 to avoid stale content */ + nritems = btrfs_header_nritems(eb); + if (btrfs_header_level(eb) > 0) { + end = btrfs_node_key_ptr_offset(nritems); + memzero_extent_buffer(eb, end, eb->len - end); + } else { + /* + * Leaf: + * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 + */ + start = btrfs_item_nr_offset(nritems); + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); + memzero_extent_buffer(eb, start, end - start); + } +} + /* * Unlike the work in write_one_eb(), we rely completely on extent locking. * Page locking is only utilized at minimum to keep the VMM code happy. - * - * Caller should still call write_one_eb() other than this function directly. - * As write_one_eb() has extra preparation before submitting the extent buffer. */ static int write_one_subpage_eb(struct extent_buffer *eb, struct writeback_control *wbc, @@ -4206,6 +4382,8 @@ static int write_one_subpage_eb(struct extent_buffer *eb, bool no_dirty_ebs = false; int ret; + prepare_eb_write(eb); + /* clear_page_dirty_for_io() in subpage helper needs page locked */ lock_page(page); btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); @@ -4216,10 +4394,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, - eb->start, eb->len, eb->start - page_offset(page), - &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, - false); + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, + &epd->bio_ctrl, page, eb->start, eb->len, + eb->start - page_offset(page), + end_bio_subpage_eb_writepage, 0, 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4244,45 +4422,23 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct extent_page_data *epd) { u64 disk_bytenr = eb->start; - u32 nritems; int i, num_pages; - unsigned long start, end; unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; - clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); - num_pages = num_extent_pages(eb); - atomic_set(&eb->io_pages, num_pages); - - /* set btree blocks beyond nritems with 0 to avoid stale content. */ - nritems = btrfs_header_nritems(eb); - if (btrfs_header_level(eb) > 0) { - end = btrfs_node_key_ptr_offset(nritems); - - memzero_extent_buffer(eb, end, eb->len - end); - } else { - /* - * leaf: - * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 - */ - start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); - memzero_extent_buffer(eb, start, end - start); - } - - if (eb->fs_info->sectorsize < PAGE_SIZE) - return write_one_subpage_eb(eb, wbc, epd); + prepare_eb_write(eb); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - p, disk_bytenr, PAGE_SIZE, 0, - &epd->bio, + &epd->bio_ctrl, p, disk_bytenr, + PAGE_SIZE, 0, end_bio_extent_buffer_writepage, - 0, 0, 0, false); + 0, 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -4386,7 +4542,7 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_eb(eb, wbc, epd); + ret = write_one_subpage_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) goto cleanup; @@ -4498,7 +4654,7 @@ int btree_write_cache_pages(struct address_space *mapping, { struct extent_buffer *eb_context = NULL; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4780,7 +4936,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4807,7 +4963,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, PAGE_SHIFT; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, }; @@ -4827,8 +4983,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, if (clear_page_dirty_for_io(page)) ret = __extent_writepage(page, &wbc_writepages, &epd); else { - btrfs_writepage_endio_finish_ordered(page, start, - start + PAGE_SIZE - 1, 1); + btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), + page, start, start + PAGE_SIZE - 1, 1); unlock_page(page); } put_page(page); @@ -4850,7 +5006,7 @@ int extent_writepages(struct address_space *mapping, { int ret = 0; struct extent_page_data epd = { - .bio = NULL, + .bio_ctrl = { 0 }, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; @@ -4867,8 +5023,7 @@ int extent_writepages(struct address_space *mapping, void extent_readahead(struct readahead_control *rac) { - struct bio *bio = NULL; - unsigned long bio_flags = 0; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; struct page *pagepool[16]; struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; @@ -4879,14 +5034,14 @@ void extent_readahead(struct readahead_control *rac) u64 contig_end = contig_start + readahead_batch_length(rac) - 1; contiguous_readpages(pagepool, nr, contig_start, contig_end, - &em_cached, &bio, &bio_flags, &prev_em_start); + &em_cached, &bio_ctrl, &prev_em_start); } if (em_cached) free_extent_map(em_cached); - if (bio) { - if (submit_one_bio(bio, 0, bio_flags)) + if (bio_ctrl.bio) { + if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) return; } } @@ -5429,6 +5584,12 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) subpage = (struct btrfs_subpage *)page->private; if (atomic_read(&subpage->eb_refs)) return true; + /* + * Even there is no eb refs here, we may still have + * end_page_read() call relying on page::private. + */ + if (atomic_read(&subpage->readers)) + return true; } return false; } @@ -5489,7 +5650,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag /* * We can only detach the page private if there are no other ebs in the - * page range. + * page range and no unfinished IO. */ if (!page_range_has_eb(fs_info, page)) btrfs_detach_subpage(fs_info, page); @@ -6176,7 +6337,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; - struct bio *bio = NULL; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; int ret = 0; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -6184,10 +6345,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { - ret = try_lock_extent(io_tree, eb->start, - eb->start + eb->len - 1); - if (ret <= 0) - return ret; + if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) + return -EAGAIN; } else { ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); if (ret < 0) @@ -6209,9 +6368,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, check_buffer_tree_ref(eb); btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start, - eb->len, eb->start - page_offset(page), &bio, - end_bio_extent_readpage, mirror_num, 0, 0, + btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); + ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, + page, eb->start, eb->len, + eb->start - page_offset(page), + end_bio_extent_readpage, mirror_num, 0, true); if (ret) { /* @@ -6221,10 +6382,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - if (bio) { + if (bio_ctrl.bio) { int tmp; - tmp = submit_one_bio(bio, mirror_num, 0); + tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); + bio_ctrl.bio = NULL; if (tmp < 0) return tmp; } @@ -6247,8 +6409,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; - struct bio *bio = NULL; - unsigned long bio_flags = 0; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -6312,9 +6473,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) ClearPageError(page); err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, - page, page_offset(page), PAGE_SIZE, 0, - &bio, end_bio_extent_readpage, - mirror_num, 0, 0, false); + &bio_ctrl, page, page_offset(page), + PAGE_SIZE, 0, end_bio_extent_readpage, + mirror_num, 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6331,8 +6492,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } - if (bio) { - err = submit_one_bio(bio, mirror_num, bio_flags); + if (bio_ctrl.bio) { + err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); + bio_ctrl.bio = NULL; if (err) return err; } @@ -6515,9 +6677,10 @@ void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, char *kaddr; assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); - memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, - BTRFS_FSID_SIZE); + kaddr = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, + chunk_tree_uuid)); + memcpy(kaddr, srcv, BTRFS_FSID_SIZE); } void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) @@ -6525,9 +6688,9 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) char *kaddr; assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); - memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, - BTRFS_FSID_SIZE); + kaddr = page_address(eb->pages[0]) + + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid)); + memcpy(kaddr, srcv, BTRFS_FSID_SIZE); } void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 227215a5722c..62027f551b44 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -39,7 +39,7 @@ enum { /* Page starts writeback, clear dirty bit and set writeback bit */ #define PAGE_START_WRITEBACK (1 << 1) #define PAGE_END_WRITEBACK (1 << 2) -#define PAGE_SET_PRIVATE2 (1 << 3) +#define PAGE_SET_ORDERED (1 << 3) #define PAGE_SET_ERROR (1 << 4) #define PAGE_LOCK (1 << 5) @@ -102,6 +102,17 @@ struct extent_buffer { }; /* + * Structure to record info about the bio being assembled, and other info like + * how many bytes are there before stripe/ordered extent boundary. + */ +struct btrfs_bio_ctrl { + struct bio *bio; + unsigned long bio_flags; + u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; +}; + +/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { @@ -169,7 +180,7 @@ int try_release_extent_buffer(struct page *page); int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags); int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, + struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, @@ -281,7 +292,7 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); * When IO fails, either with EIO or csum verification fails, we * try other mirrors that might have a good copy of the data. This * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date + * mirrors. If another mirror has good data, the sector is set up to date * and things continue. If a good mirror can't be found, the original * bio end_io callback is called to indicate things have failed. */ @@ -293,15 +304,13 @@ struct io_failure_record { unsigned long bio_flags; int this_mirror; int failed_mirror; - int in_validation; }; - -blk_status_t btrfs_submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - submit_bio_hook_t *submit_bio_hook); +int btrfs_repair_one_sector(struct inode *inode, + struct bio *failed_bio, u32 bio_offset, + struct page *page, unsigned int pgoff, + u64 start, int failed_mirror, + submit_bio_hook_t *submit_bio_hook); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 441cee7fbb62..df6631eefc65 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -618,7 +618,7 @@ fail: * @file_start: offset in file this bio begins to describe * @contig: Boolean. If true/1 means all bio vecs in this bio are * contiguous and they begin at @file_start in the file. False/0 - * means this bio can contains potentially discontigous bio vecs + * means this bio can contain potentially discontiguous bio vecs * so the logical offset of each should be calculated separately. */ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 55f68422061d..28a05ba47060 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -28,6 +28,7 @@ #include "compression.h" #include "delalloc-space.h" #include "reflink.h" +#include "subpage.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -482,6 +483,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); + ASSERT(num_bytes <= U32_MAX); end_of_last_block = start_pos + num_bytes - 1; @@ -500,9 +502,10 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; - SetPageUptodate(p); + + btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); ClearPageChecked(p); - set_page_dirty(p); + btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); } /* @@ -2483,6 +2486,17 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, const u64 lockend, struct extent_state **cached_state) { + /* + * For subpage case, if the range is not at page boundary, we could + * have pages at the leading/tailing part of the range. + * This could lead to dead loop since filemap_range_has_page() + * will always return true. + * So here we need to do extra page alignment for + * filemap_range_has_page(). + */ + const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + while (1) { struct btrfs_ordered_extent *ordered; int ret; @@ -2503,7 +2517,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, (ordered->file_offset + ordered->num_bytes <= lockstart || ordered->file_offset > lockend)) && !filemap_range_has_page(inode->i_mapping, - lockstart, lockend)) { + page_lockstart, page_lockend)) { if (ordered) btrfs_put_ordered_extent(ordered); break; @@ -3034,22 +3048,20 @@ struct falloc_range { */ static int add_falloc_range(struct list_head *head, u64 start, u64 len) { - struct falloc_range *prev = NULL; struct falloc_range *range = NULL; - if (list_empty(head)) - goto insert; - - /* - * As fallocate iterate by bytenr order, we only need to check - * the last range. - */ - prev = list_entry(head->prev, struct falloc_range, list); - if (prev->start + prev->len == start) { - prev->len += len; - return 0; + if (!list_empty(head)) { + /* + * As fallocate iterates by bytenr order, we only need to check + * the last range. + */ + range = list_last_entry(head, struct falloc_range, list); + if (range->start + range->len == start) { + range->len += len; + return 0; + } } -insert: + range = kmalloc(sizeof(*range), GFP_KERNEL); if (!range) return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4806295116d8..2131ae5b9ed7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -327,7 +327,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, * need to check for -EAGAIN. */ ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, BTRFS_EXTENT_DATA_KEY); + 0, BTRFS_EXTENT_DATA_KEY, NULL); if (ret) goto fail; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 46f392943f4d..e6eb20987351 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -51,6 +51,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "subpage.h" struct btrfs_iget_args { u64 ino; @@ -166,22 +167,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *page; while (index <= end_index) { + /* + * For locked page, we will call end_extent_writepage() on it + * in run_delalloc_range() for the error handling. That + * end_extent_writepage() function will call + * btrfs_mark_ordered_io_finished() to clear page Ordered and + * run the ordered extent accounting. + * + * Here we can't just clear the Ordered bit, or + * btrfs_mark_ordered_io_finished() would skip the accounting + * for the page range, and the ordered extent will never finish. + */ + if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + index++; + continue; + } page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; - ClearPagePrivate2(page); + + /* + * Here we just clear all Ordered bits for every page in the + * range, then __endio_write_update_ordered() will handle + * the ordered extent accounting for the range. + */ + btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, + offset, bytes); put_page(page); } + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) + return; /* * In case this page belongs to the delalloc range being instantiated * then skip it, since the first page of a range is going to be * properly cleaned up by the caller of run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { - offset += PAGE_SIZE; - bytes -= PAGE_SIZE; + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; } return __endio_write_update_ordered(inode, offset, bytes, false); @@ -603,7 +629,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(BTRFS_I(inode), start, end)) { + if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -946,7 +972,8 @@ retry: const u64 end = start + async_extent->ram_size - 1; p->mapping = inode->vfs_inode.i_mapping; - btrfs_writepage_endio_finish_ordered(p, start, end, 0); + btrfs_writepage_endio_finish_ordered(inode, p, start, + end, 0); p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, NULL, 0, @@ -1064,7 +1091,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * our outstanding extent for clearing delalloc for this * range. */ - extent_clear_unlock_delalloc(inode, start, end, NULL, + extent_clear_unlock_delalloc(inode, start, end, + locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | @@ -1072,6 +1100,19 @@ static noinline int cow_file_range(struct btrfs_inode *inode, *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; + /* + * locked_page is locked by the caller of + * writepage_delalloc(), not locked by + * __process_pages_contig(). + * + * We can't let __process_pages_contig() to unlock it, + * as it doesn't have any subpage::writers recorded. + * + * Here we manually unlock the page, since the caller + * can't use page_started to determine if it's an + * inline extent or a compressed extent. + */ + unlock_page(locked_page); goto out; } else if (ret < 0) { goto out_unlock; @@ -1150,15 +1191,16 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* we're not doing compressed IO, don't unlock the first - * page (which the caller expects to stay locked), don't - * clear any dirty bits and don't set any writeback bits + /* + * We're not doing compressed IO, don't unlock the first page + * (which the caller expects to stay locked), don't clear any + * dirty bits and don't set any writeback bits * - * Do set the Private2 bit so we know this page was properly - * setup for writepage + * Do set the Ordered (Private2) bit so we know this page was + * properly setup for writepage. */ page_ops = unlock ? PAGE_UNLOCK : 0; - page_ops |= PAGE_SET_PRIVATE2; + page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, @@ -1822,7 +1864,7 @@ out_check: locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_PRIVATE2); + PAGE_UNLOCK | PAGE_SET_ORDERED); cur_offset = extent_end; @@ -2193,26 +2235,22 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 logical = bio->bi_iter.bi_sector << 9; + u32 bio_len = bio->bi_iter.bi_size; struct extent_map *em; - u64 length = 0; - u64 map_length; int ret = 0; struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; - length = bio->bi_iter.bi_size; - map_length = length; - em = btrfs_get_chunk_map(fs_info, logical, map_length); + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); if (IS_ERR(em)) return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, - map_length, &geom); + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); if (ret < 0) goto out; - if (geom.len < length + size) + if (geom.len < bio_len + size) ret = 1; out: free_extent_map(em); @@ -2233,33 +2271,6 @@ static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); } -bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, - unsigned int size) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered; - u64 len = bio->bi_iter.bi_size + size; - bool ret = true; - - ASSERT(btrfs_is_zoned(fs_info)); - ASSERT(fs_info->max_zone_append_size > 0); - ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); - - /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); - if (!ordered) - return ret; - - if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > - ordered->disk_bytenr + ordered->disk_num_bytes) - ret = false; - - btrfs_put_ordered_extent(ordered); - - return ret; -} - static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, struct bio *bio, loff_t file_offset) { @@ -2601,7 +2612,7 @@ again: lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ - if (PagePrivate2(page)) + if (PageOrdered(page)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); @@ -2676,8 +2687,8 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_writepage_fixup *fixup; - /* this page is properly in the ordered list */ - if (TestClearPagePrivate2(page)) + /* This page has ordered extent covering it already */ + if (PageOrdered(page)) return 0; /* @@ -2773,7 +2784,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, /* * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the - * number of bytes only for that range contaning the inline extent. + * number of bytes only for that range containing the inline extent. * The remaining of the range will be processed when clearning the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. */ @@ -3069,28 +3080,14 @@ static void finish_ordered_fn(struct btrfs_work *work) btrfs_finish_ordered_io(ordered_extent); } -void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, u64 end, int uptodate) { - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *wq; - - trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); - - ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1, uptodate)) - return; + trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(wq, &ordered_extent->work); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, + finish_ordered_fn, uptodate); } /* @@ -3152,15 +3149,19 @@ zeroit: * @bio_offset: offset to the beginning of the bio (in bytes) * @start: file offset of the range start * @end: file offset of the range end (inclusive) + * + * Return a bitmap where bit set means a csum mismatch, and bit not set means + * csum match. */ -int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end) +unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, + struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; + unsigned int result = 0; if (PageChecked(page)) { ClearPageChecked(page); @@ -3188,10 +3189,14 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, page_offset(page) + pg_off); - if (ret < 0) - return -EIO; + if (ret < 0) { + const int nr_bit = (pg_off - offset_in_page(start)) >> + root->fs_info->sectorsize_bits; + + result |= (1U << nr_bit); + } } - return 0; + return result; } /* @@ -4109,7 +4114,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * This is a placeholder inode for a subvolume we didn't have a * reference to at the time of the snapshot creation. In the meantime * we could have renamed the real subvol link into our snapshot, so - * depending on btrfs_del_root_ref to return -ENOENT here is incorret. + * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. * Instead simply lookup the dir_index_item for this entry so we can * remove it. Otherwise we know we have a ref to the root and we can * call btrfs_del_root_ref, and it _shouldn't_ fail. @@ -4464,20 +4469,36 @@ out: #define NEED_TRUNCATE_BLOCK 1 /* - * this can truncate away extent items, csum items and directory items. - * It starts at a high offset and removes keys until it can't find - * any higher than new_size + * Remove inode items from a given root. * - * csum items that cross the new i_size are truncated to the new size - * as well. + * @trans: A transaction handle. + * @root: The root from which to remove items. + * @inode: The inode whose items we want to remove. + * @new_size: The new i_size for the inode. This is only applicable when + * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. + * @min_type: The minimum key type to remove. All keys with a type + * greater than this value are removed and all keys with + * this type are removed only if their offset is >= @new_size. + * @extents_found: Output parameter that will contain the number of file + * extent items that were removed or adjusted to the new + * inode i_size. The caller is responsible for initializing + * the counter. Also, it can be NULL if the caller does not + * need this counter. * - * min_type is the minimum key type to truncate down to. If set to 0, this - * will kill all the items on this inode, including the INODE_ITEM_KEY. + * Remove all keys associated with the inode from the given root that have a key + * with a type greater than or equals to @min_type. When @min_type has a value of + * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value + * greater than or equals to @new_size. If a file extent item that starts before + * @new_size and ends after it is found, its length is adjusted. + * + * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is + * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. */ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, - u64 new_size, u32 min_type) + u64 new_size, u32 min_type, + u64 *extents_found) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; @@ -4623,6 +4644,9 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; + if (extents_found != NULL) + (*extents_found)++; + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; @@ -4941,7 +4965,7 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); + btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) @@ -5455,7 +5479,7 @@ void btrfs_evict_inode(struct inode *inode) trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), - 0, 0); + 0, 0, NULL); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -7937,19 +7961,17 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, btrfs_ino(BTRFS_I(inode)), pgoff); } else { - blk_status_t status; + int ret; ASSERT((start - io_bio->logical) < UINT_MAX); - status = btrfs_submit_read_repair(inode, - &io_bio->bio, - start - io_bio->logical, - bvec.bv_page, pgoff, - start, - start + sectorsize - 1, - io_bio->mirror_num, - submit_dio_repair_bio); - if (status) - err = status; + ret = btrfs_repair_one_sector(inode, + &io_bio->bio, + start - io_bio->logical, + bvec.bv_page, pgoff, + start, io_bio->mirror_num, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } start += sectorsize; ASSERT(bio_offset + sectorsize > bio_offset); @@ -7964,41 +7986,8 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, const u64 offset, const u64 bytes, const bool uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_extent *ordered = NULL; - struct btrfs_workqueue *wq; - u64 ordered_offset = offset; - u64 ordered_bytes = bytes; - u64 last_offset; - - if (btrfs_is_free_space_inode(inode)) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - - while (ordered_offset < offset + bytes) { - last_offset = ordered_offset; - if (btrfs_dec_test_first_ordered_pending(inode, &ordered, - &ordered_offset, - ordered_bytes, - uptodate)) { - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, - NULL); - btrfs_queue_work(wq, &ordered->work); - } - - /* No ordered extent found in the range, exit */ - if (ordered_offset == last_offset) - return; - /* - * Our bio might span multiple ordered extents. In this case - * we keep going until we have accounted the whole dio. - */ - if (ordered_offset < offset + bytes) { - ordered_bytes = offset + bytes - ordered_offset; - ordered = NULL; - } - } + btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, + finish_ordered_fn, uptodate); } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, @@ -8172,7 +8161,7 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, goto out_err_em; } ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), - logical, submit_len, &geom); + logical, &geom); if (ret) { status = errno_to_blk_status(ret); goto out_err_em; @@ -8276,15 +8265,14 @@ int btrfs_readpage(struct file *file, struct page *page) struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 end = start + PAGE_SIZE - 1; - unsigned long bio_flags = 0; - struct bio *bio = NULL; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; int ret; btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); - if (bio) - ret = submit_one_bio(bio, 0, bio_flags); + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + if (bio_ctrl.bio) + ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); return ret; } @@ -8353,9 +8341,9 @@ static int btrfs_migratepage(struct address_space *mapping, if (page_has_private(page)) attach_page_private(newpage, detach_page_private(page)); - if (PagePrivate2(page)) { - ClearPagePrivate2(page); - SetPagePrivate2(newpage); + if (PageOrdered(page)) { + ClearPageOrdered(page); + SetPageOrdered(newpage); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -8370,27 +8358,42 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; - struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; - u64 start; - u64 end; + u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; - bool found_ordered = false; - bool completed_ordered = false; /* - * we have the page locked, so new writeback can't start, - * and the dirty bit won't be cleared while we are here. + * We have page locked so no new ordered extent can be created on this + * page, nor bio can be submitted for this page. + * + * But already submitted bio can still be finished on this page. + * Furthermore, endio function won't skip page which has Ordered + * (Private2) already cleared, so it's possible for endio and + * invalidatepage to do the same ordered extent accounting twice + * on one page. * - * Wait for IO on this page so that we can safely clear - * the PagePrivate2 bit and do ordered accounting + * So here we wait for any submitted bios to finish, so that we won't + * do double ordered extent accounting on the same page. */ wait_on_page_writeback(page); - if (offset) { + /* + * For subpage case, we have call sites like + * btrfs_punch_hole_lock_range() which passes range not aligned to + * sectorsize. + * If the range doesn't cover the full page, we don't need to and + * shouldn't clear page extent mapped, as page->private can still + * record subpage dirty bits for other part of the range. + * + * For cases that can invalidate the full even the range doesn't + * cover the full page, like invalidating the last page, we're + * still safe to wait for ordered extent to finish. + */ + if (!(offset == 0 && length == PAGE_SIZE)) { btrfs_releasepage(page, GFP_NOFS); return; } @@ -8398,89 +8401,123 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) lock_extent_bits(tree, page_start, page_end, &cached_state); - start = page_start; -again: - ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); - if (ordered) { - found_ordered = true; - end = min(page_end, - ordered->file_offset + ordered->num_bytes - 1); + cur = page_start; + while (cur < page_end) { + struct btrfs_ordered_extent *ordered; + bool delete_states; + u64 range_end; + u32 range_len; + + ordered = btrfs_lookup_first_ordered_range(inode, cur, + page_end + 1 - cur); + if (!ordered) { + range_end = page_end; + /* + * No ordered extent covering this range, we are safe + * to delete all extent states in the range. + */ + delete_states = true; + goto next; + } + if (ordered->file_offset > cur) { + /* + * There is a range between [cur, oe->file_offset) not + * covered by any ordered extent. + * We are safe to delete all extent states, and handle + * the ordered extent in the next iteration. + */ + range_end = ordered->file_offset - 1; + delete_states = true; + goto next; + } + + range_end = min(ordered->file_offset + ordered->num_bytes - 1, + page_end); + ASSERT(range_end + 1 - cur < U32_MAX); + range_len = range_end + 1 - cur; + if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { + /* + * If Ordered (Private2) is cleared, it means endio has + * already been executed for the range. + * We can't delete the extent states as + * btrfs_finish_ordered_io() may still use some of them. + */ + delete_states = false; + goto next; + } + btrfs_page_clear_ordered(fs_info, page, cur, range_len); + /* * IO on this page will never be started, so we need to account * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW * here, must leave that up for the ordered extent completion. + * + * This will also unlock the range for incoming + * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, start, end, + clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state); + + spin_lock_irq(&inode->ordered_tree.lock); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + ordered->truncated_len = min(ordered->truncated_len, + cur - ordered->file_offset); + spin_unlock_irq(&inode->ordered_tree.lock); + + if (btrfs_dec_test_ordered_pending(inode, &ordered, + cur, range_end + 1 - cur, 1)) { + btrfs_finish_ordered_io(ordered); + /* + * The ordered extent has finished, now we're again + * safe to delete all extent states of the range. + */ + delete_states = true; + } else { + /* + * btrfs_finish_ordered_io() will get executed by endio + * of other pages, thus we can't delete extent states + * anymore + */ + delete_states = false; + } +next: + if (ordered) + btrfs_put_ordered_extent(ordered); /* - * whoever cleared the private bit is responsible - * for the finish_ordered_io + * Qgroup reserved space handler + * Sector(s) here will be either: + * + * 1) Already written to disk or bio already finished + * Then its QGROUP_RESERVED bit in io_tree is already cleared. + * Qgroup will be handled by its qgroup_record then. + * btrfs_qgroup_free_data() call will do nothing here. + * + * 2) Not written to disk yet + * Then btrfs_qgroup_free_data() call will clear the + * QGROUP_RESERVED bit of its io_tree, and free the qgroup + * reserved data space. + * Since the IO will never happen for this page. */ - if (TestClearPagePrivate2(page)) { - spin_lock_irq(&inode->ordered_tree.lock); - set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - ordered->truncated_len = min(ordered->truncated_len, - start - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree.lock); - - if (btrfs_dec_test_ordered_pending(inode, &ordered, - start, - end - start + 1, 1)) { - btrfs_finish_ordered_io(ordered); - completed_ordered = true; - } - } - btrfs_put_ordered_extent(ordered); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); if (!inode_evicting) { - cached_state = NULL; - lock_extent_bits(tree, start, end, - &cached_state); + clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_UPTODATE | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, + delete_states, &cached_state); } - - start = end + 1; - if (start < page_end) - goto again; + cur = range_end + 1; } - /* - * Qgroup reserved space handler - * Page here will be either - * 1) Already written to disk or ordered extent already submitted - * Then its QGROUP_RESERVED bit in io_tree is already cleaned. - * Qgroup will be handled by its qgroup_record then. - * btrfs_qgroup_free_data() call will do nothing here. - * - * 2) Not written to disk yet - * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED - * bit of its io_tree, and free the qgroup reserved data space. - * Since the IO will never happen for this page. + * We have iterated through all ordered extents of the page, the page + * should not have Ordered (Private2) anymore, or the above iteration + * did something wrong. */ - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); - if (!inode_evicting) { - bool delete = true; - - /* - * If there's an ordered extent for this range and we have not - * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set - * in the range for the ordered extent completion. We must also - * not delete the range, otherwise we would lose that bit (and - * any other bits set in the range). Make sure EXTENT_UPTODATE - * is cleared if we don't delete, otherwise it can lead to - * corruptions if the i_size is extented later. - */ - if (found_ordered && !completed_ordered) - delete = false; - clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, - delete, &cached_state); - + ASSERT(!PageOrdered(page)); + if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); - } - ClearPageChecked(page); clear_page_extent_mapped(page); } @@ -8626,8 +8663,8 @@ again: flush_dcache_page(page); } ClearPageChecked(page); - set_page_dirty(page); - SetPageUptodate(page); + btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); + btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); @@ -8661,6 +8698,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + u64 extents_found = 0; if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), @@ -8718,20 +8756,13 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) min_size, false); BUG_ON(ret); - /* - * So if we truncate and then write and fsync we normally would just - * write the extents that changed, which is a problem if we need to - * first truncate that entire inode. So set this flag so we write out - * all of the extents in the inode to the sync log so we're completely - * safe. - */ - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); trans->block_rsv = rsv; while (1) { ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), inode->i_size, - BTRFS_EXTENT_DATA_KEY); + BTRFS_EXTENT_DATA_KEY, + &extents_found); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; @@ -8793,6 +8824,22 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) } out: btrfs_free_block_rsv(fs_info, rsv); + /* + * So if we truncate and then write and fsync we normally would just + * write the extents that changed, which is a problem if we need to + * first truncate that entire inode. So set this flag so we write out + * all of the extents in the inode to the sync log so we're completely + * safe. + * + * If no extents were dropped or trimmed we don't need to force the next + * fsync to truncate all the inode's items from the log and re-log them + * all. This means the truncate operation did not change the file size, + * or changed it to a smaller size but there was only an implicit hole + * between the old i_size and the new i_size, and there were no prealloc + * extents beyond i_size to drop. + */ + if (extents_found > 0) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); return ret; } @@ -10199,17 +10246,21 @@ out: return ret; } -void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) { - struct inode *inode = tree->private_data; + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; struct page *page; + u32 len; + ASSERT(end + 1 - start <= U32_MAX); + len = end + 1 - start; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); + page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ - set_page_writeback(page); + + btrfs_page_set_writeback(fs_info, page, start, len); put_page(page); index++; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5dc2fd843ae3..0ba98e08a029 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -353,15 +353,55 @@ update_flags: return ret; } +/* + * Start exclusive operation @type, return true on success + */ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { - return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); + bool ret = false; + + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { + fs_info->exclusive_operation = type; + ret = true; + } + spin_unlock(&fs_info->super_lock); + + return ret; +} + +/* + * Conditionally allow to enter the exclusive operation in case it's compatible + * with the running one. This must be paired with btrfs_exclop_start_unlock and + * btrfs_exclop_finish. + * + * Compatibility: + * - the same type is already running + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller + * must check the condition first that would allow none -> @type + */ +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + spin_lock(&fs_info->super_lock); + if (fs_info->exclusive_operation == type) + return true; + + spin_unlock(&fs_info->super_lock); + return false; +} + +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) +{ + spin_unlock(&fs_info->super_lock); } void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) { + spin_lock(&fs_info->super_lock); WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + spin_unlock(&fs_info->super_lock); sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } @@ -1455,7 +1495,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (btrfs_defrag_cancelled(fs_info)) { btrfs_debug(fs_info, "defrag_file cancelled"); ret = -EAGAIN; - break; + goto error; } if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, @@ -1533,6 +1573,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } } + ret = defrag_count; +error: if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { filemap_flush(inode->i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, @@ -1546,8 +1588,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } - ret = defrag_count; - out_ra: if (do_compress) { btrfs_inode_lock(inode, 0); @@ -1560,6 +1600,48 @@ out_ra: return ret; } +/* + * Try to start exclusive operation @type or cancel it if it's running. + * + * Return: + * 0 - normal mode, newly claimed op started + * >0 - normal mode, something else is running, + * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space + * ECANCELED - cancel mode, successful cancel + * ENOTCONN - cancel mode, operation not running anymore + */ +static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type, bool cancel) +{ + if (!cancel) { + /* Start normal op */ + if (!btrfs_exclop_start(fs_info, type)) + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + /* Exclusive operation is now claimed */ + return 0; + } + + /* Cancel running op */ + if (btrfs_exclop_start_try_lock(fs_info, type)) { + /* + * This blocks any exclop finish from setting it to NONE, so we + * request cancellation. Either it runs and we will wait for it, + * or it has finished and no waiting will happen. + */ + atomic_inc(&fs_info->reloc_cancel_req); + btrfs_exclop_start_unlock(fs_info); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING, + TASK_INTERRUPTIBLE); + + return -ECANCELED; + } + + /* Something else is running or none */ + return -ENOTCONN; +} + static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { @@ -1577,6 +1659,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, char *devstr = NULL; int ret = 0; int mod = 0; + bool cancel; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1585,20 +1668,23 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { - mnt_drop_write_file(file); - return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - } - + /* + * Read the arguments before checking exclusivity to be able to + * distinguish regular resize and cancel + */ vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto out; + goto out_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - sizestr = vol_args->name; + cancel = (strcmp("cancel", sizestr) == 0); + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); + if (ret) + goto out_free; + /* Exclusive operation is now claimed */ + devstr = strchr(sizestr, ':'); if (devstr) { sizestr = devstr + 1; @@ -1606,10 +1692,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, devstr = vol_args->name; ret = kstrtoull(devstr, 10, &devid); if (ret) - goto out_free; + goto out_finish; if (!devid) { ret = -EINVAL; - goto out_free; + goto out_finish; } btrfs_info(fs_info, "resizing devid %llu", devid); } @@ -1619,7 +1705,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizer unable to find device %llu", devid); ret = -ENODEV; - goto out_free; + goto out_finish; } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1627,7 +1713,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; - goto out_free; + goto out_finish; } if (!strcmp(sizestr, "max")) @@ -1643,13 +1729,13 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = memparse(sizestr, &retptr); if (*retptr != '\0' || new_size == 0) { ret = -EINVAL; - goto out_free; + goto out_finish; } } if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { ret = -EPERM; - goto out_free; + goto out_finish; } old_size = btrfs_device_get_total_bytes(device); @@ -1657,24 +1743,24 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (mod < 0) { if (new_size > old_size) { ret = -EINVAL; - goto out_free; + goto out_finish; } new_size = old_size - new_size; } else if (mod > 0) { if (new_size > ULLONG_MAX - old_size) { ret = -ERANGE; - goto out_free; + goto out_finish; } new_size = old_size + new_size; } if (new_size < SZ_256M) { ret = -EINVAL; - goto out_free; + goto out_finish; } if (new_size > device->bdev->bd_inode->i_size) { ret = -EFBIG; - goto out_free; + goto out_finish; } new_size = round_down(new_size, fs_info->sectorsize); @@ -1683,7 +1769,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - goto out_free; + goto out_finish; } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans); @@ -1696,10 +1782,11 @@ static noinline int btrfs_ioctl_resize(struct file *file, "resize device %s (devid %llu) from %llu to %llu", rcu_str_deref(device->name), device->devid, old_size, new_size); +out_finish: + btrfs_exclop_finish(fs_info); out_free: kfree(vol_args); -out: - btrfs_exclop_finish(fs_info); +out_drop: mnt_drop_write_file(file); return ret; } @@ -2897,7 +2984,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = PTR_ERR(subvol_name_ptr); goto free_parent; } - /* subvol_name_ptr is already NULL termined */ + /* subvol_name_ptr is already nul terminated */ subvol_name = (char *)kbasename(subvol_name_ptr); } } else { @@ -3119,6 +3206,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; + bool cancel = false; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3137,18 +3225,22 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) ret = -EOPNOTSUPP; goto out; } + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) && + strcmp("cancel", vol_args->name) == 0) + cancel = true; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, + cancel); + if (ret) goto out; - } + /* Exclusive operation is now claimed */ - if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); - } else { - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + else ret = btrfs_rm_device(fs_info, vol_args->name, 0); - } + btrfs_exclop_finish(fs_info); if (!ret) { @@ -3172,6 +3264,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; int ret; + bool cancel; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3180,25 +3273,24 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - goto out_drop_write; - } - vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto out; + goto out_drop_write; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_rm_device(fs_info, vol_args->name, 0); + cancel = (strcmp("cancel", vol_args->name) == 0); + + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, + cancel); + if (ret == 0) { + ret = btrfs_rm_device(fs_info, vol_args->name, 0); + if (!ret) + btrfs_info(fs_info, "disk deleted %s", vol_args->name); + btrfs_exclop_finish(fs_info); + } - if (!ret) - btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); -out: - btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3551,7 +3643,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, goto out; } transid = trans->transid; - ret = btrfs_commit_transaction_async(trans, 0); + ret = btrfs_commit_transaction_async(trans); if (ret) { btrfs_end_transaction(trans); return ret; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5fafc5e89bb7..313d9d685adb 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,7 +57,7 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) /* * Try-lock for read. * - * Retrun 1 if the rwlock has been taken, 0 otherwise + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { @@ -72,7 +72,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) /* * Try-lock for write. * - * Retrun 1 if the rwlock has been taken, 0 otherwise + * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6c413bb451a3..6eb41b7c0c84 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -16,6 +16,7 @@ #include "compression.h" #include "delalloc-space.h" #include "qgroup.h" +#include "subpage.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -300,81 +301,142 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, } /* - * Finish IO for one ordered extent across a given range. The range can - * contain several ordered extents. + * Mark all ordered extents io inside the specified range finished. * - * @found_ret: Return the finished ordered extent - * @file_offset: File offset for the finished IO - * Will also be updated to one byte past the range that is - * recordered as finished. This allows caller to walk forward. - * @io_size: Length of the finish IO range - * @uptodate: If the IO finished without problem - * - * Return true if any ordered extent is finished in the range, and update - * @found_ret and @file_offset. - * Return false otherwise. + * @page: The invovled page for the opeartion. + * For uncompressed buffered IO, the page status also needs to be + * updated to indicate whether the pending ordered io is finished. + * Can be NULL for direct IO and compressed write. + * For these cases, callers are ensured they won't execute the + * endio function twice. + * @finish_func: The function to be executed when all the IO of an ordered + * extent are finished. * - * NOTE: Although The range can cross multiple ordered extents, only one - * ordered extent will be updated during one call. The caller is responsible to - * iterate all ordered extents in the range. + * This function is called for endio, thus the range must have ordered + * extent(s) coveri it. */ -bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **finished_ret, - u64 *file_offset, u64 io_size, int uptodate) +void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + struct page *page, u64 file_offset, + u64 num_bytes, btrfs_func_t finish_func, + bool uptodate) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_workqueue *wq; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - bool finished = false; unsigned long flags; - u64 dec_end; - u64 dec_start; - u64 to_dec; + u64 cur = file_offset; + + if (btrfs_is_free_space_inode(inode)) + wq = fs_info->endio_freespace_worker; + else + wq = fs_info->endio_write_workers; + + if (page) + ASSERT(page->mapping && page_offset(page) <= file_offset && + file_offset + num_bytes <= page_offset(page) + PAGE_SIZE); spin_lock_irqsave(&tree->lock, flags); - node = tree_search(tree, *file_offset); - if (!node) - goto out; + while (cur < file_offset + num_bytes) { + u64 entry_end; + u64 end; + u32 len; - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!in_range(*file_offset, entry->file_offset, entry->num_bytes)) - goto out; + node = tree_search(tree, cur); + /* No ordered extents at all */ + if (!node) + break; - dec_start = max(*file_offset, entry->file_offset); - dec_end = min(*file_offset + io_size, - entry->file_offset + entry->num_bytes); - *file_offset = dec_end; - if (dec_start > dec_end) { - btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu", - dec_start, dec_end); - } - to_dec = dec_end - dec_start; - if (to_dec > entry->bytes_left) { - btrfs_crit(fs_info, - "bad ordered accounting left %llu size %llu", - entry->bytes_left, to_dec); - } - entry->bytes_left -= to_dec; - if (!uptodate) - set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + entry_end = entry->file_offset + entry->num_bytes; + /* + * |<-- OE --->| | + * cur + * Go to next OE. + */ + if (cur >= entry_end) { + node = rb_next(node); + /* No more ordered extents, exit */ + if (!node) + break; + entry = rb_entry(node, struct btrfs_ordered_extent, + rb_node); + + /* Go to next ordered extent and continue */ + cur = entry->file_offset; + continue; + } + /* + * | |<--- OE --->| + * cur + * Go to the start of OE. + */ + if (cur < entry->file_offset) { + cur = entry->file_offset; + continue; + } - if (entry->bytes_left == 0) { /* - * Ensure only one caller can set the flag and finished_ret - * accordingly + * Now we are definitely inside one ordered extent. + * + * |<--- OE --->| + * | + * cur */ - finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - /* test_and_set_bit implies a barrier */ - cond_wake_up_nomb(&entry->wait); - } -out: - if (finished && finished_ret && entry) { - *finished_ret = entry; - refcount_inc(&entry->refs); + end = min(entry->file_offset + entry->num_bytes, + file_offset + num_bytes) - 1; + ASSERT(end + 1 - cur < U32_MAX); + len = end + 1 - cur; + + if (page) { + /* + * Ordered (Private2) bit indicates whether we still + * have pending io unfinished for the ordered extent. + * + * If there's no such bit, we need to skip to next range. + */ + if (!btrfs_page_test_ordered(fs_info, page, cur, len)) { + cur += len; + continue; + } + btrfs_page_clear_ordered(fs_info, page, cur, len); + } + + /* Now we're fine to update the accounting */ + if (unlikely(len > entry->bytes_left)) { + WARN_ON(1); + btrfs_crit(fs_info, +"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu", + inode->root->root_key.objectid, + btrfs_ino(inode), + entry->file_offset, + entry->num_bytes, + len, entry->bytes_left); + entry->bytes_left = 0; + } else { + entry->bytes_left -= len; + } + + if (!uptodate) + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); + + /* + * All the IO of the ordered extent is finished, we need to queue + * the finish_func to be executed. + */ + if (entry->bytes_left == 0) { + set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + cond_wake_up(&entry->wait); + refcount_inc(&entry->refs); + spin_unlock_irqrestore(&tree->lock, flags); + btrfs_init_work(&entry->work, finish_func, NULL, NULL); + btrfs_queue_work(wq, &entry->work); + spin_lock_irqsave(&tree->lock, flags); + } + cur += len; } spin_unlock_irqrestore(&tree->lock, flags); - return finished; } /* @@ -870,6 +932,81 @@ out: } /* + * Lookup the first ordered extent that overlaps the range + * [@file_offset, @file_offset + @len). + * + * The difference between this and btrfs_lookup_first_ordered_extent() is + * that this one won't return any ordered extent that does not overlap the range. + * And the difference against btrfs_lookup_ordered_extent() is, this function + * ensures the first ordered extent gets returned. + */ +struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *node; + struct rb_node *cur; + struct rb_node *prev; + struct rb_node *next; + struct btrfs_ordered_extent *entry = NULL; + + spin_lock_irq(&tree->lock); + node = tree->tree.rb_node; + /* + * Here we don't want to use tree_search() which will use tree->last + * and screw up the search order. + * And __tree_search() can't return the adjacent ordered extents + * either, thus here we do our own search. + */ + while (node) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) { + node = node->rb_left; + } else if (file_offset >= entry_end(entry)) { + node = node->rb_right; + } else { + /* + * Direct hit, got an ordered extent that starts at + * @file_offset + */ + goto out; + } + } + if (!entry) { + /* Empty tree */ + goto out; + } + + cur = &entry->rb_node; + /* We got an entry around @file_offset, check adjacent entries */ + if (entry->file_offset < file_offset) { + prev = cur; + next = rb_next(cur); + } else { + prev = rb_prev(cur); + next = cur; + } + if (prev) { + entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + goto out; + } + if (next) { + entry = rb_entry(next, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + goto out; + } + /* No ordered extent in the range */ + entry = NULL; +out: + if (entry) + refcount_inc(&entry->refs); + spin_unlock_irq(&tree->lock); + return entry; +} + +/* * btrfs_flush_ordered_range - Lock the passed range and ensures all pending * ordered extents in it are run to completion. * diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e60c07f36427..566472004edd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -172,13 +172,13 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); +void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + struct page *page, u64 file_offset, + u64 num_bytes, btrfs_func_t finish_func, + bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); -bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, - struct btrfs_ordered_extent **finished_ret, - u64 *file_offset, u64 io_size, - int uptodate); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); @@ -196,6 +196,8 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( + struct btrfs_inode *inode, u64 file_offset, u64 len); struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 2dcb1cb21634..b1cb5a8c2999 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -260,6 +260,10 @@ static int prop_compression_validate(const char *value, size_t len) if (btrfs_compress_is_valid_type(value, len)) return 0; + if ((len == 2 && strncmp("no", value, 2) == 0) || + (len == 4 && strncmp("none", value, 4) == 0)) + return 0; + return -EINVAL; } @@ -269,7 +273,17 @@ static int prop_compression_apply(struct inode *inode, const char *value, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int type; + /* Reset to defaults */ if (len == 0) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + return 0; + } + + /* Set NOCOMPRESS flag */ + if ((len == 2 && strncmp("no", value, 2) == 0) || + (len == 4 && strncmp("none", value, 4) == 0)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; @@ -348,7 +362,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, /* * This is not strictly necessary as the property should be - * valid, but in case it isn't, don't propagate it futher. + * valid, but in case it isn't, don't propagate it further. */ ret = h->validate(value, strlen(value)); if (ret) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3ded812f522c..07ec06d4e972 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2521,7 +2521,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, int ret = 0; /* - * If quotas get disabled meanwhile, the resouces need to be freed and + * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. */ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) @@ -3545,13 +3545,7 @@ static int try_flush_qgroup(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - /* - * Can't hold an open transaction or we run the risk of deadlocking, - * and can't either be under the context of a send operation (where - * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that - * would result in a crash when starting a transaction and does not - * make sense either (send is a read-only operation). - */ + /* Can't hold an open transaction or we run the risk of deadlocking. */ ASSERT(current->journal_info == NULL); if (WARN_ON(current->journal_info)) return 0; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9178da07cc9c..9b0814318e72 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -7,6 +7,7 @@ #include "delalloc-space.h" #include "reflink.h" #include "transaction.h" +#include "subpage.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M @@ -52,7 +53,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, const u64 datal, const u8 comp_type) { - const u64 block_size = btrfs_inode_sectorsize(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 block_size = fs_info->sectorsize; const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); @@ -106,10 +108,12 @@ static int copy_inline_to_page(struct btrfs_inode *inode, set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); if (comp_type == BTRFS_COMPRESS_NONE) { - memcpy_to_page(page, 0, data_start, datal); + memcpy_to_page(page, offset_in_page(file_offset), data_start, + datal); flush_dcache_page(page); } else { - ret = btrfs_decompress(comp_type, data_start, page, 0, + ret = btrfs_decompress(comp_type, data_start, page, + offset_in_page(file_offset), inline_size, datal); if (ret) goto out_unlock; @@ -133,9 +137,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode, flush_dcache_page(page); } - SetPageUptodate(page); + btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); ClearPageChecked(page); - set_page_dirty(page); + btrfs_page_set_dirty(fs_info, page, file_offset, block_size); out_unlock: if (page) { unlock_page(page); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b70be2ac2e9e..fc831597cb22 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2876,11 +2876,12 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, } /* - * Allow error injection to test balance cancellation + * Allow error injection to test balance/relocation cancellation */ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || + atomic_read(&fs_info->reloc_cancel_req) || fatal_signal_pending(current); } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); @@ -3780,6 +3781,60 @@ out: return inode; } +/* + * Mark start of chunk relocation that is cancellable. Check if the cancellation + * has been requested meanwhile and don't start in that case. + * + * Return: + * 0 success + * -EINPROGRESS operation is already in progress, that's probably a bug + * -ECANCELED cancellation request was set before the operation started + * -EAGAIN can not start because there are ongoing send operations + */ +static int reloc_chunk_start(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->send_reloc_lock); + if (fs_info->send_in_progress) { + btrfs_warn_rl(fs_info, +"cannot run relocation while send operations are in progress (%d in progress)", + fs_info->send_in_progress); + spin_unlock(&fs_info->send_reloc_lock); + return -EAGAIN; + } + if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { + /* This should not happen */ + spin_unlock(&fs_info->send_reloc_lock); + btrfs_err(fs_info, "reloc already running, cannot start"); + return -EINPROGRESS; + } + spin_unlock(&fs_info->send_reloc_lock); + + if (atomic_read(&fs_info->reloc_cancel_req) > 0) { + btrfs_info(fs_info, "chunk relocation canceled on start"); + /* + * On cancel, clear all requests but let the caller mark + * the end after cleanup operations. + */ + atomic_set(&fs_info->reloc_cancel_req, 0); + return -ECANCELED; + } + return 0; +} + +/* + * Mark end of chunk relocation that is cancellable and wake any waiters. + */ +static void reloc_chunk_end(struct btrfs_fs_info *fs_info) +{ + /* Requested after start, clear bit first so any waiters can continue */ + if (atomic_read(&fs_info->reloc_cancel_req) > 0) + btrfs_info(fs_info, "chunk relocation canceled during operation"); + spin_lock(&fs_info->send_reloc_lock); + clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); + spin_unlock(&fs_info->send_reloc_lock); + atomic_set(&fs_info->reloc_cancel_req, 0); +} + static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -3862,6 +3917,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) return -ENOMEM; } + ret = reloc_chunk_start(fs_info); + if (ret < 0) { + err = ret; + goto out_put_bg; + } + rc->extent_root = extent_root; rc->block_group = bg; @@ -3952,7 +4013,9 @@ out: if (err && rw) btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); - btrfs_put_block_group(rc->block_group); +out_put_bg: + btrfs_put_block_group(bg); + reloc_chunk_end(fs_info); free_reloc_control(rc); return err; } @@ -4073,6 +4136,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) goto out; } + ret = reloc_chunk_start(fs_info); + if (ret < 0) { + err = ret; + goto out_end; + } + rc->extent_root = fs_info->extent_root; set_reloc_control(rc); @@ -4137,6 +4206,8 @@ out_clean: err = ret; out_unset: unset_reloc_control(rc); +out_end: + reloc_chunk_end(fs_info); free_reloc_control(rc); out: free_reloc_roots(&reloc_roots); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 485cda3eb8d7..088641ba7a8e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -165,6 +165,10 @@ struct scrub_ctx { int readonly; int pages_per_rd_bio; + /* State of IO submission throttling affecting the associated device */ + ktime_t throttle_deadline; + u64 throttle_sent; + int is_dev_replace; u64 write_pointer; @@ -605,6 +609,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( spin_lock_init(&sctx->list_lock); spin_lock_init(&sctx->stat_lock); init_waitqueue_head(&sctx->list_wait); + sctx->throttle_deadline = 0; WARN_ON(sctx->wr_curr_bio != NULL); mutex_init(&sctx->wr_lock); @@ -626,7 +631,6 @@ nomem: static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *warn_ctx) { - u64 isize; u32 nlink; int ret; int i; @@ -662,7 +666,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, eb = swarn->path->nodes[0]; inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], struct btrfs_inode_item); - isize = btrfs_inode_size(eb, inode_item); nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(swarn->path); @@ -691,12 +694,12 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", +"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), swarn->physical, root, inum, offset, - min(isize - offset, (u64)PAGE_SIZE), nlink, + fs_info->sectorsize, nlink, (char *)(unsigned long)ipath->fspath->val[i]); btrfs_put_root(local_root); @@ -885,25 +888,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * read all mirrors one after the other. This includes to * re-read the extent or metadata block that failed (that was * the cause that this fixup code is called) another time, - * page by page this time in order to know which pages + * sector by sector this time in order to know which sectors * caused I/O errors and which ones are good (for all mirrors). * It is the goal to handle the situation when more than one * mirror contains I/O errors, but the errors do not * overlap, i.e. the data can be repaired by selecting the - * pages from those mirrors without I/O error on the - * particular pages. One example (with blocks >= 2 * PAGE_SIZE) - * would be that mirror #1 has an I/O error on the first page, - * the second page is good, and mirror #2 has an I/O error on - * the second page, but the first page is good. - * Then the first page of the first mirror can be repaired by - * taking the first page of the second mirror, and the - * second page of the second mirror can be repaired by - * copying the contents of the 2nd page of the 1st mirror. - * One more note: if the pages of one mirror contain I/O + * sectors from those mirrors without I/O error on the + * particular sectors. One example (with blocks >= 2 * sectorsize) + * would be that mirror #1 has an I/O error on the first sector, + * the second sector is good, and mirror #2 has an I/O error on + * the second sector, but the first sector is good. + * Then the first sector of the first mirror can be repaired by + * taking the first sector of the second mirror, and the + * second sector of the second mirror can be repaired by + * copying the contents of the 2nd sector of the 1st mirror. + * One more note: if the sectors of one mirror contain I/O * errors, the checksum cannot be verified. In order to get * the best data for repairing, the first attempt is to find * a mirror without I/O errors and with a validated checksum. - * Only if this is not possible, the pages are picked from + * Only if this is not possible, the sectors are picked from * mirrors with I/O errors without considering the checksum. * If the latter is the case, at the end, the checksum of the * repaired area is verified in order to correctly maintain @@ -1060,26 +1063,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* * In case of I/O errors in the area that is supposed to be - * repaired, continue by picking good copies of those pages. - * Select the good pages from mirrors to rewrite bad pages from + * repaired, continue by picking good copies of those sectors. + * Select the good sectors from mirrors to rewrite bad sectors from * the area to fix. Afterwards verify the checksum of the block * that is supposed to be repaired. This verification step is * only done for the purpose of statistic counting and for the * final scrub report, whether errors remain. * A perfect algorithm could make use of the checksum and try - * all possible combinations of pages from the different mirrors + * all possible combinations of sectors from the different mirrors * until the checksum verification succeeds. For example, when - * the 2nd page of mirror #1 faces I/O errors, and the 2nd page + * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector * of mirror #2 is readable but the final checksum test fails, - * then the 2nd page of mirror #3 could be tried, whether now + * then the 2nd sector of mirror #3 could be tried, whether now * the final checksum succeeds. But this would be a rare * exception and is therefore not implemented. At least it is * avoided that the good copy is overwritten. * A more useful improvement would be to pick the sectors * without I/O error based on sector sizes (512 bytes on legacy - * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one + * disks) instead of on sectorsize. Then maybe 512 byte of one * mirror could be repaired by taking 512 byte of a different - * mirror, even if other 512 byte sectors in the same PAGE_SIZE + * mirror, even if other 512 byte sectors in the same sectorsize * area are unreadable. */ success = 1; @@ -1260,7 +1263,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, { struct scrub_ctx *sctx = original_sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = original_sblock->page_count * PAGE_SIZE; + u64 length = original_sblock->page_count * fs_info->sectorsize; u64 logical = original_sblock->pagev[0]->logical; u64 generation = original_sblock->pagev[0]->generation; u64 flags = original_sblock->pagev[0]->flags; @@ -1283,13 +1286,13 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, */ while (length > 0) { - sublen = min_t(u64, length, PAGE_SIZE); + sublen = min_t(u64, length, fs_info->sectorsize); mapped_length = sublen; bbio = NULL; /* - * with a length of PAGE_SIZE, each returned stripe - * represents one mirror + * With a length of sectorsize, each returned stripe represents + * one mirror */ btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, @@ -1480,7 +1483,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, bio = btrfs_io_bio_alloc(1); bio_set_dev(bio, spage->dev->bdev); - bio_add_page(bio, spage->page, PAGE_SIZE, 0); + bio_add_page(bio, spage->page, fs_info->sectorsize, 0); bio->bi_iter.bi_sector = spage->physical >> 9; bio->bi_opf = REQ_OP_READ; @@ -1544,6 +1547,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; struct scrub_page *spage_good = sblock_good->pagev[page_num]; struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; + const u32 sectorsize = fs_info->sectorsize; BUG_ON(spage_bad->page == NULL); BUG_ON(spage_good->page == NULL); @@ -1563,8 +1567,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, bio->bi_iter.bi_sector = spage_bad->physical >> 9; bio->bi_opf = REQ_OP_WRITE; - ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0); - if (PAGE_SIZE != ret) { + ret = bio_add_page(bio, spage_good->page, sectorsize, 0); + if (ret != sectorsize) { bio_put(bio); return -EIO; } @@ -1642,6 +1646,7 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, { struct scrub_bio *sbio; int ret; + const u32 sectorsize = sctx->fs_info->sectorsize; mutex_lock(&sctx->wr_lock); again: @@ -1681,16 +1686,16 @@ again: bio->bi_iter.bi_sector = sbio->physical >> 9; bio->bi_opf = REQ_OP_WRITE; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + } else if (sbio->physical + sbio->page_count * sectorsize != spage->physical_for_dev_replace || - sbio->logical + sbio->page_count * PAGE_SIZE != + sbio->logical + sbio->page_count * sectorsize != spage->logical) { scrub_wr_submit(sctx); goto again; } - ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { + ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + if (ret != sectorsize) { if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; @@ -1729,7 +1734,8 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) btrfsic_submit_bio(sbio->bio); if (btrfs_is_zoned(sctx->fs_info)) - sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE; + sctx->write_pointer = sbio->physical + sbio->page_count * + sctx->fs_info->sectorsize; } static void scrub_wr_bio_end_io(struct bio *bio) @@ -1988,6 +1994,65 @@ static void scrub_page_put(struct scrub_page *spage) } } +/* + * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 + * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. + */ +static void scrub_throttle(struct scrub_ctx *sctx) +{ + const int time_slice = 1000; + struct scrub_bio *sbio; + struct btrfs_device *device; + s64 delta; + ktime_t now; + u32 div; + u64 bwlimit; + + sbio = sctx->bios[sctx->curr]; + device = sbio->dev; + bwlimit = READ_ONCE(device->scrub_speed_max); + if (bwlimit == 0) + return; + + /* + * Slice is divided into intervals when the IO is submitted, adjust by + * bwlimit and maximum of 64 intervals. + */ + div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); + div = min_t(u32, 64, div); + + /* Start new epoch, set deadline */ + now = ktime_get(); + if (sctx->throttle_deadline == 0) { + sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); + sctx->throttle_sent = 0; + } + + /* Still in the time to send? */ + if (ktime_before(now, sctx->throttle_deadline)) { + /* If current bio is within the limit, send it */ + sctx->throttle_sent += sbio->bio->bi_iter.bi_size; + if (sctx->throttle_sent <= div_u64(bwlimit, div)) + return; + + /* We're over the limit, sleep until the rest of the slice */ + delta = ktime_ms_delta(sctx->throttle_deadline, now); + } else { + /* New request after deadline, start new epoch */ + delta = 0; + } + + if (delta) { + long timeout; + + timeout = div_u64(delta * HZ, 1000); + schedule_timeout_interruptible(timeout); + } + + /* Next call will start the deadline period */ + sctx->throttle_deadline = 0; +} + static void scrub_submit(struct scrub_ctx *sctx) { struct scrub_bio *sbio; @@ -1995,6 +2060,8 @@ static void scrub_submit(struct scrub_ctx *sctx) if (sctx->curr == -1) return; + scrub_throttle(sctx); + sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); @@ -2006,6 +2073,7 @@ static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, { struct scrub_block *sblock = spage->sblock; struct scrub_bio *sbio; + const u32 sectorsize = sctx->fs_info->sectorsize; int ret; again: @@ -2044,9 +2112,9 @@ again: bio->bi_iter.bi_sector = sbio->physical >> 9; bio->bi_opf = REQ_OP_READ; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * PAGE_SIZE != + } else if (sbio->physical + sbio->page_count * sectorsize != spage->physical || - sbio->logical + sbio->page_count * PAGE_SIZE != + sbio->logical + sbio->page_count * sectorsize != spage->logical || sbio->dev != spage->dev) { scrub_submit(sctx); @@ -2054,8 +2122,8 @@ again: } sbio->pagev[sbio->page_count] = spage; - ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); - if (ret != PAGE_SIZE) { + ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + if (ret != sectorsize) { if (sbio->page_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; @@ -2398,7 +2466,7 @@ static void scrub_block_complete(struct scrub_block *sblock) if (sblock->sparity && corrupted && !sblock->data_corrected) { u64 start = sblock->pagev[0]->logical; u64 end = sblock->pagev[sblock->page_count - 1]->logical + - PAGE_SIZE; + sblock->sctx->fs_info->sectorsize; ASSERT(end - start <= U32_MAX); scrub_parity_mark_sectors_error(sblock->sparity, @@ -2418,7 +2486,7 @@ static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *su * the csum into @csum. * * The search source is sctx->csum_list, which is a pre-populated list - * storing bytenr ordered csum ranges. We're reponsible to cleanup any range + * storing bytenr ordered csum ranges. We're responsible to cleanup any range * that is before @logical. * * Return 0 if there is no csum for the range. @@ -3138,28 +3206,23 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, physical = map->stripes[num].physical; offset = 0; nstripes = div64_u64(length, map->stripe_len); + mirror_num = 1; + increment = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; - mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { int factor = map->num_stripes / map->sub_stripes; offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; mirror_num = num % map->sub_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical, num, map, &offset, NULL); increment = map->stripe_len * nr_data_stripes(map); - mirror_num = 1; - } else { - increment = map->stripe_len; - mirror_num = 1; } path = btrfs_alloc_path(); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bd69db72acc5..6ac37ae6c811 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2078,16 +2078,6 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx, } /* - * Removes the entry from the list and adds it back to the end. This marks the - * entry as recently used so that name_cache_clean_unused does not remove it. - */ -static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce) -{ - list_del(&nce->list); - list_add_tail(&nce->list, &sctx->name_cache_list); -} - -/* * Remove some entries from the beginning of name_cache_list. */ static void name_cache_clean_unused(struct send_ctx *sctx) @@ -2147,7 +2137,13 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, kfree(nce); nce = NULL; } else { - name_cache_used(sctx, nce); + /* + * Removes the entry from the list and adds it back to + * the end. This marks the entry as recently used so + * that name_cache_clean_unused does not remove it. + */ + list_move_tail(&nce->list, &sctx->name_cache_list); + *parent_ino = nce->parent_ino; *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); @@ -4064,6 +4060,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; } else { + /* + * If we previously orphanized a directory that + * collided with a new reference that we already + * processed, recompute the current path because + * that directory may be part of the path. + */ + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; @@ -6507,7 +6514,7 @@ static int changed_extent(struct send_ctx *sctx, * updates the inode item, but it only changes the iversion (sequence * field in the inode item) of the inode, so if a file is deduplicated * the same amount of times in both the parent and send snapshots, its - * iversion becames the same in both snapshots, whence the inode item is + * iversion becomes the same in both snapshots, whence the inode item is * the same on both snapshots. */ if (sctx->cur_ino != sctx->cmp_key->objectid) @@ -7409,23 +7416,21 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; - mutex_lock(&fs_info->balance_mutex); - if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - mutex_unlock(&fs_info->balance_mutex); + spin_lock(&fs_info->send_reloc_lock); + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { + spin_unlock(&fs_info->send_reloc_lock); btrfs_warn_rl(fs_info, - "cannot run send because a balance operation is in progress"); + "cannot run send because a relocation operation is in progress"); ret = -EAGAIN; goto out; } fs_info->send_in_progress++; - mutex_unlock(&fs_info->balance_mutex); + spin_unlock(&fs_info->send_reloc_lock); - current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); - current->journal_info = NULL; - mutex_lock(&fs_info->balance_mutex); + spin_lock(&fs_info->send_reloc_lock); fs_info->send_in_progress--; - mutex_unlock(&fs_info->balance_mutex); + spin_unlock(&fs_info->send_reloc_lock); if (ret < 0) goto out; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2dc674b7c3b1..f79bf85f2439 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -133,18 +133,13 @@ * operations, however they won't be usable until the transaction commits. * * COMMIT_TRANS - * may_commit_transaction() is the ultimate arbiter on whether we commit the - * transaction or not. In order to avoid constantly churning we do all the - * above flushing first and then commit the transaction as the last resort. - * However we need to take into account things like pinned space that would - * be freed, plus any delayed work we may not have gotten rid of in the case - * of metadata. - * - * FORCE_COMMIT_TRANS - * For use by the preemptive flusher. We use this to bypass the ticketing - * checks in may_commit_transaction, as we have more information about the - * overall state of the system and may want to commit the transaction ahead - * of actual ENOSPC conditions. + * This will commit the transaction. Historically we had a lot of logic + * surrounding whether or not we'd commit the transaction, but this waits born + * out of a pre-tickets era where we could end up committing the transaction + * thousands of times in a row without making progress. Now thanks to our + * ticketing system we know if we're not making progress and can error + * everybody out after a few commits rather than burning the disk hoping for + * a different answer. * * OVERCOMMIT * @@ -197,13 +192,6 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (!space_info) return -ENOMEM; - ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, - GFP_KERNEL); - if (ret) { - kfree(space_info); - return ret; - } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); @@ -389,7 +377,7 @@ again: ticket = list_first_entry(head, struct reserve_ticket, list); - /* Check and see if our ticket can be satisified now. */ + /* Check and see if our ticket can be satisfied now. */ if ((used + ticket->bytes <= space_info->total_bytes) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { @@ -495,7 +483,8 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, */ static void shrink_delalloc(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - u64 to_reclaim, bool wait_ordered) + u64 to_reclaim, bool wait_ordered, + bool for_preempt) { struct btrfs_trans_handle *trans; u64 delalloc_bytes; @@ -532,7 +521,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, * ordered extents, otherwise we'll waste time trying to flush delalloc * that likely won't give us the space back we need. */ - if (ordered_bytes > delalloc_bytes) + if (ordered_bytes > delalloc_bytes && !for_preempt) wait_ordered = true; loops = 0; @@ -551,6 +540,14 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, break; } + /* + * If we are for preemption we just want a one-shot of delalloc + * flushing so we can stop flushing if we decide we don't need + * to anymore. + */ + if (for_preempt) + break; + spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { @@ -566,109 +563,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, } } -/** - * Possibly commit the transaction if its ok to - * - * @fs_info: the filesystem - * @space_info: space_info we are checking for commit, either data or metadata - * - * This will check to make sure that committing the transaction will actually - * get us somewhere and then commit the transaction if it does. Otherwise it - * will return -ENOSPC. - */ -static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) -{ - struct reserve_ticket *ticket = NULL; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; - struct btrfs_trans_handle *trans; - u64 reclaim_bytes = 0; - u64 bytes_needed = 0; - u64 cur_free_bytes = 0; - - trans = (struct btrfs_trans_handle *)current->journal_info; - if (trans) - return -EAGAIN; - - spin_lock(&space_info->lock); - cur_free_bytes = btrfs_space_info_used(space_info, true); - if (cur_free_bytes < space_info->total_bytes) - cur_free_bytes = space_info->total_bytes - cur_free_bytes; - else - cur_free_bytes = 0; - - if (!list_empty(&space_info->priority_tickets)) - ticket = list_first_entry(&space_info->priority_tickets, - struct reserve_ticket, list); - else if (!list_empty(&space_info->tickets)) - ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); - if (ticket) - bytes_needed = ticket->bytes; - - if (bytes_needed > cur_free_bytes) - bytes_needed -= cur_free_bytes; - else - bytes_needed = 0; - spin_unlock(&space_info->lock); - - if (!bytes_needed) - return 0; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * See if there is enough pinned space to make this reservation, or if - * we have block groups that are going to be freed, allowing us to - * possibly do a chunk allocation the next loop through. - */ - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || - __percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) - goto commit; - - /* - * See if there is some space in the delayed insertion reserve for this - * reservation. If the space_info's don't match (like for DATA or - * SYSTEM) then just go enospc, reclaiming this space won't recover any - * space to satisfy those reservations. - */ - if (space_info != delayed_rsv->space_info) - goto enospc; - - spin_lock(&delayed_rsv->lock); - reclaim_bytes += delayed_rsv->reserved; - spin_unlock(&delayed_rsv->lock); - - spin_lock(&delayed_refs_rsv->lock); - reclaim_bytes += delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - - spin_lock(&trans_rsv->lock); - reclaim_bytes += trans_rsv->reserved; - spin_unlock(&trans_rsv->lock); - - if (reclaim_bytes >= bytes_needed) - goto commit; - bytes_needed -= reclaim_bytes; - - if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) - goto enospc; - -commit: - return btrfs_commit_transaction(trans); -enospc: - btrfs_end_transaction(trans); - return -ENOSPC; -} - /* * Try to flush some data based on policy set by @state. This is only advisory * and may fail for various reasons. The caller is supposed to examine the @@ -702,7 +596,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: shrink_delalloc(fs_info, space_info, num_bytes, - state == FLUSH_DELALLOC_WAIT); + state == FLUSH_DELALLOC_WAIT, for_preempt); break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: @@ -743,9 +637,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, btrfs_wait_on_delayed_iputs(fs_info); break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info); - break; - case FORCE_COMMIT_TRANS: + ASSERT(current->journal_info == NULL); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -792,12 +684,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info) { + u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; u64 thresh = div_factor_fine(space_info->total_bytes, 98); u64 used; /* If we're just plain full then async reclaim just slows us down. */ - if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) + if ((space_info->bytes_used + space_info->bytes_reserved + + global_rsv_size) >= thresh) return false; /* @@ -838,8 +732,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, thresh = calc_available_free_space(fs_info, space_info, BTRFS_RESERVE_FLUSH_ALL); - thresh += (space_info->total_bytes - space_info->bytes_used - - space_info->bytes_reserved - space_info->bytes_readonly); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_readonly + global_rsv_size; + if (used < space_info->total_bytes) + thresh += space_info->total_bytes - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -860,14 +756,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * clearly be heavy enough to warrant preemptive flushing. In the case * of heavy DIO or ordered reservations, preemptive flushing will just * waste time and cause us to slow down. + * + * We want to make sure we truly are maxed out on ordered however, so + * cut ordered in half, and if it's still higher than delalloc then we + * can keep flushing. This is to avoid the case where we start + * flushing, and now delalloc == ordered and we stop preemptively + * flushing when we could still have several gigs of delalloc to flush. */ - ordered = percpu_counter_read_positive(&fs_info->ordered_bytes); + ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1; delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); if (ordered >= delalloc) used += fs_info->delayed_refs_rsv.reserved + fs_info->delayed_block_rsv.reserved; else - used += space_info->bytes_may_use; + used += space_info->bytes_may_use - global_rsv_size; return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); @@ -921,7 +823,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; - u64 first_ticket_bytes = 0; if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); @@ -937,21 +838,6 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, steal_from_global_rsv(fs_info, space_info, ticket)) return true; - /* - * may_commit_transaction will avoid committing the transaction - * if it doesn't feel like the space reclaimed by the commit - * would result in the ticket succeeding. However if we have a - * smaller ticket in the queue it may be small enough to be - * satisified by committing the transaction, so if any - * subsequent ticket is smaller than the first ticket go ahead - * and send us back for another loop through the enospc flushing - * code. - */ - if (first_ticket_bytes == 0) - first_ticket_bytes = ticket->bytes; - else if (first_ticket_bytes > ticket->bytes) - return true; - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_info(fs_info, "failing ticket with %llu bytes", ticket->bytes); @@ -1117,7 +1003,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) (delayed_block_rsv->reserved + delayed_refs_rsv->reserved)) { to_reclaim = space_info->bytes_pinned; - flush = FORCE_COMMIT_TRANS; + flush = COMMIT_TRANS; } else if (delayed_block_rsv->reserved > delayed_refs_rsv->reserved) { to_reclaim = delayed_block_rsv->reserved; @@ -1171,28 +1057,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * immediately re-usable, it comes in the form of a delayed ref, which must be * run and then the transaction must be committed. * - * FLUSH_DELAYED_REFS - * The above two cases generate delayed refs that will affect - * ->total_bytes_pinned. However this counter can be inconsistent with - * reality if there are outstanding delayed refs. This is because we adjust - * the counter based solely on the current set of delayed refs and disregard - * any on-disk state which might include more refs. So for example, if we - * have an extent with 2 references, but we only drop 1, we'll see that there - * is a negative delayed ref count for the extent and assume that the space - * will be freed, and thus increase ->total_bytes_pinned. - * - * Running the delayed refs gives us the actual real view of what will be - * freed at the transaction commit time. This stage will not actually free - * space for us, it just makes sure that may_commit_transaction() has all of - * the information it needs to make the right decision. - * * COMMIT_TRANS - * This is where we reclaim all of the pinned space generated by the previous - * two stages. We will not commit the transaction if we don't think we're - * likely to satisfy our request, which means if our current free space + - * total_bytes_pinned < reservation we will not commit. This is why the - * previous states are actually important, to make sure we know for sure - * whether committing the transaction will allow us to make progress. + * This is where we reclaim all of the pinned space generated by running the + * iputs * * ALLOC_CHUNK_FORCE * For data we start with alloc chunk force, however we could have been full @@ -1202,7 +1069,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, RUN_DELAYED_IPUTS, - FLUSH_DELAYED_REFS, COMMIT_TRANS, ALLOC_CHUNK_FORCE, }; @@ -1561,6 +1427,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { + /* + * We were forced to add a reserve ticket, so + * our preemptive flushing is unable to keep + * up. Clamp down on the threshold for the + * preemptive flushing in order to keep up with + * the workload. + */ + maybe_clamp_preempt(fs_info, space_info); + space_info->flush = 1; trace_btrfs_trigger_flush(fs_info, space_info->flags, @@ -1572,14 +1447,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, list_add_tail(&ticket.list, &space_info->priority_tickets); } - - /* - * We were forced to add a reserve ticket, so our preemptive - * flushing is unable to keep up. Clamp down on the threshold - * for the preemptive flushing in order to keep up with the - * workload. - */ - maybe_clamp_preempt(fs_info, space_info); } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -1588,8 +1455,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_preemptive_reclaim(fs_info, space_info) && - !work_busy(&fs_info->preempt_reclaim_work)) { + !work_busy(&fs_info->preempt_reclaim_work) && + need_preemptive_reclaim(fs_info, space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_unbound_wq, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index b1a8ffb03b3e..cb5056472e79 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -43,18 +43,6 @@ struct btrfs_space_info { u64 flags; - /* - * bytes_pinned is kept in line with what is actually pinned, as in - * we've called update_block_group and dropped the bytes_used counter - * and increased the bytes_pinned counter. However this means that - * bytes_pinned does not reflect the bytes that will be pinned once the - * delayed refs are flushed, so this counter is inc'ed every time we - * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zeroed every - * time the transaction commits. - */ - struct percpu_counter total_bytes_pinned; - struct list_head list; /* Protected by the spinlock 'lock'. */ struct list_head ro_bgs; @@ -157,22 +145,4 @@ static inline void btrfs_space_info_free_bytes_may_use( } int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); - -static inline void __btrfs_mod_total_bytes_pinned( - struct btrfs_space_info *space_info, - s64 mod) -{ - percpu_counter_add_batch(&space_info->total_bytes_pinned, mod, - BTRFS_TOTAL_BYTES_PINNED_BATCH); -} - -static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info, - u64 flags, s64 mod) -{ - struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags); - - ASSERT(space_info); - __btrfs_mod_total_bytes_pinned(space_info, mod); -} - #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 2d19089ab625..640bcd21bf28 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -3,6 +3,7 @@ #include <linux/slab.h> #include "ctree.h" #include "subpage.h" +#include "btrfs_inode.h" /* * Subpage (sectorsize < PAGE_SIZE) support overview: @@ -110,10 +111,12 @@ int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, if (!*ret) return -ENOMEM; spin_lock_init(&(*ret)->lock); - if (type == BTRFS_SUBPAGE_METADATA) + if (type == BTRFS_SUBPAGE_METADATA) { atomic_set(&(*ret)->eb_refs, 0); - else + } else { atomic_set(&(*ret)->readers, 0); + atomic_set(&(*ret)->writers, 0); + } return 0; } @@ -183,12 +186,10 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; const int nbits = len >> fs_info->sectorsize_bits; - int ret; btrfs_subpage_assert(fs_info, page, start, len); - ret = atomic_add_return(nbits, &subpage->readers); - ASSERT(ret == nbits); + atomic_add(nbits, &subpage->readers); } void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, @@ -196,10 +197,95 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; const int nbits = len >> fs_info->sectorsize_bits; + bool is_data; + bool last; btrfs_subpage_assert(fs_info, page, start, len); + is_data = is_data_inode(page->mapping->host); ASSERT(atomic_read(&subpage->readers) >= nbits); - if (atomic_sub_and_test(nbits, &subpage->readers)) + last = atomic_sub_and_test(nbits, &subpage->readers); + + /* + * For data we need to unlock the page if the last read has finished. + * + * And please don't replace @last with atomic_sub_and_test() call + * inside if () condition. + * As we want the atomic_sub_and_test() to be always executed. + */ + if (is_data && last) + unlock_page(page); +} + +static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) +{ + u64 orig_start = *start; + u32 orig_len = *len; + + *start = max_t(u64, page_offset(page), orig_start); + *len = min_t(u64, page_offset(page) + PAGE_SIZE, + orig_start + orig_len) - *start; +} + +void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = (len >> fs_info->sectorsize_bits); + int ret; + + btrfs_subpage_assert(fs_info, page, start, len); + + ASSERT(atomic_read(&subpage->readers) == 0); + ret = atomic_add_return(nbits, &subpage->writers); + ASSERT(ret == nbits); +} + +bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const int nbits = (len >> fs_info->sectorsize_bits); + + btrfs_subpage_assert(fs_info, page, start, len); + + ASSERT(atomic_read(&subpage->writers) >= nbits); + return atomic_sub_and_test(nbits, &subpage->writers); +} + +/* + * Lock a page for delalloc page writeback. + * + * Return -EAGAIN if the page is not properly initialized. + * Return 0 with the page locked, and writer counter updated. + * + * Even with 0 returned, the page still need extra check to make sure + * it's really the correct page, as the caller is using + * find_get_pages_contig(), which can race with page invalidating. + */ +int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { + lock_page(page); + return 0; + } + lock_page(page); + if (!PagePrivate(page) || !page->private) { + unlock_page(page); + return -EAGAIN; + } + btrfs_subpage_clamp_range(page, &start, &len); + btrfs_subpage_start_writer(fs_info, page, start, len); + return 0; +} + +void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) + return unlock_page(page); + btrfs_subpage_clamp_range(page, &start, &len); + if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) unlock_page(page); } @@ -354,6 +440,32 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } +void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->ordered_bitmap |= tmp; + SetPageOrdered(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->ordered_bitmap &= ~tmp; + if (subpage->ordered_bitmap == 0) + ClearPageOrdered(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. @@ -376,6 +488,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -408,6 +521,34 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ return test_page_func(page); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + set_page_func(page); \ + return; \ + } \ + btrfs_subpage_clamp_range(page, &start, &len); \ + btrfs_subpage_set_##name(fs_info, page, start, len); \ +} \ +void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + clear_page_func(page); \ + return; \ + } \ + btrfs_subpage_clamp_range(page, &start, &len); \ + btrfs_subpage_clear_##name(fs_info, page, start, len); \ +} \ +bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len) \ +{ \ + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + return test_page_func(page); \ + btrfs_subpage_clamp_range(page, &start, &len); \ + return btrfs_subpage_test_##name(fs_info, page, start, len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); @@ -416,3 +557,5 @@ IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, PageDirty); IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, PageWriteback); +IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, + PageOrdered); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index bfd626e955be..4d7aca85d915 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -22,6 +22,14 @@ struct btrfs_subpage { u16 error_bitmap; u16 dirty_bitmap; u16 writeback_bitmap; + /* + * Both data and metadata needs to track how many readers are for the + * page. + * Data relies on @readers to unlock the page when last reader finished. + * While metadata doesn't need page unlock, it needs to prevent + * page::private get cleared before the last end_page_read(). + */ + atomic_t readers; union { /* * Structures only used by metadata @@ -32,7 +40,10 @@ struct btrfs_subpage { atomic_t eb_refs; /* Structures only used by data */ struct { - atomic_t readers; + atomic_t writers; + + /* Tracke pending ordered extent in this sector */ + u16 ordered_bitmap; }; }; }; @@ -63,6 +74,15 @@ void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); +void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); +void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); + /* * Template for subpage related operations. * @@ -72,6 +92,10 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, * btrfs_page_*() are for call sites where the page can either be subpage * specific or regular page. The function will handle both cases. * But the range still needs to be inside the page. + * + * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't + * need to be inside the page. Those functions will truncate the range + * automatically. */ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ @@ -85,12 +109,19 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); \ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct page *page, u64 start, u32 len); \ +bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(error); DECLARE_BTRFS_SUBPAGE_OPS(dirty); DECLARE_BTRFS_SUBPAGE_OPS(writeback); +DECLARE_BTRFS_SUBPAGE_OPS(ordered); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4a396c1147f1..d07b18b2b250 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -299,17 +299,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; WRITE_ONCE(trans->aborted, errno); - /* Nothing used. The other threads that have joined this - * transaction may be able to continue. */ - if (!trans->dirty && list_empty(&trans->new_bgs)) { - const char *errstr; - - errstr = btrfs_decode_error(errno); - btrfs_warn(fs_info, - "%s:%d: Aborting unused transaction(%s).", - function, line, errstr); - return; - } WRITE_ONCE(trans->transaction->aborted, errno); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); @@ -945,8 +934,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_check_integrity_including_extent_data: btrfs_info(info, "enabling check integrity including extent data"); - btrfs_set_opt(info->mount_opt, - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: @@ -1527,7 +1515,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA)) seq_puts(seq, ",check_int_data"); else if (btrfs_test_opt(info, CHECK_INTEGRITY)) seq_puts(seq, ",check_int"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 436ac7b4b334..9d1d140118ff 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -429,7 +429,7 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -451,7 +451,7 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@ -665,15 +665,6 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) -static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, - struct kobj_attribute *a, - char *buf) -{ - struct btrfs_space_info *sinfo = to_space_info(kobj); - s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); - return scnprintf(buf, PAGE_SIZE, "%lld\n", val); -} - SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); @@ -684,8 +675,6 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); -BTRFS_ATTR(space_info, total_bytes_pinned, - btrfs_space_info_show_total_bytes_pinned); static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, flags), @@ -698,7 +687,6 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), - BTRFS_ATTR_PTR(space_info, total_bytes_pinned), NULL, }; ATTRIBUTE_GROUPS(space_info); @@ -706,7 +694,6 @@ ATTRIBUTE_GROUPS(space_info); static void space_info_release(struct kobject *kobj) { struct btrfs_space_info *sinfo = to_space_info(kobj); - percpu_counter_destroy(&sinfo->total_bytes_pinned); kfree(sinfo); } @@ -1455,6 +1442,33 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); +static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", + READ_ONCE(device->scrub_speed_max)); +} + +static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + char *endptr; + unsigned long long limit; + + limit = memparse(buf, &endptr); + WRITE_ONCE(device->scrub_speed_max, limit); + return len; +} +BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, + btrfs_devinfo_scrub_speed_max_store); + static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1468,10 +1482,40 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); +static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + if (!device->dev_stats_valid) + return scnprintf(buf, PAGE_SIZE, "invalid\n"); + + /* + * Print all at once so we get a snapshot of all values from the same + * time. Keep them in sync and in order of definition of + * btrfs_dev_stat_values. + */ + return scnprintf(buf, PAGE_SIZE, + "write_errs %d\n" + "read_errs %d\n" + "flush_errs %d\n" + "corruption_errs %d\n" + "generation_errs %d\n", + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); +} +BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); + static struct attribute *devid_attrs[] = { + BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, in_fs_metadata), BTRFS_ATTR_PTR(devid, missing), BTRFS_ATTR_PTR(devid, replace_target), + BTRFS_ATTR_PTR(devid, scrub_speed_max), BTRFS_ATTR_PTR(devid, writeable), NULL }; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c0aefe6dee0b..319fed82d741 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -557,7 +557,7 @@ int btrfs_test_extent_map(void) { /* * Test a chunk with 2 data stripes one of which - * interesects the physical address of the super block + * intersects the physical address of the super block * is correctly recognised. */ .raid_type = BTRFS_BLOCK_GROUP_RAID1, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f75de9f6c0ad..50318231c1a8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -583,9 +583,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool do_chunk_alloc = false; int ret; - /* Send isn't supposed to start transactions. */ - ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return ERR_PTR(-EROFS); @@ -1406,8 +1403,10 @@ int btrfs_defrag_root(struct btrfs_root *root) while (1) { trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } ret = btrfs_defrag_leaves(trans, root); @@ -1476,7 +1475,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); if (ret) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } /* @@ -1869,31 +1868,6 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) } /* - * wait for the current transaction commit to start and block subsequent - * transaction joins - */ -static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info, - struct btrfs_transaction *trans) -{ - wait_event(fs_info->transaction_blocked_wait, - trans->state >= TRANS_STATE_COMMIT_START || - TRANS_ABORTED(trans)); -} - -/* - * wait for the current transaction to start and then become unblocked. - * caller holds ref. - */ -static void wait_current_trans_commit_start_and_unblock( - struct btrfs_fs_info *fs_info, - struct btrfs_transaction *trans) -{ - wait_event(fs_info->transaction_wait, - trans->state >= TRANS_STATE_UNBLOCKED || - TRANS_ABORTED(trans)); -} - -/* * commit transactions asynchronously. once btrfs_commit_transaction_async * returns, any subsequent transaction will not be allowed to join. */ @@ -1920,8 +1894,7 @@ static void do_async_commit(struct work_struct *work) kfree(ac); } -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - int wait_for_unblock) +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_async_commit *ac; @@ -1953,13 +1926,13 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, __sb_writers_release(fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); - - /* wait for transaction to start and unblock */ - if (wait_for_unblock) - wait_current_trans_commit_start_and_unblock(fs_info, cur_trans); - else - wait_current_trans_commit_start(fs_info, cur_trans); - + /* + * Wait for the current transaction commit to start and block + * subsequent transaction joins + */ + wait_event(fs_info->transaction_blocked_wait, + cur_trans->state >= TRANS_STATE_COMMIT_START || + TRANS_ABORTED(cur_trans)); if (current->journal_info == trans) current->journal_info = NULL; @@ -2074,14 +2047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ASSERT(refcount_read(&trans->use_count) == 1); - /* - * Some places just start a transaction to commit it. We need to make - * sure that if this commit fails that the abort code actually marks the - * transaction as failed, so set trans->dirty to make the abort code do - * the right thing. - */ - trans->dirty = true; - /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 364cfbb4c5c5..07d76029f598 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -122,8 +122,6 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) -#define BTRFS_SEND_TRANS_STUB ((void *)1) - struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; @@ -143,7 +141,6 @@ struct btrfs_trans_handle { bool allocating_chunk; bool can_flush_pending_bgs; bool reloc_reserved; - bool dirty; bool in_fsync; struct btrfs_root *root; struct btrfs_fs_info *fs_info; @@ -227,8 +224,7 @@ void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); -int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - int wait_for_unblock); +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index dbcf8bb2f3b9..cab451d19547 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4468,7 +4468,8 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, ret = btrfs_truncate_inode_items(trans, root->log_root, inode, truncate_offset, - BTRFS_EXTENT_DATA_KEY); + BTRFS_EXTENT_DATA_KEY, + NULL); } while (ret == -EAGAIN); if (ret) goto out; @@ -5416,7 +5417,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags); while(1) { ret = btrfs_truncate_inode_items(trans, - log, inode, 0, 0); + log, inode, 0, 0, NULL); if (ret != -EAGAIN) break; } @@ -5466,13 +5467,23 @@ log_extents: btrfs_release_path(dst_path); if (need_log_inode_item) { err = log_inode_item(trans, log, dst_path, inode); - if (!err && !xattrs_logged) { + if (err) + goto out_unlock; + /* + * If we are doing a fast fsync and the inode was logged before + * in this transaction, we don't need to log the xattrs because + * they were logged before. If xattrs were added, changed or + * deleted since the last time we logged the inode, then we have + * already logged them because the inode had the runtime flag + * BTRFS_INODE_COPY_EVERYTHING set. + */ + if (!xattrs_logged && inode->logged_trans < trans->transid) { err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; btrfs_release_path(path); } - if (err) - goto out_unlock; } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, @@ -6371,6 +6382,7 @@ next: error: if (wc.trans) btrfs_end_transaction(wc.trans); + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 47d27059d064..807502cd6510 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -717,7 +717,7 @@ static struct btrfs_fs_devices *find_fsid_changed( /* * Handles the case where scanned device is part of an fs that had - * multiple successful changes of FSID but curently device didn't + * multiple successful changes of FSID but currently device didn't * observe it. Meaning our fsid will be different than theirs. We need * to handle two subcases : * 1 - The fs still continues to have different METADATA/FSID uuids. @@ -1247,7 +1247,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, lockdep_assert_held(&uuid_mutex); /* * The device_list_mutex cannot be taken here in case opening the - * underlying device takes further locks like bd_mutex. + * underlying device takes further locks like open_mutex. * * We also don't need the lock here as this is called during mount and * exclusion is provided by uuid_mutex @@ -1550,7 +1550,7 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * check to ensure dev extents are not double allocated. * This makes the function safe to allocate dev extents but may not report * correct usable device space, as device extent freed in current transaction - * is not reported as avaiable. + * is not reported as available. */ static int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes, u64 search_start, u64 *start, @@ -4217,14 +4217,6 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, btrfs_bg_type_to_raid_name(data_target)); } - if (fs_info->send_in_progress) { - btrfs_warn_rl(fs_info, -"cannot run balance while send operations are in progress (%d in progress)", - fs_info->send_in_progress); - ret = -EAGAIN; - goto out; - } - ret = insert_balance_item(fs_info, bctl); if (ret && ret != -EEXIST) goto out; @@ -6127,17 +6119,17 @@ static bool need_full_stripe(enum btrfs_map_op op) * @em: mapping containing the logical extent * @op: type of operation - write or read * @logical: address that we want to figure out the geometry of - * @len: the length of IO we are going to perform, starting at @logical * @io_geom: pointer used to return values * * Returns < 0 in case a chunk for the given logical address cannot be found, * usually shouldn't happen unless @logical is corrupted, 0 otherwise. */ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, - enum btrfs_map_op op, u64 logical, u64 len, + enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom) { struct map_lookup *map; + u64 len; u64 offset; u64 stripe_offset; u64 stripe_nr; @@ -6152,7 +6144,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, offset = logical - em->start; /* Len of a stripe in a chunk */ stripe_len = map->stripe_len; - /* Stripe wher this block falls in */ + /* Stripe where this block falls in */ stripe_nr = div64_u64(offset, stripe_len); /* Offset of stripe in the chunk */ stripe_offset = stripe_nr * stripe_len; @@ -6243,7 +6235,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, em = btrfs_get_chunk_map(fs_info, logical, *length); ASSERT(!IS_ERR(em)); - ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom); + ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); if (ret < 0) return ret; @@ -6670,8 +6662,6 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * * If devid and uuid are both specified, the match must be exact, otherwise * only devid is used. - * - * If @seed is true, traverse through the seed devices. */ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, u64 devid, u8 *uuid, u8 *fsid) @@ -7865,7 +7855,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, ret = -EUCLEAN; } - /* Make sure no dev extent is beyond device bondary */ + /* Make sure no dev extent is beyond device boundary */ dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9c0d84e5ec06..c7fc7caf575c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -143,6 +143,9 @@ struct btrfs_device { struct completion kobj_unregister; /* For sysfs/FSID/devinfo/devid/ */ struct kobject devid_kobj; + + /* Bandwidth limit for scrub, in bytes */ + u64 scrub_speed_max; }; /* @@ -443,7 +446,7 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, - enum btrfs_map_op op, u64 logical, u64 len, + enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index f1f3b10d1dbb..297c0b1c0634 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -81,7 +81,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, * *: Special case, no superblock is written * 0: Use write pointer of zones[0] * 1: Use write pointer of zones[1] - * C: Compare super blcoks from zones[0] and zones[1], use the latest + * C: Compare super blocks from zones[0] and zones[1], use the latest * one determined by generation * x: Invalid state */ @@ -433,7 +433,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) } /* - * If zones[0] is conventional, always use the beggining of the + * If zones[0] is conventional, always use the beginning of the * zone to record superblock. No need to validate in that case. */ if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == @@ -1140,6 +1140,10 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + btrfs_err_in_rcu(fs_info, + "zoned: unexpected conventional zone %llu on device %s (devid %llu)", + zone.start << SECTOR_SHIFT, + rcu_str_deref(device->name), device->devid); ret = -EIO; goto out; } @@ -1200,6 +1204,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ + if (alloc_offsets[0] == WP_MISSING_DEV) { + btrfs_err(fs_info, + "zoned: cannot recover write pointer for zone %llu", + physical); + ret = -EIO; + goto out; + } cache->alloc_offset = alloc_offsets[0]; break; case BTRFS_BLOCK_GROUP_DUP: @@ -1217,6 +1228,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: + if (cache->alloc_offset > fs_info->zone_size) { + btrfs_err(fs_info, + "zoned: invalid write pointer %llu in block group %llu", + cache->alloc_offset, cache->start); + ret = -EIO; + } + /* An extent is allocated after the write pointer */ if (!ret && num_conventional && last_alloc > cache->alloc_offset) { btrfs_err(fs_info, @@ -1515,3 +1533,24 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, length = wp - physical_pos; return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); } + +struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct btrfs_device *device; + struct extent_map *em; + struct map_lookup *map; + + em = btrfs_get_chunk_map(fs_info, logical, length); + if (IS_ERR(em)) + return ERR_CAST(em); + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + device = map->stripes[0].dev; + + free_extent_map(em); + + return device; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index e55d32595c2c..b0ae2608cb6b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -65,6 +65,8 @@ void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); +struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -191,6 +193,13 @@ static inline int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, return -EOPNOTSUPP; } +static inline struct btrfs_device *btrfs_zoned_get_device( + struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + return ERR_PTR(-EOPNOTSUPP); +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index bf52e9326ebe..7364950a9ef4 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -19,6 +19,8 @@ config CIFS select CRYPTO_LIB_DES select KEYS select DNS_RESOLVER + select ASN1 + select OID_REGISTRY help This is the client VFS module for the SMB3 family of NAS protocols, (including support for the most recent, most secure dialect SMB3.1.1) @@ -57,6 +59,7 @@ config CIFS config CIFS_STATS2 bool "Extended statistics" depends on CIFS + default y help Enabling this option will allow more detailed statistics on SMB request timing to be displayed in /proc/fs/cifs/DebugData and also @@ -65,8 +68,7 @@ config CIFS_STATS2 for more details. These additional statistics may have a minor effect on performance and memory utilization. - Unless you are a developer or are doing network performance analysis - or tuning, say N. + If unsure, say Y. config CIFS_ALLOW_INSECURE_LEGACY bool "Support legacy servers which use less secure dialects" diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 3ee3b7de4ded..87fcacdf3de7 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -6,12 +6,16 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_CIFS) += cifs.o cifs-y := trace.o cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o \ - inode.o link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ + inode.o link.o misc.o netmisc.o smbencrypt.o transport.o \ cifs_unicode.o nterr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o smb1ops.o unc.o winucase.o \ smb2ops.o smb2maperror.o smb2transport.o \ smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \ - dns_resolve.o + dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o + +$(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h + +$(obj)/cifs_spnego_negtokeninit.asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.c $(obj)/cifs_spnego_negtokeninit.asn1.h cifs-$(CONFIG_CIFS_XATTR) += xattr.o diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 3150c19cdc2f..b5724ef9f182 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -1,612 +1,63 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* - * The ASB.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in - * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich - * - * Copyright (c) 2000 RP Internet (www.rpi.net.au). - */ #include <linux/module.h> -#include <linux/types.h> #include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include "cifspdu.h" +#include <linux/oid_registry.h> #include "cifsglob.h" #include "cifs_debug.h" #include "cifsproto.h" +#include "cifs_spnego_negtokeninit.asn1.h" -/***************************************************************************** - * - * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) - * - *****************************************************************************/ - -/* Class */ -#define ASN1_UNI 0 /* Universal */ -#define ASN1_APL 1 /* Application */ -#define ASN1_CTX 2 /* Context */ -#define ASN1_PRV 3 /* Private */ - -/* Tag */ -#define ASN1_EOC 0 /* End Of Contents or N/A */ -#define ASN1_BOL 1 /* Boolean */ -#define ASN1_INT 2 /* Integer */ -#define ASN1_BTS 3 /* Bit String */ -#define ASN1_OTS 4 /* Octet String */ -#define ASN1_NUL 5 /* Null */ -#define ASN1_OJI 6 /* Object Identifier */ -#define ASN1_OJD 7 /* Object Description */ -#define ASN1_EXT 8 /* External */ -#define ASN1_ENUM 10 /* Enumerated */ -#define ASN1_SEQ 16 /* Sequence */ -#define ASN1_SET 17 /* Set */ -#define ASN1_NUMSTR 18 /* Numerical String */ -#define ASN1_PRNSTR 19 /* Printable String */ -#define ASN1_TEXSTR 20 /* Teletext String */ -#define ASN1_VIDSTR 21 /* Video String */ -#define ASN1_IA5STR 22 /* IA5 String */ -#define ASN1_UNITIM 23 /* Universal Time */ -#define ASN1_GENTIM 24 /* General Time */ -#define ASN1_GRASTR 25 /* Graphical String */ -#define ASN1_VISSTR 26 /* Visible String */ -#define ASN1_GENSTR 27 /* General String */ - -/* Primitive / Constructed methods*/ -#define ASN1_PRI 0 /* Primitive */ -#define ASN1_CON 1 /* Constructed */ - -/* - * Error codes. - */ -#define ASN1_ERR_NOERROR 0 -#define ASN1_ERR_DEC_EMPTY 2 -#define ASN1_ERR_DEC_EOC_MISMATCH 3 -#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 -#define ASN1_ERR_DEC_BADVALUE 5 - -#define SPNEGO_OID_LEN 7 -#define NTLMSSP_OID_LEN 10 -#define KRB5_OID_LEN 7 -#define KRB5U2U_OID_LEN 8 -#define MSKRB5_OID_LEN 7 -static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; -static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; -static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; -static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; -static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; - -/* - * ASN.1 context. - */ -struct asn1_ctx { - int error; /* Error condition */ - unsigned char *pointer; /* Octet just to be decoded */ - unsigned char *begin; /* First octet */ - unsigned char *end; /* Octet after last octet */ -}; - -/* - * Octet string (not null terminated) - */ -struct asn1_octstr { - unsigned char *data; - unsigned int len; -}; - -static void -asn1_open(struct asn1_ctx *ctx, unsigned char *buf, unsigned int len) -{ - ctx->begin = buf; - ctx->end = buf + len; - ctx->pointer = buf; - ctx->error = ASN1_ERR_NOERROR; -} - -static unsigned char -asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) -{ - if (ctx->pointer >= ctx->end) { - ctx->error = ASN1_ERR_DEC_EMPTY; - return 0; - } - *ch = *(ctx->pointer)++; - return 1; -} - -#if 0 /* will be needed later by spnego decoding/encoding of ntlmssp */ -static unsigned char -asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val) -{ - unsigned char ch; - - if (ctx->pointer >= ctx->end) { - ctx->error = ASN1_ERR_DEC_EMPTY; - return 0; - } - - ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */ - if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ - *val = *(++(ctx->pointer)); /* value has enum value */ - else - return 0; - - ctx->pointer++; - return 1; -} -#endif - -static unsigned char -asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) -{ - unsigned char ch; - - *tag = 0; - - do { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *tag <<= 7; - *tag |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static unsigned char -asn1_id_decode(struct asn1_ctx *ctx, - unsigned int *cls, unsigned int *con, unsigned int *tag) -{ - unsigned char ch; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *cls = (ch & 0xC0) >> 6; - *con = (ch & 0x20) >> 5; - *tag = (ch & 0x1F); - - if (*tag == 0x1F) { - if (!asn1_tag_decode(ctx, tag)) - return 0; - } - return 1; -} - -static unsigned char -asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len) -{ - unsigned char ch, cnt; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch == 0x80) - *def = 0; - else { - *def = 1; - - if (ch < 0x80) - *len = ch; - else { - cnt = (unsigned char) (ch & 0x7F); - *len = 0; - - while (cnt > 0) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - *len <<= 8; - *len |= ch; - cnt--; - } - } - } - - /* don't trust len bigger than ctx buffer */ - if (*len > ctx->end - ctx->pointer) - return 0; - - return 1; -} - -static unsigned char -asn1_header_decode(struct asn1_ctx *ctx, - unsigned char **eoc, - unsigned int *cls, unsigned int *con, unsigned int *tag) -{ - unsigned int def = 0; - unsigned int len = 0; - - if (!asn1_id_decode(ctx, cls, con, tag)) - return 0; - - if (!asn1_length_decode(ctx, &def, &len)) - return 0; - - /* primitive shall be definite, indefinite shall be constructed */ - if (*con == ASN1_PRI && !def) - return 0; - - if (def) - *eoc = ctx->pointer + len; - else - *eoc = NULL; - return 1; -} - -static unsigned char -asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) +int +decode_negTokenInit(unsigned char *security_blob, int length, + struct TCP_Server_Info *server) { - unsigned char ch; - - if (eoc == NULL) { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - if (ch != 0x00) { - ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; - return 0; - } - return 1; - } else { - if (ctx->pointer != eoc) { - ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; - return 0; - } + if (asn1_ber_decoder(&cifs_spnego_negtokeninit_decoder, server, + security_blob, length) == 0) return 1; - } -} - -/* static unsigned char asn1_null_decode(struct asn1_ctx *ctx, - unsigned char *eoc) -{ - ctx->pointer = eoc; - return 1; -} - -static unsigned char asn1_long_decode(struct asn1_ctx *ctx, - unsigned char *eoc, long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = (signed char) ch; - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof(long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned int *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer = ch; - if (ch == 0) - len = 0; else - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof(unsigned int)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; -} - -static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned long *integer) -{ - unsigned char ch; - unsigned int len; - - if (!asn1_octet_decode(ctx, &ch)) return 0; - - *integer = ch; - if (ch == 0) - len = 0; - else - len = 1; - - while (ctx->pointer < eoc) { - if (++len > sizeof(unsigned long)) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - return 0; - } - - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *integer <<= 8; - *integer |= ch; - } - return 1; } -static unsigned char -asn1_octets_decode(struct asn1_ctx *ctx, - unsigned char *eoc, - unsigned char **octets, unsigned int *len) +int cifs_gssapi_this_mech(void *context, size_t hdrlen, + unsigned char tag, const void *value, size_t vlen) { - unsigned char *ptr; - - *len = 0; - - *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); - if (*octets == NULL) { - return 0; - } - - ptr = *octets; - while (ctx->pointer < eoc) { - if (!asn1_octet_decode(ctx, (unsigned char *) ptr++)) { - kfree(*octets); - *octets = NULL; - return 0; - } - (*len)++; - } - return 1; -} */ - -static unsigned char -asn1_subid_decode(struct asn1_ctx *ctx, unsigned long *subid) -{ - unsigned char ch; - - *subid = 0; - - do { - if (!asn1_octet_decode(ctx, &ch)) - return 0; - - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - return 1; -} - -static int -asn1_oid_decode(struct asn1_ctx *ctx, - unsigned char *eoc, unsigned long **oid, unsigned int *len) -{ - unsigned long subid; - unsigned int size; - unsigned long *optr; - - size = eoc - ctx->pointer + 1; - - /* first subid actually encodes first two subids */ - if (size < 2 || size > UINT_MAX/sizeof(unsigned long)) - return 0; - - *oid = kmalloc_array(size, sizeof(unsigned long), GFP_ATOMIC); - if (*oid == NULL) - return 0; - - optr = *oid; - - if (!asn1_subid_decode(ctx, &subid)) { - kfree(*oid); - *oid = NULL; - return 0; - } - - if (subid < 40) { - optr[0] = 0; - optr[1] = subid; - } else if (subid < 80) { - optr[0] = 1; - optr[1] = subid - 40; - } else { - optr[0] = 2; - optr[1] = subid - 80; - } - - *len = 2; - optr += 2; + enum OID oid; - while (ctx->pointer < eoc) { - if (++(*len) > size) { - ctx->error = ASN1_ERR_DEC_BADVALUE; - kfree(*oid); - *oid = NULL; - return 0; - } + oid = look_up_OID(value, vlen); + if (oid != OID_spnego) { + char buf[50]; - if (!asn1_subid_decode(ctx, optr++)) { - kfree(*oid); - *oid = NULL; - return 0; - } + sprint_oid(value, vlen, buf, sizeof(buf)); + cifs_dbg(FYI, "Error decoding negTokenInit header: unexpected OID %s\n", + buf); + return -EBADMSG; } - return 1; + return 0; } -static int -compare_oid(unsigned long *oid1, unsigned int oid1len, - unsigned long *oid2, unsigned int oid2len) +int cifs_neg_token_init_mech_type(void *context, size_t hdrlen, + unsigned char tag, + const void *value, size_t vlen) { - unsigned int i; + struct TCP_Server_Info *server = context; + enum OID oid; - if (oid1len != oid2len) - return 0; + oid = look_up_OID(value, vlen); + if (oid == OID_mskrb5) + server->sec_mskerberos = true; + else if (oid == OID_krb5u2u) + server->sec_kerberosu2u = true; + else if (oid == OID_krb5) + server->sec_kerberos = true; + else if (oid == OID_ntlmssp) + server->sec_ntlmssp = true; else { - for (i = 0; i < oid1len; i++) { - if (oid1[i] != oid2[i]) - return 0; - } - return 1; - } -} - - /* BB check for endian conversion issues here */ - -int -decode_negTokenInit(unsigned char *security_blob, int length, - struct TCP_Server_Info *server) -{ - struct asn1_ctx ctx; - unsigned char *end; - unsigned char *sequence_end; - unsigned long *oid = NULL; - unsigned int cls, con, tag, oidlen, rc; - - /* cifs_dump_mem(" Received SecBlob ", security_blob, length); */ - - asn1_open(&ctx, security_blob, length); + char buf[50]; - /* GSSAPI header */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding negTokenInit header\n"); - return 0; - } else if ((cls != ASN1_APL) || (con != ASN1_CON) - || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d\n", cls, con, tag); - return 0; + sprint_oid(value, vlen, buf, sizeof(buf)); + cifs_dbg(FYI, "Decoding negTokenInit: unsupported OID %s\n", + buf); } - - /* Check for SPNEGO OID -- remember to free obj->oid */ - rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); - if (rc) { - if ((tag == ASN1_OJI) && (con == ASN1_PRI) && - (cls == ASN1_UNI)) { - rc = asn1_oid_decode(&ctx, end, &oid, &oidlen); - if (rc) { - rc = compare_oid(oid, oidlen, SPNEGO_OID, - SPNEGO_OID_LEN); - kfree(oid); - } - } else - rc = 0; - } - - /* SPNEGO OID not present or garbled -- bail out */ - if (!rc) { - cifs_dbg(FYI, "Error decoding negTokenInit header\n"); - return 0; - } - - /* SPNEGO */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding negTokenInit\n"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON) - || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", - cls, con, tag, end); - return 0; - } - - /* negTokenInit */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding negTokenInit\n"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 1\n", - cls, con, tag, end); - return 0; - } - - /* sequence */ - if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); - return 0; - } else if ((cls != ASN1_CTX) || (con != ASN1_CON) - || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", - cls, con, tag, end); - return 0; - } - - /* sequence of */ - if (asn1_header_decode - (&ctx, &sequence_end, &cls, &con, &tag) == 0) { - cifs_dbg(FYI, "Error decoding 2nd part of negTokenInit\n"); - return 0; - } else if ((cls != ASN1_UNI) || (con != ASN1_CON) - || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d sequence_end = %p exit 1\n", - cls, con, tag, sequence_end); - return 0; - } - - /* list of security mechanisms */ - while (!asn1_eoc_decode(&ctx, sequence_end)) { - rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag); - if (!rc) { - cifs_dbg(FYI, "Error decoding negTokenInit hdr exit2\n"); - return 0; - } - if ((tag == ASN1_OJI) && (con == ASN1_PRI)) { - if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) { - - cifs_dbg(FYI, "OID len = %d oid = 0x%lx 0x%lx 0x%lx 0x%lx\n", - oidlen, *oid, *(oid + 1), *(oid + 2), - *(oid + 3)); - - if (compare_oid(oid, oidlen, MSKRB5_OID, - MSKRB5_OID_LEN)) - server->sec_mskerberos = true; - else if (compare_oid(oid, oidlen, KRB5U2U_OID, - KRB5U2U_OID_LEN)) - server->sec_kerberosu2u = true; - else if (compare_oid(oid, oidlen, KRB5_OID, - KRB5_OID_LEN)) - server->sec_kerberos = true; - else if (compare_oid(oid, oidlen, NTLMSSP_OID, - NTLMSSP_OID_LEN)) - server->sec_ntlmssp = true; - - kfree(oid); - } - } else { - cifs_dbg(FYI, "Should be an oid what is going on?\n"); - } - } - - /* - * We currently ignore anything at the end of the SPNEGO blob after - * the mechTypes have been parsed, since none of that info is - * used at the moment. - */ - return 1; + return 0; } diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 488fe0ffc1ef..8a3b30ec860c 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cache.c - CIFS filesystem cache index structure definitions * * Copyright (c) 2010 Novell, Inc. * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "fscache.h" #include "cifs_debug.h" diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 68e8e5b27841..8857ac7e7a14 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -50,7 +50,6 @@ void cifs_dump_detail(void *buf, struct TCP_Server_Info *server) void cifs_dump_mids(struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 - struct list_head *tmp; struct mid_q_entry *mid_entry; if (server == NULL) @@ -58,8 +57,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server) cifs_dbg(VFS, "Dump pending requests:\n"); spin_lock(&GlobalMid_Lock); - list_for_each(tmp, &server->pending_mid_q) { - mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) { cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n", mid_entry->mid_state, le16_to_cpu(mid_entry->command), @@ -168,7 +166,7 @@ cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) static int cifs_debug_files_proc_show(struct seq_file *m, void *v) { - struct list_head *stmp, *tmp, *tmp1, *tmp2; + struct list_head *tmp, *tmp1, *tmp2; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -183,9 +181,7 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) seq_printf(m, " <filename>\n"); #endif /* CIFS_DEBUG2 */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(stmp, &cifs_tcp_ses_list) { - server = list_entry(stmp, struct TCP_Server_Info, - tcp_ses_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { list_for_each(tmp, &server->smb_ses_list) { ses = list_entry(tmp, struct cifs_ses, smb_ses_list); list_for_each(tmp1, &ses->tcon_list) { @@ -220,7 +216,7 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { - struct list_head *tmp1, *tmp2, *tmp3; + struct list_head *tmp2, *tmp3; struct mid_q_entry *mid_entry; struct TCP_Server_Info *server; struct cifs_ses *ses; @@ -278,11 +274,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) c = 0; spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp1, &cifs_tcp_ses_list) { - server = list_entry(tmp1, struct TCP_Server_Info, - tcp_ses_list); - - /* channel info will be printed as a part of sessions below */ + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { if (server->is_channel) continue; @@ -563,7 +555,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CIFS_STATS2 int j; #endif /* STATS2 */ - struct list_head *tmp1, *tmp2, *tmp3; + struct list_head *tmp2, *tmp3; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -594,9 +586,7 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) i = 0; spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp1, &cifs_tcp_ses_list) { - server = list_entry(tmp1, struct TCP_Server_Info, - tcp_ses_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { seq_printf(m, "\nMax requests in flight: %d", server->max_in_flight); #ifdef CONFIG_CIFS_STATS2 seq_puts(m, "\nTotal time spent processing by command. Time "); diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 5e66dab712d0..ee4ea2b60c0f 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -3,7 +3,7 @@ * * Copyright (c) International Business Machines Corp., 2000,2002 * Modified by Steve French (sfrench@us.ibm.com) -*/ + */ #ifndef _H_CIFS_DEBUG #define _H_CIFS_DEBUG diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index c87c37cf2914..ec57cdb1590f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -125,7 +125,7 @@ cifs_build_devname(char *nodename, const char *prepath) * @sb_mountdata: parent/root DFS mount options (template) * @fullpath: full path in UNC format * @ref: optional server's referral - * + * @devname: return the built cifs device name if passed pointer not NULL * creates mount options for submount based on template options sb_mountdata * and replacing unc,ip,prefixpath options with ones we've got form ref_unc. * diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 9c45b3a82ad9..4fd788586399 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -1,19 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifs_fs_sb.h * * Copyright (c) International Business Machines Corp., 2002,2004 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * */ #include <linux/rbtree.h> @@ -72,11 +63,12 @@ struct cifs_sb_info { char *prepath; /* - * Path initially provided by the mount call. We might connect - * to something different via DFS but we want to keep it to do - * failover properly. + * Canonical DFS path initially provided by the mount call. We might connect to something + * different via DFS but we want to keep it to do failover properly. */ char *origin_fullpath; /* \\HOST\SHARE\[OPTIONAL PATH] */ + /* randomly generated 128-bit number for indexing dfs mount groups in referral cache */ + uuid_t dfs_mount_id; /* * Indicate whether serverino option was turned off later * (cifs_autodisable_serverino) in order to match new mounts. diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 37fc7d6ac457..ef723be358af 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifs_ioctl.h * @@ -5,16 +6,6 @@ * * Copyright (c) 2015 Steve French <steve.french@primarydata.com> * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * */ struct smb_mnt_fs_info { diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 7b9b876b513b..8fa26a8530f8 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS * * Copyright (c) 2007 Red Hat, Inc. * Author(s): Jeff Layton (jlayton@redhat.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/list.h> diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 31bef9ee078b..31387d0ea32e 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS * @@ -5,19 +6,6 @@ * Author(s): Jeff Layton (jlayton@redhat.com) * Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFS_SPNEGO_H diff --git a/fs/cifs/cifs_spnego_negtokeninit.asn1 b/fs/cifs/cifs_spnego_negtokeninit.asn1 new file mode 100644 index 000000000000..181c083887d5 --- /dev/null +++ b/fs/cifs/cifs_spnego_negtokeninit.asn1 @@ -0,0 +1,40 @@ +GSSAPI ::= + [APPLICATION 0] IMPLICIT SEQUENCE { + thisMech + OBJECT IDENTIFIER ({cifs_gssapi_this_mech}), + negotiationToken + NegotiationToken + } + +MechType ::= OBJECT IDENTIFIER ({cifs_neg_token_init_mech_type}) + +MechTypeList ::= SEQUENCE OF MechType + +NegHints ::= SEQUENCE { + hintName + [0] GeneralString OPTIONAL, + hintAddress + [1] OCTET STRING OPTIONAL + } + +NegTokenInit2 ::= + SEQUENCE { + mechTypes + [0] MechTypeList OPTIONAL, + reqFlags + [1] BIT STRING OPTIONAL, + mechToken + [2] OCTET STRING OPTIONAL, + negHints + [3] NegHints OPTIONAL, + mechListMIC + [3] OCTET STRING OPTIONAL + } + +NegotiationToken ::= + CHOICE { + negTokenInit + [0] NegTokenInit2, + negTokenTarg + [1] ANY + } diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index d829b8bf833e..93b47818c6c2 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -447,15 +447,13 @@ static int cifs_swn_store_swn_addr(const struct sockaddr_storage *new, const struct sockaddr_storage *old, struct sockaddr_storage *dst) { - __be16 port; + __be16 port = cpu_to_be16(CIFS_PORT); if (old->ss_family == AF_INET) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)old; port = ipv4->sin_port; - } - - if (old->ss_family == AF_INET6) { + } else if (old->ss_family == AF_INET6) { struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)old; port = ipv6->sin6_port; @@ -465,9 +463,7 @@ static int cifs_swn_store_swn_addr(const struct sockaddr_storage *new, struct sockaddr_in *ipv4 = (struct sockaddr_in *)new; ipv4->sin_port = port; - } - - if (new->ss_family == AF_INET6) { + } else if (new->ss_family == AF_INET6) { struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)new; ipv6->sin6_port = port; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 784407f9280f..388eb536cff1 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifsacl.c * @@ -6,19 +7,6 @@ * * Contains the routines for mapping CIFS/NTFS ACLs * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> @@ -409,7 +397,6 @@ try_upcall_to_get_id: saved_cred = override_creds(root_cred); sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); if (IS_ERR(sidkey)) { - rc = -EINVAL; cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g'); goto out_revert_creds; @@ -422,7 +409,6 @@ try_upcall_to_get_id: */ BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); if (sidkey->datalen != sizeof(uid_t)) { - rc = -EIO; cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", __func__, sidkey->datalen); key_invalidate(sidkey); @@ -1308,7 +1294,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); ndacl_ptr->revision = dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); - ndacl_ptr->num_aces = dacl_ptr->num_aces; + ndacl_ptr->num_aces = dacl_ptr ? dacl_ptr->num_aces : 0; if (uid_valid(uid)) { /* chown */ uid_t id; diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index d9e704979d99..f8292bcf8594 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -1,28 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifsacl.h * * Copyright (c) International Business Machines Corp., 2007 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFSACL_H #define _CIFSACL_H - #define NUM_AUTHS (6) /* number of authority fields */ #define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index b8f1ff9a83f3..ecf15d845dbd 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifsencrypt.c * @@ -7,19 +8,6 @@ * Copyright (C) International Business Machines Corp., 2005,2013 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2ffcb29d5c8f..9fb874dd8d24 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifsfs.c * @@ -6,19 +7,6 @@ * * Common Internet FileSystem (CIFS) client * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Note that BB means BUGBUG (ie something to fix eventually) */ diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 6beddb108ba0..177f3e7ab86d 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifsfs.h * * Copyright (c) International Business Machines Corp., 2002, 2007 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFSFS_H diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8488d7024462..3100f8b66e60 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifsglob.h * @@ -5,16 +6,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Jeremy Allison (jra@samba.org) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * */ #ifndef _CIFS_GLOB_H #define _CIFS_GLOB_H @@ -630,7 +621,7 @@ struct TCP_Server_Info { /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ unsigned int capabilities; /* selective disabling of caps by smb sess */ int timeAdj; /* Adjust for difference in server time zone in sec */ - __u64 CurrentMid; /* multiplex id - rotating counter */ + __u64 CurrentMid; /* multiplex id - rotating counter, protected by GlobalMid_Lock */ char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; @@ -896,7 +887,7 @@ struct cifs_ses { struct mutex session_mutex; struct TCP_Server_Info *server; /* pointer to server info */ int ses_count; /* reference counter */ - enum statusEnum status; + enum statusEnum status; /* updates protected by GlobalMid_Lock */ unsigned overrideSecFlg; /* if non-zero override global sec flags */ char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ @@ -1093,8 +1084,7 @@ struct cifs_tcon { struct cached_fid crfid; /* Cached root fid */ /* BB add field for back pointer to sb struct(s)? */ #ifdef CONFIG_CIFS_DFS_UPCALL - char *dfs_path; - int remap:2; + char *dfs_path; /* canonical DFS path */ struct list_head ulist; /* cache update list */ #endif }; @@ -1795,6 +1785,8 @@ require use of the stronger protocol */ * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers * list operations on global DnotifyReqList + * updates to ses->status + * updates to server->CurrentMid * tcp_ses_lock protects: * list operations on tcp and SMB session lists * tcon->open_file_lock protects the list of open files hanging off the tcon diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 554d64fe171e..0923f72d27e9 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifspdu.h * * Copyright (c) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFSPDU_H diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index d30cba44ba29..e0def0f0714b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/cifsproto.h * * Copyright (c) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFSPROTO_H #define _CIFSPROTO_H diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 41f74163cc1c..58ebec4d4413 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/cifssmb.c * @@ -6,19 +7,6 @@ * * Contains the routines for constructing the SMB PDUs themselves * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* SMB/CIFS PDU handling routines here - except for leftovers in connect.c */ @@ -1220,7 +1208,7 @@ SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon, int *pOplock, FILE_ALL_INFO *pfile_info, const struct nls_table *nls_codepage, int remap) { - int rc = -EACCES; + int rc; OPENX_REQ *pSMB = NULL; OPENX_RSP *pSMBr = NULL; int bytes_returned; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 495c395f9def..5d269f583dac 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/connect.c * * Copyright (C) International Business Machines Corp., 2002,2011 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/net.h> @@ -368,13 +356,7 @@ cifs_reconnect(struct TCP_Server_Info *server) cifs_server_dbg(VFS, "%s: failed to update DFS target hint: rc = %d\n", __func__, rc); } - rc = dfs_cache_update_vol(cifs_sb->origin_fullpath, server); - if (rc) { - cifs_server_dbg(VFS, "%s: failed to update vol info in DFS cache: rc = %d\n", - __func__, rc); - } dfs_cache_free_tgts(&tgt_list); - } cifs_put_tcp_super(sb); @@ -1557,29 +1539,25 @@ out: /** * cifs_free_ipc - helper to release the session IPC tcon * - * Needs to be called everytime a session is destroyed + * Needs to be called everytime a session is destroyed. + * + * On session close, the IPC is closed and the server must release all tcons of the session. + * No need to send a tree disconnect here. + * + * Besides, it will make the server to not close durable and resilient files on session close, as + * specified in MS-SMB2 3.3.5.6 Receiving an SMB2 LOGOFF Request. */ static int cifs_free_ipc(struct cifs_ses *ses) { - int rc = 0, xid; struct cifs_tcon *tcon = ses->tcon_ipc; if (tcon == NULL) return 0; - if (ses->server->ops->tree_disconnect) { - xid = get_xid(); - rc = ses->server->ops->tree_disconnect(xid, tcon); - free_xid(xid); - } - - if (rc) - cifs_dbg(FYI, "failed to disconnect IPC tcon (rc=%d)\n", rc); - tconInfoFree(tcon); ses->tcon_ipc = NULL; - return rc; + return 0; } static struct cifs_ses * @@ -1605,7 +1583,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses) { unsigned int rc, xid; struct TCP_Server_Info *server = ses->server; - cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); spin_lock(&cifs_tcp_ses_lock); @@ -1613,13 +1590,20 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&cifs_tcp_ses_lock); return; } + + cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count); + cifs_dbg(FYI, "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->treeName : "NONE"); + if (--ses->ses_count > 0) { spin_unlock(&cifs_tcp_ses_lock); return; } + spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&GlobalMid_Lock); if (ses->status == CifsGood) ses->status = CifsExiting; - spin_unlock(&cifs_tcp_ses_lock); + spin_unlock(&GlobalMid_Lock); cifs_free_ipc(ses); @@ -1951,10 +1935,7 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &ses->tcon_list) { tcon = list_entry(tmp, struct cifs_tcon, tcon_list); -#ifdef CONFIG_CIFS_DFS_UPCALL - if (tcon->dfs_path) - continue; -#endif + if (!match_tcon(tcon, ctx)) continue; ++tcon->tc_count; @@ -3017,9 +2998,8 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, return rc; } -static inline int get_next_dfs_tgt(const char *path, - struct dfs_cache_tgt_list *tgt_list, - struct dfs_cache_tgt_iterator **tgt_it) +static int get_next_dfs_tgt(struct dfs_cache_tgt_list *tgt_list, + struct dfs_cache_tgt_iterator **tgt_it) { if (!*tgt_it) *tgt_it = dfs_cache_get_tgt_iterator(tgt_list); @@ -3059,6 +3039,7 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_ struct cifs_ses **ses, struct cifs_tcon **tcon) { int rc; + char *npath = NULL; struct dfs_cache_tgt_list tgt_list = {0}; struct dfs_cache_tgt_iterator *tgt_it = NULL; struct smb3_fs_context tmp_ctx = {NULL}; @@ -3066,11 +3047,15 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) return -EOPNOTSUPP; - cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, path, full_path); + npath = dfs_cache_canonical_path(path, cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (IS_ERR(npath)) + return PTR_ERR(npath); + + cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, npath, full_path); - rc = dfs_cache_noreq_find(path, NULL, &tgt_list); + rc = dfs_cache_noreq_find(npath, NULL, &tgt_list); if (rc) - return rc; + goto out; /* * We use a 'tmp_ctx' here because we need pass it down to the mount_{get,put} functions to * test connection against new DFS targets. @@ -3084,11 +3069,11 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_ char *fake_devname = NULL, *mdata = NULL; /* Get next DFS target server - if any */ - rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it); + rc = get_next_dfs_tgt(&tgt_list, &tgt_it); if (rc) break; - rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref); + rc = dfs_cache_get_tgt_referral(npath, tgt_it, &ref); if (rc) break; @@ -3137,6 +3122,7 @@ static int do_dfs_failover(const char *path, const char *full_path, struct cifs_ } out: + kfree(npath); smb3_cleanup_fs_context_contents(&tmp_ctx); dfs_cache_free_tgts(&tgt_list); return rc; @@ -3288,25 +3274,18 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb3_fs_context * } #ifdef CONFIG_CIFS_DFS_UPCALL -static void set_root_ses(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, +static void set_root_ses(struct cifs_sb_info *cifs_sb, const uuid_t *mount_id, struct cifs_ses *ses, struct cifs_ses **root_ses) { if (ses) { spin_lock(&cifs_tcp_ses_lock); ses->ses_count++; - if (ses->tcon_ipc) - ses->tcon_ipc->remap = cifs_remap(cifs_sb); spin_unlock(&cifs_tcp_ses_lock); + dfs_cache_add_refsrv_session(mount_id, ses); } *root_ses = ses; } -static void put_root_ses(struct cifs_ses *ses) -{ - if (ses) - cifs_put_smb_ses(ses); -} - /* Set up next dfs prefix path in @dfs_path */ static int next_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, const unsigned int xid, struct TCP_Server_Info *server, @@ -3352,17 +3331,25 @@ out: } /* Check if resolved targets can handle any DFS referrals */ -static int is_referral_server(const char *ref_path, struct cifs_tcon *tcon, bool *ref_server) +static int is_referral_server(const char *ref_path, struct cifs_sb_info *cifs_sb, + struct cifs_tcon *tcon, bool *ref_server) { int rc; struct dfs_info3_param ref = {0}; + cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path); + if (is_tcon_dfs(tcon)) { *ref_server = true; } else { - cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path); + char *npath; - rc = dfs_cache_noreq_find(ref_path, &ref, NULL); + npath = dfs_cache_canonical_path(ref_path, cifs_sb->local_nls, cifs_remap(cifs_sb)); + if (IS_ERR(npath)) + return PTR_ERR(npath); + + rc = dfs_cache_noreq_find(npath, &ref, NULL); + kfree(npath); if (rc) { cifs_dbg(VFS, "%s: dfs_cache_noreq_find: failed (rc=%d)\n", __func__, rc); return rc; @@ -3386,9 +3373,9 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) struct cifs_ses *ses = NULL, *root_ses = NULL; struct cifs_tcon *tcon = NULL; int count = 0; + uuid_t mount_id = {0}; char *ref_path = NULL, *full_path = NULL; char *oldmnt = NULL; - char *mntdata = NULL; bool ref_server = false; rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon); @@ -3411,12 +3398,9 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) if (rc != -EREMOTE) goto error; } - /* Save mount options */ - mntdata = kstrdup(cifs_sb->ctx->mount_options, GFP_KERNEL); - if (!mntdata) { - rc = -ENOMEM; - goto error; - } + + ctx->nosharesock = true; + /* Get path of DFS root */ ref_path = build_unc_path_to_root(ctx, cifs_sb, false); if (IS_ERR(ref_path)) { @@ -3425,7 +3409,8 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; } - set_root_ses(cifs_sb, ses, &root_ses); + uuid_gen(&mount_id); + set_root_ses(cifs_sb, &mount_id, ses, &root_ses); do { /* Save full path of last DFS path we used to resolve final target server */ kfree(full_path); @@ -3456,13 +3441,11 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) continue; /* Make sure that requests go through new root servers */ - rc = is_referral_server(ref_path + 1, tcon, &ref_server); + rc = is_referral_server(ref_path + 1, cifs_sb, tcon, &ref_server); if (rc) break; - if (ref_server) { - put_root_ses(root_ses); - set_root_ses(cifs_sb, ses, &root_ses); - } + if (ref_server) + set_root_ses(cifs_sb, &mount_id, ses, &root_ses); /* Get next dfs path and then continue chasing them if -EREMOTE */ rc = next_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path); @@ -3471,12 +3454,10 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) rc = -ELOOP; } while (rc == -EREMOTE); - if (rc) + if (rc || !tcon) goto error; - put_root_ses(root_ses); - root_ses = NULL; + kfree(ref_path); - ref_path = NULL; /* * Store DFS full path in both superblock and tree connect structures. * @@ -3485,21 +3466,27 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) * links, the prefix path is included in both and may be changed during reconnect. See * cifs_tree_connect(). */ - cifs_sb->origin_fullpath = kstrdup(full_path, GFP_KERNEL); - if (!cifs_sb->origin_fullpath) { + ref_path = dfs_cache_canonical_path(full_path, cifs_sb->local_nls, cifs_remap(cifs_sb)); + kfree(full_path); + full_path = NULL; + + if (IS_ERR(ref_path)) { + rc = PTR_ERR(ref_path); + ref_path = NULL; + goto error; + } + cifs_sb->origin_fullpath = ref_path; + + ref_path = kstrdup(cifs_sb->origin_fullpath, GFP_KERNEL); + if (!ref_path) { rc = -ENOMEM; goto error; } spin_lock(&cifs_tcp_ses_lock); - tcon->dfs_path = full_path; - full_path = NULL; - tcon->remap = cifs_remap(cifs_sb); + tcon->dfs_path = ref_path; + ref_path = NULL; spin_unlock(&cifs_tcp_ses_lock); - /* Add original context for DFS cache to be used when refreshing referrals */ - rc = dfs_cache_add_vol(mntdata, ctx, cifs_sb->origin_fullpath); - if (rc) - goto error; /* * After reconnecting to a different server, unique ids won't * match anymore, so we disable serverino. This prevents @@ -3514,6 +3501,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) kfree(cifs_sb->prepath); cifs_sb->prepath = ctx->prepath; ctx->prepath = NULL; + uuid_copy(&cifs_sb->dfs_mount_id, &mount_id); out: free_xid(xid); @@ -3523,9 +3511,8 @@ out: error: kfree(ref_path); kfree(full_path); - kfree(mntdata); kfree(cifs_sb->origin_fullpath); - put_root_ses(root_ses); + dfs_cache_put_refsrv_sessions(&mount_id); mount_put_conns(cifs_sb, xid, server, ses, tcon); return rc; } @@ -3755,7 +3742,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb) kfree(cifs_sb->prepath); #ifdef CONFIG_CIFS_DFS_UPCALL - dfs_cache_del_vol(cifs_sb->origin_fullpath); + dfs_cache_put_refsrv_sessions(&cifs_sb->dfs_mount_id); kfree(cifs_sb->origin_fullpath); #endif call_rcu(&cifs_sb->rcu, delayed_free); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index b1fa30fefe1f..7c1769714609 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> #include <linux/nls.h> #include <linux/workqueue.h> +#include <linux/uuid.h> #include "cifsglob.h" #include "smb2pdu.h" #include "smb2proto.h" @@ -18,15 +19,14 @@ #include "cifs_debug.h" #include "cifs_unicode.h" #include "smb2glob.h" -#include "fs_context.h" #include "dfs_cache.h" #define CACHE_HTABLE_SIZE 32 #define CACHE_MAX_ENTRIES 64 +#define CACHE_MIN_TTL 120 /* 2 minutes */ -#define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \ - DFSREF_STORAGE_SERVER)) +#define IS_DFS_INTERLINK(v) (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER)) struct cache_dfs_tgt { char *name; @@ -48,14 +48,15 @@ struct cache_entry { struct cache_dfs_tgt *tgthint; }; -struct vol_info { - char *fullpath; - spinlock_t ctx_lock; - struct smb3_fs_context ctx; - char *mntdata; +/* List of referral server sessions per dfs mount */ +struct mount_group { struct list_head list; - struct list_head rlist; - struct kref refcnt; + uuid_t id; + struct cifs_ses *sessions[CACHE_MAX_ENTRIES]; + int num_sessions; + spinlock_t lock; + struct list_head refresh_list; + struct kref refcount; }; static struct kmem_cache *cache_slab __read_mostly; @@ -64,7 +65,7 @@ static struct workqueue_struct *dfscache_wq __read_mostly; static int cache_ttl; static DEFINE_SPINLOCK(cache_ttl_lock); -static struct nls_table *cache_nlsc; +static struct nls_table *cache_cp; /* * Number of entries in the cache @@ -74,34 +75,145 @@ static atomic_t cache_count; static struct hlist_head cache_htable[CACHE_HTABLE_SIZE]; static DECLARE_RWSEM(htable_rw_lock); -static LIST_HEAD(vol_list); -static DEFINE_SPINLOCK(vol_list_lock); +static LIST_HEAD(mount_group_list); +static DEFINE_MUTEX(mount_group_list_lock); static void refresh_cache_worker(struct work_struct *work); static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker); -static int get_normalized_path(const char *path, const char **npath) +static void get_ipc_unc(const char *ref_path, char *ipc, size_t ipclen) { - if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/')) - return -EINVAL; + const char *host; + size_t len; - if (*path == '\\') { - *npath = path; - } else { - char *s = kstrdup(path, GFP_KERNEL); - if (!s) - return -ENOMEM; - convert_delimiter(s, '\\'); - *npath = s; + extract_unc_hostname(ref_path, &host, &len); + scnprintf(ipc, ipclen, "\\\\%.*s\\IPC$", (int)len, host); +} + +static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const char *path) +{ + char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0}; + + get_ipc_unc(path, unc, sizeof(unc)); + for (; *ses; ses++) { + if (!strcasecmp(unc, (*ses)->tcon_ipc->treeName)) + return *ses; } - return 0; + return ERR_PTR(-ENOENT); +} + +static void __mount_group_release(struct mount_group *mg) +{ + int i; + + for (i = 0; i < mg->num_sessions; i++) + cifs_put_smb_ses(mg->sessions[i]); + kfree(mg); +} + +static void mount_group_release(struct kref *kref) +{ + struct mount_group *mg = container_of(kref, struct mount_group, refcount); + + mutex_lock(&mount_group_list_lock); + list_del(&mg->list); + mutex_unlock(&mount_group_list_lock); + __mount_group_release(mg); +} + +static struct mount_group *find_mount_group_locked(const uuid_t *id) +{ + struct mount_group *mg; + + list_for_each_entry(mg, &mount_group_list, list) { + if (uuid_equal(&mg->id, id)) + return mg; + } + return ERR_PTR(-ENOENT); +} + +static struct mount_group *__get_mount_group_locked(const uuid_t *id) +{ + struct mount_group *mg; + + mg = find_mount_group_locked(id); + if (!IS_ERR(mg)) + return mg; + + mg = kmalloc(sizeof(*mg), GFP_KERNEL); + if (!mg) + return ERR_PTR(-ENOMEM); + kref_init(&mg->refcount); + uuid_copy(&mg->id, id); + mg->num_sessions = 0; + spin_lock_init(&mg->lock); + list_add(&mg->list, &mount_group_list); + return mg; +} + +static struct mount_group *get_mount_group(const uuid_t *id) +{ + struct mount_group *mg; + + mutex_lock(&mount_group_list_lock); + mg = __get_mount_group_locked(id); + if (!IS_ERR(mg)) + kref_get(&mg->refcount); + mutex_unlock(&mount_group_list_lock); + + return mg; } -static inline void free_normalized_path(const char *path, const char *npath) +static void free_mount_group_list(void) { - if (path != npath) - kfree(npath); + struct mount_group *mg, *tmp_mg; + + list_for_each_entry_safe(mg, tmp_mg, &mount_group_list, list) { + list_del_init(&mg->list); + __mount_group_release(mg); + } +} + +/** + * dfs_cache_canonical_path - get a canonical DFS path + * + * @path: DFS path + * @cp: codepage + * @remap: mapping type + * + * Return canonical path if success, otherwise error. + */ +char *dfs_cache_canonical_path(const char *path, const struct nls_table *cp, int remap) +{ + char *tmp; + int plen = 0; + char *npath; + + if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/')) + return ERR_PTR(-EINVAL); + + if (unlikely(strcmp(cp->charset, cache_cp->charset))) { + tmp = (char *)cifs_strndup_to_utf16(path, strlen(path), &plen, cp, remap); + if (!tmp) { + cifs_dbg(VFS, "%s: failed to convert path to utf16\n", __func__); + return ERR_PTR(-EINVAL); + } + + npath = cifs_strndup_from_utf16(tmp, plen, true, cache_cp); + kfree(tmp); + + if (!npath) { + cifs_dbg(VFS, "%s: failed to convert path from utf16\n", __func__); + return ERR_PTR(-EINVAL); + } + } else { + npath = kstrdup(path, GFP_KERNEL); + if (!npath) + return ERR_PTR(-ENOMEM); + } + convert_delimiter(npath, '\\'); + return npath; } static inline bool cache_entry_expired(const struct cache_entry *ce) @@ -171,7 +283,7 @@ static int dfscache_proc_show(struct seq_file *m, void *v) "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->ref_flags, ce->hdr_flags, - IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", + IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); list_for_each_entry(t, &ce->tlist, list) { @@ -240,7 +352,7 @@ static inline void dump_ce(const struct cache_entry *ce) ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags, - IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", + IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); dump_tgts(ce); @@ -284,8 +396,7 @@ int dfs_cache_init(void) int rc; int i; - dfscache_wq = alloc_workqueue("cifs-dfscache", - WQ_FREEZABLE | WQ_MEM_RECLAIM, 1); + dfscache_wq = alloc_workqueue("cifs-dfscache", WQ_FREEZABLE | WQ_UNBOUND, 1); if (!dfscache_wq) return -ENOMEM; @@ -301,7 +412,9 @@ int dfs_cache_init(void) INIT_HLIST_HEAD(&cache_htable[i]); atomic_set(&cache_count, 0); - cache_nlsc = load_nls_default(); + cache_cp = load_nls("utf8"); + if (!cache_cp) + cache_cp = load_nls_default(); cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__); return 0; @@ -311,23 +424,24 @@ out_destroy_wq: return rc; } -static inline unsigned int cache_entry_hash(const void *data, int size) +static int cache_entry_hash(const void *data, int size, unsigned int *hash) { - unsigned int h; - - h = jhash(data, size, 0); - return h & (CACHE_HTABLE_SIZE - 1); -} - -/* Check whether second path component of @path is SYSVOL or NETLOGON */ -static inline bool is_sysvol_or_netlogon(const char *path) -{ - const char *s; - char sep = path[0]; - - s = strchr(path + 1, sep) + 1; - return !strncasecmp(s, "sysvol", strlen("sysvol")) || - !strncasecmp(s, "netlogon", strlen("netlogon")); + int i, clen; + const unsigned char *s = data; + wchar_t c; + unsigned int h = 0; + + for (i = 0; i < size; i += clen) { + clen = cache_cp->char2uni(&s[i], size - i, &c); + if (unlikely(clen < 0)) { + cifs_dbg(VFS, "%s: can't convert char\n", __func__); + return clen; + } + c = cifs_toupper(c); + h = jhash(&c, sizeof(c), h); + } + *hash = h % CACHE_HTABLE_SIZE; + return 0; } /* Return target hint of a DFS cache entry */ @@ -378,7 +492,7 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, { int i; - ce->ttl = refs[0].ttl; + ce->ttl = max_t(int, refs[0].ttl, CACHE_MIN_TTL); ce->etime = get_expire_time(ce->ttl); ce->srvtype = refs[0].server_type; ce->hdr_flags = refs[0].flags; @@ -409,9 +523,7 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, } /* Allocate a new cache entry */ -static struct cache_entry *alloc_cache_entry(const char *path, - const struct dfs_info3_param *refs, - int numrefs) +static struct cache_entry *alloc_cache_entry(struct dfs_info3_param *refs, int numrefs) { struct cache_entry *ce; int rc; @@ -420,11 +532,9 @@ static struct cache_entry *alloc_cache_entry(const char *path, if (!ce) return ERR_PTR(-ENOMEM); - ce->path = kstrdup(path, GFP_KERNEL); - if (!ce->path) { - kmem_cache_free(cache_slab, ce); - return ERR_PTR(-ENOMEM); - } + ce->path = refs[0].path_name; + refs[0].path_name = NULL; + INIT_HLIST_NODE(&ce->hlist); INIT_LIST_HEAD(&ce->tlist); @@ -437,13 +547,14 @@ static struct cache_entry *alloc_cache_entry(const char *path, return ce; } -/* Must be called with htable_rw_lock held */ -static void remove_oldest_entry(void) +static void remove_oldest_entry_locked(void) { int i; struct cache_entry *ce; struct cache_entry *to_del = NULL; + WARN_ON(!rwsem_is_locked(&htable_rw_lock)); + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { struct hlist_head *l = &cache_htable[i]; @@ -467,12 +578,24 @@ static void remove_oldest_entry(void) } /* Add a new DFS cache entry */ -static int add_cache_entry(const char *path, unsigned int hash, - struct dfs_info3_param *refs, int numrefs) +static int add_cache_entry_locked(struct dfs_info3_param *refs, int numrefs) { + int rc; struct cache_entry *ce; + unsigned int hash; + + WARN_ON(!rwsem_is_locked(&htable_rw_lock)); + + if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) { + cifs_dbg(FYI, "%s: reached max cache size (%d)\n", __func__, CACHE_MAX_ENTRIES); + remove_oldest_entry_locked(); + } - ce = alloc_cache_entry(path, refs, numrefs); + rc = cache_entry_hash(refs[0].path_name, strlen(refs[0].path_name), &hash); + if (rc) + return rc; + + ce = alloc_cache_entry(refs, numrefs); if (IS_ERR(ce)) return PTR_ERR(ce); @@ -486,65 +609,77 @@ static int add_cache_entry(const char *path, unsigned int hash, } spin_unlock(&cache_ttl_lock); - down_write(&htable_rw_lock); hlist_add_head(&ce->hlist, &cache_htable[hash]); dump_ce(ce); - up_write(&htable_rw_lock); + + atomic_inc(&cache_count); return 0; } -static struct cache_entry *__lookup_cache_entry(const char *path) +/* Check if two DFS paths are equal. @s1 and @s2 are expected to be in @cache_cp's charset */ +static bool dfs_path_equal(const char *s1, int len1, const char *s2, int len2) { - struct cache_entry *ce; - unsigned int h; - bool found = false; + int i, l1, l2; + wchar_t c1, c2; - h = cache_entry_hash(path, strlen(path)); + if (len1 != len2) + return false; - hlist_for_each_entry(ce, &cache_htable[h], hlist) { - if (!strcasecmp(path, ce->path)) { - found = true; - dump_ce(ce); - break; + for (i = 0; i < len1; i += l1) { + l1 = cache_cp->char2uni(&s1[i], len1 - i, &c1); + l2 = cache_cp->char2uni(&s2[i], len2 - i, &c2); + if (unlikely(l1 < 0 && l2 < 0)) { + if (s1[i] != s2[i]) + return false; + l1 = 1; + continue; } + if (l1 != l2) + return false; + if (cifs_toupper(c1) != cifs_toupper(c2)) + return false; } + return true; +} - if (!found) - ce = ERR_PTR(-ENOENT); - return ce; +static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int hash, int len) +{ + struct cache_entry *ce; + + hlist_for_each_entry(ce, &cache_htable[hash], hlist) { + if (dfs_path_equal(ce->path, strlen(ce->path), path, len)) { + dump_ce(ce); + return ce; + } + } + return ERR_PTR(-EEXIST); } /* - * Find a DFS cache entry in hash table and optionally check prefix path against - * @path. - * Use whole path components in the match. - * Must be called with htable_rw_lock held. + * Find a DFS cache entry in hash table and optionally check prefix path against normalized @path. + * + * Use whole path components in the match. Must be called with htable_rw_lock held. * - * Return ERR_PTR(-ENOENT) if the entry is not found. + * Return ERR_PTR(-EEXIST) if the entry is not found. */ -static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *hash) +static struct cache_entry *lookup_cache_entry(const char *path) { - struct cache_entry *ce = ERR_PTR(-ENOENT); - unsigned int h; + struct cache_entry *ce; int cnt = 0; - char *npath; - char *s, *e; - char sep; - - npath = kstrdup(path, GFP_KERNEL); - if (!npath) - return ERR_PTR(-ENOMEM); + const char *s = path, *e; + char sep = *s; + unsigned int hash; + int rc; - s = npath; - sep = *npath; while ((s = strchr(s, sep)) && ++cnt < 3) s++; if (cnt < 3) { - h = cache_entry_hash(path, strlen(path)); - ce = __lookup_cache_entry(path); - goto out; + rc = cache_entry_hash(path, strlen(path), &hash); + if (rc) + return ERR_PTR(rc); + return __lookup_cache_entry(path, hash, strlen(path)); } /* * Handle paths that have more than two path components and are a complete prefix of the DFS @@ -552,64 +687,29 @@ static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *ha * * See MS-DFSC 3.2.5.5 "Receiving a Root Referral Request or Link Referral Request". */ - h = cache_entry_hash(npath, strlen(npath)); - e = npath + strlen(npath) - 1; + e = path + strlen(path) - 1; while (e > s) { - char tmp; + int len; /* skip separators */ while (e > s && *e == sep) e--; if (e == s) - goto out; - - tmp = *(e+1); - *(e+1) = 0; - - ce = __lookup_cache_entry(npath); - if (!IS_ERR(ce)) { - h = cache_entry_hash(npath, strlen(npath)); break; - } - *(e+1) = tmp; + len = e + 1 - path; + rc = cache_entry_hash(path, len, &hash); + if (rc) + return ERR_PTR(rc); + ce = __lookup_cache_entry(path, hash, len); + if (!IS_ERR(ce)) + return ce; + /* backward until separator */ while (e > s && *e != sep) e--; } -out: - if (hash) - *hash = h; - kfree(npath); - return ce; -} - -static void __vol_release(struct vol_info *vi) -{ - kfree(vi->fullpath); - kfree(vi->mntdata); - smb3_cleanup_fs_context_contents(&vi->ctx); - kfree(vi); -} - -static void vol_release(struct kref *kref) -{ - struct vol_info *vi = container_of(kref, struct vol_info, refcnt); - - spin_lock(&vol_list_lock); - list_del(&vi->list); - spin_unlock(&vol_list_lock); - __vol_release(vi); -} - -static inline void free_vol_list(void) -{ - struct vol_info *vi, *nvi; - - list_for_each_entry_safe(vi, nvi, &vol_list, list) { - list_del_init(&vi->list); - __vol_release(vi); - } + return ERR_PTR(-EEXIST); } /** @@ -618,8 +718,8 @@ static inline void free_vol_list(void) void dfs_cache_destroy(void) { cancel_delayed_work_sync(&refresh_task); - unload_nls(cache_nlsc); - free_vol_list(); + unload_nls(cache_cp); + free_mount_group_list(); flush_cache_ents(); kmem_cache_destroy(cache_slab); destroy_workqueue(dfscache_wq); @@ -627,18 +727,14 @@ void dfs_cache_destroy(void) cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__); } -/* Must be called with htable_rw_lock held */ -static int __update_cache_entry(const char *path, - const struct dfs_info3_param *refs, - int numrefs) +/* Update a cache entry with the new referral in @refs */ +static int update_cache_entry_locked(struct cache_entry *ce, const struct dfs_info3_param *refs, + int numrefs) { int rc; - struct cache_entry *ce; char *s, *th = NULL; - ce = lookup_cache_entry(path, NULL); - if (IS_ERR(ce)) - return PTR_ERR(ce); + WARN_ON(!rwsem_is_locked(&htable_rw_lock)); if (ce->tgthint) { s = ce->tgthint->name; @@ -657,37 +753,30 @@ static int __update_cache_entry(const char *path, return rc; } -static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, struct dfs_info3_param **refs, - int *numrefs) +static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, const char *path, + struct dfs_info3_param **refs, int *numrefs) { - cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path); + int rc; + int i; - if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) - return -EOPNOTSUPP; - if (unlikely(!nls_codepage)) - return -EINVAL; + cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path); *refs = NULL; *numrefs = 0; - return ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs, - nls_codepage, remap); -} - -/* Update an expired cache entry by getting a new DFS referral from server */ -static int update_cache_entry(const char *path, - const struct dfs_info3_param *refs, - int numrefs) -{ - - int rc; + if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) + return -EOPNOTSUPP; + if (unlikely(!cache_cp)) + return -EINVAL; - down_write(&htable_rw_lock); - rc = __update_cache_entry(path, refs, numrefs); - up_write(&htable_rw_lock); + rc = ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs, cache_cp, + NO_MAP_UNI_RSVD); + if (!rc) { + struct dfs_info3_param *ref = *refs; + for (i = 0; i < *numrefs; i++) + convert_delimiter(ref[i].path_name, '\\'); + } return rc; } @@ -697,15 +786,12 @@ static int update_cache_entry(const char *path, * If the entry wasn't found, it will create a new one. Or if it was found but * expired, then it will update the entry accordingly. * - * For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to + * For interlinks, cifs_mount() and expand_dfs_referral() are supposed to * handle them properly. */ -static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, bool noreq) +static int cache_refresh_path(const unsigned int xid, struct cifs_ses *ses, const char *path) { int rc; - unsigned int hash; struct cache_entry *ce; struct dfs_info3_param *refs = NULL; int numrefs = 0; @@ -713,62 +799,38 @@ static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "%s: search path: %s\n", __func__, path); - down_read(&htable_rw_lock); - - ce = lookup_cache_entry(path, &hash); - - /* - * If @noreq is set, no requests will be sent to the server. Just return - * the cache entry. - */ - if (noreq) { - up_read(&htable_rw_lock); - return PTR_ERR_OR_ZERO(ce); - } + down_write(&htable_rw_lock); + ce = lookup_cache_entry(path); if (!IS_ERR(ce)) { if (!cache_entry_expired(ce)) { dump_ce(ce); - up_read(&htable_rw_lock); + up_write(&htable_rw_lock); return 0; } } else { newent = true; } - up_read(&htable_rw_lock); - /* - * No entry was found. - * - * Request a new DFS referral in order to create a new cache entry, or - * updating an existing one. + * Either the entry was not found, or it is expired. + * Request a new DFS referral in order to create or update a cache entry. */ - rc = get_dfs_referral(xid, ses, nls_codepage, remap, path, - &refs, &numrefs); + rc = get_dfs_referral(xid, ses, path, &refs, &numrefs); if (rc) - return rc; + goto out_unlock; dump_refs(refs, numrefs); if (!newent) { - rc = update_cache_entry(path, refs, numrefs); - goto out_free_refs; - } - - if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) { - cifs_dbg(FYI, "%s: reached max cache size (%d)\n", - __func__, CACHE_MAX_ENTRIES); - down_write(&htable_rw_lock); - remove_oldest_entry(); - up_write(&htable_rw_lock); + rc = update_cache_entry_locked(ce, refs, numrefs); + goto out_unlock; } - rc = add_cache_entry(path, hash, refs, numrefs); - if (!rc) - atomic_inc(&cache_count); + rc = add_cache_entry_locked(refs, numrefs); -out_free_refs: +out_unlock: + up_write(&htable_rw_lock); free_dfs_info_array(refs, numrefs); return rc; } @@ -868,7 +930,7 @@ err_free_it: * needs to be issued: * @xid: syscall xid * @ses: smb session to issue the request on - * @nls_codepage: charset conversion + * @cp: codepage * @remap: path character remapping type * @path: path to lookup in DFS referral cache. * @@ -877,26 +939,25 @@ err_free_it: * * Return zero if the target was found, otherwise non-zero. */ -int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, struct dfs_info3_param *ref, +int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *cp, + int remap, const char *path, struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tgt_list) { int rc; const char *npath; struct cache_entry *ce; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; + npath = dfs_cache_canonical_path(path, cp, remap); + if (IS_ERR(npath)) + return PTR_ERR(npath); - rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + rc = cache_refresh_path(xid, ses, npath); if (rc) goto out_free_path; down_read(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(npath); if (IS_ERR(ce)) { up_read(&htable_rw_lock); rc = PTR_ERR(ce); @@ -913,7 +974,7 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, up_read(&htable_rw_lock); out_free_path: - free_normalized_path(path, npath); + kfree(npath); return rc; } @@ -925,7 +986,7 @@ out_free_path: * expired, nor create a new cache entry if @path hasn't been found. It heavily * relies on an existing cache entry. * - * @path: path to lookup in the DFS referral cache. + * @path: canonical DFS path to lookup in the DFS referral cache. * @ref: when non-NULL, store single DFS referral result in it. * @tgt_list: when non-NULL, store complete DFS target list in it. * @@ -937,18 +998,13 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tgt_list) { int rc; - const char *npath; struct cache_entry *ce; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; - - cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + cifs_dbg(FYI, "%s: path: %s\n", __func__, path); down_read(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(path); if (IS_ERR(ce)) { rc = PTR_ERR(ce); goto out_unlock; @@ -963,8 +1019,6 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, out_unlock: up_read(&htable_rw_lock); - free_normalized_path(path, npath); - return rc; } @@ -979,16 +1033,15 @@ out_unlock: * * @xid: syscall id * @ses: smb session - * @nls_codepage: charset conversion + * @cp: codepage * @remap: type of character remapping for paths - * @path: path to lookup in DFS referral cache. + * @path: path to lookup in DFS referral cache * @it: DFS target iterator * * Return zero if the target hint was updated successfully, otherwise non-zero. */ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, + const struct nls_table *cp, int remap, const char *path, const struct dfs_cache_tgt_iterator *it) { int rc; @@ -996,19 +1049,19 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, struct cache_entry *ce; struct cache_dfs_tgt *t; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; + npath = dfs_cache_canonical_path(path, cp, remap); + if (IS_ERR(npath)) + return PTR_ERR(npath); cifs_dbg(FYI, "%s: update target hint - path: %s\n", __func__, npath); - rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + rc = cache_refresh_path(xid, ses, npath); if (rc) goto out_free_path; down_write(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(npath); if (IS_ERR(ce)) { rc = PTR_ERR(ce); goto out_unlock; @@ -1031,8 +1084,7 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, out_unlock: up_write(&htable_rw_lock); out_free_path: - free_normalized_path(path, npath); - + kfree(npath); return rc; } @@ -1044,32 +1096,26 @@ out_free_path: * expired, nor create a new cache entry if @path hasn't been found. It heavily * relies on an existing cache entry. * - * @path: path to lookup in DFS referral cache. + * @path: canonical DFS path to lookup in DFS referral cache. * @it: target iterator which contains the target hint to update the cache * entry with. * * Return zero if the target hint was updated successfully, otherwise non-zero. */ -int dfs_cache_noreq_update_tgthint(const char *path, - const struct dfs_cache_tgt_iterator *it) +int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it) { int rc; - const char *npath; struct cache_entry *ce; struct cache_dfs_tgt *t; if (!it) return -EINVAL; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; - - cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + cifs_dbg(FYI, "%s: path: %s\n", __func__, path); down_write(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(path); if (IS_ERR(ce)) { rc = PTR_ERR(ce); goto out_unlock; @@ -1092,8 +1138,6 @@ int dfs_cache_noreq_update_tgthint(const char *path, out_unlock: up_write(&htable_rw_lock); - free_normalized_path(path, npath); - return rc; } @@ -1101,32 +1145,26 @@ out_unlock: * dfs_cache_get_tgt_referral - returns a DFS referral (@ref) from a given * target iterator (@it). * - * @path: path to lookup in DFS referral cache. + * @path: canonical DFS path to lookup in DFS referral cache. * @it: DFS target iterator. * @ref: DFS referral pointer to set up the gathered information. * * Return zero if the DFS referral was set up correctly, otherwise non-zero. */ -int dfs_cache_get_tgt_referral(const char *path, - const struct dfs_cache_tgt_iterator *it, +int dfs_cache_get_tgt_referral(const char *path, const struct dfs_cache_tgt_iterator *it, struct dfs_info3_param *ref) { int rc; - const char *npath; struct cache_entry *ce; if (!it || !ref) return -EINVAL; - rc = get_normalized_path(path, &npath); - if (rc) - return rc; - - cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + cifs_dbg(FYI, "%s: path: %s\n", __func__, path); down_read(&htable_rw_lock); - ce = lookup_cache_entry(npath, NULL); + ce = lookup_cache_entry(path); if (IS_ERR(ce)) { rc = PTR_ERR(ce); goto out_unlock; @@ -1138,132 +1176,55 @@ int dfs_cache_get_tgt_referral(const char *path, out_unlock: up_read(&htable_rw_lock); - free_normalized_path(path, npath); - return rc; } /** - * dfs_cache_add_vol - add a cifs context during mount() that will be handled by - * DFS cache refresh worker. - * - * @mntdata: mount data. - * @ctx: cifs context. - * @fullpath: origin full path. + * dfs_cache_add_refsrv_session - add SMB session of referral server * - * Return zero if context was set up correctly, otherwise non-zero. + * @mount_id: mount group uuid to lookup. + * @ses: reference counted SMB session of referral server. */ -int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx, const char *fullpath) +void dfs_cache_add_refsrv_session(const uuid_t *mount_id, struct cifs_ses *ses) { - int rc; - struct vol_info *vi; - - if (!ctx || !fullpath || !mntdata) - return -EINVAL; - - cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - - vi = kzalloc(sizeof(*vi), GFP_KERNEL); - if (!vi) - return -ENOMEM; + struct mount_group *mg; - vi->fullpath = kstrdup(fullpath, GFP_KERNEL); - if (!vi->fullpath) { - rc = -ENOMEM; - goto err_free_vi; - } - - rc = smb3_fs_context_dup(&vi->ctx, ctx); - if (rc) - goto err_free_fullpath; - - vi->mntdata = mntdata; - spin_lock_init(&vi->ctx_lock); - kref_init(&vi->refcnt); - - spin_lock(&vol_list_lock); - list_add_tail(&vi->list, &vol_list); - spin_unlock(&vol_list_lock); - - return 0; - -err_free_fullpath: - kfree(vi->fullpath); -err_free_vi: - kfree(vi); - return rc; -} + if (WARN_ON_ONCE(!mount_id || uuid_is_null(mount_id) || !ses)) + return; -/* Must be called with vol_list_lock held */ -static struct vol_info *find_vol(const char *fullpath) -{ - struct vol_info *vi; + mg = get_mount_group(mount_id); + if (WARN_ON_ONCE(IS_ERR(mg))) + return; - list_for_each_entry(vi, &vol_list, list) { - cifs_dbg(FYI, "%s: vi->fullpath: %s\n", __func__, vi->fullpath); - if (!strcasecmp(vi->fullpath, fullpath)) - return vi; - } - return ERR_PTR(-ENOENT); + spin_lock(&mg->lock); + if (mg->num_sessions < ARRAY_SIZE(mg->sessions)) + mg->sessions[mg->num_sessions++] = ses; + spin_unlock(&mg->lock); + kref_put(&mg->refcount, mount_group_release); } /** - * dfs_cache_update_vol - update vol info in DFS cache after failover + * dfs_cache_put_refsrv_sessions - put all referral server sessions * - * @fullpath: fullpath to look up in volume list. - * @server: TCP ses pointer. + * Put all SMB sessions from the given mount group id. * - * Return zero if volume was updated, otherwise non-zero. + * @mount_id: mount group uuid to lookup. */ -int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server) +void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id) { - struct vol_info *vi; - - if (!fullpath || !server) - return -EINVAL; - - cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - - spin_lock(&vol_list_lock); - vi = find_vol(fullpath); - if (IS_ERR(vi)) { - spin_unlock(&vol_list_lock); - return PTR_ERR(vi); - } - kref_get(&vi->refcnt); - spin_unlock(&vol_list_lock); - - cifs_dbg(FYI, "%s: updating volume info\n", __func__); - spin_lock(&vi->ctx_lock); - memcpy(&vi->ctx.dstaddr, &server->dstaddr, - sizeof(vi->ctx.dstaddr)); - spin_unlock(&vi->ctx_lock); + struct mount_group *mg; - kref_put(&vi->refcnt, vol_release); - - return 0; -} - -/** - * dfs_cache_del_vol - remove volume info in DFS cache during umount() - * - * @fullpath: fullpath to look up in volume list. - */ -void dfs_cache_del_vol(const char *fullpath) -{ - struct vol_info *vi; - - if (!fullpath || !*fullpath) + if (!mount_id || uuid_is_null(mount_id)) return; - cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - - spin_lock(&vol_list_lock); - vi = find_vol(fullpath); - spin_unlock(&vol_list_lock); - - if (!IS_ERR(vi)) - kref_put(&vi->refcnt, vol_release); + mutex_lock(&mount_group_list_lock); + mg = find_mount_group_locked(mount_id); + if (IS_ERR(mg)) { + mutex_unlock(&mount_group_list_lock); + return; + } + mutex_unlock(&mount_group_list_lock); + kref_put(&mg->refcount, mount_group_release); } /** @@ -1276,8 +1237,8 @@ void dfs_cache_del_vol(const char *fullpath) * * Return zero if target was parsed correctly, otherwise non-zero. */ -int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, - char **share, char **prefix) +int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share, + char **prefix) { char *s, sep, *p; size_t len; @@ -1332,278 +1293,190 @@ int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, return 0; } -/* Get all tcons that are within a DFS namespace and can be refreshed */ -static void get_tcons(struct TCP_Server_Info *server, struct list_head *head) +/* + * Refresh all active dfs mounts regardless of whether they are in cache or not. + * (cache can be cleared) + */ +static void refresh_mounts(struct cifs_ses **sessions) { + struct TCP_Server_Info *server; struct cifs_ses *ses; - struct cifs_tcon *tcon; + struct cifs_tcon *tcon, *ntcon; + struct list_head tcons; + unsigned int xid; - INIT_LIST_HEAD(head); + INIT_LIST_HEAD(&tcons); spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { - list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { - if (!tcon->need_reconnect && !tcon->need_reopen_files && - tcon->dfs_path) { - tcon->tc_count++; - list_add_tail(&tcon->ulist, head); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (tcon->dfs_path) { + tcon->tc_count++; + list_add_tail(&tcon->ulist, &tcons); + } } } - if (ses->tcon_ipc && !ses->tcon_ipc->need_reconnect && - ses->tcon_ipc->dfs_path) { - list_add_tail(&ses->tcon_ipc->ulist, head); - } } spin_unlock(&cifs_tcp_ses_lock); -} -static bool is_dfs_link(const char *path) -{ - char *s; - - s = strchr(path + 1, '\\'); - if (!s) - return false; - return !!strchr(s + 1, '\\'); -} - -static char *get_dfs_root(const char *path) -{ - char *s, *npath; - - s = strchr(path + 1, '\\'); - if (!s) - return ERR_PTR(-EINVAL); - - s = strchr(s + 1, '\\'); - if (!s) - return ERR_PTR(-EINVAL); - - npath = kstrndup(path, s - path, GFP_KERNEL); - if (!npath) - return ERR_PTR(-ENOMEM); + list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) { + const char *path = tcon->dfs_path + 1; + struct cache_entry *ce; + struct dfs_info3_param *refs = NULL; + int numrefs = 0; + bool needs_refresh = false; + int rc = 0; - return npath; -} + list_del_init(&tcon->ulist); -static inline void put_tcp_server(struct TCP_Server_Info *server) -{ - cifs_put_tcp_session(server, 0); -} + ses = find_ipc_from_server_path(sessions, path); + if (IS_ERR(ses)) + goto next_tcon; -static struct TCP_Server_Info *get_tcp_server(struct smb3_fs_context *ctx) -{ - struct TCP_Server_Info *server; + down_read(&htable_rw_lock); + ce = lookup_cache_entry(path); + needs_refresh = IS_ERR(ce) || cache_entry_expired(ce); + up_read(&htable_rw_lock); - server = cifs_find_tcp_session(ctx); - if (IS_ERR_OR_NULL(server)) - return NULL; + if (!needs_refresh) + goto next_tcon; + + xid = get_xid(); + rc = get_dfs_referral(xid, ses, path, &refs, &numrefs); + free_xid(xid); + + /* Create or update a cache entry with the new referral */ + if (!rc) { + down_write(&htable_rw_lock); + ce = lookup_cache_entry(path); + if (IS_ERR(ce)) + add_cache_entry_locked(refs, numrefs); + else if (cache_entry_expired(ce)) + update_cache_entry_locked(ce, refs, numrefs); + up_write(&htable_rw_lock); + } - spin_lock(&GlobalMid_Lock); - if (server->tcpStatus != CifsGood) { - spin_unlock(&GlobalMid_Lock); - put_tcp_server(server); - return NULL; +next_tcon: + free_dfs_info_array(refs, numrefs); + cifs_put_tcon(tcon); } - spin_unlock(&GlobalMid_Lock); - - return server; } -/* Find root SMB session out of a DFS link path */ -static struct cifs_ses *find_root_ses(struct vol_info *vi, - struct cifs_tcon *tcon, - const char *path) +static void refresh_cache(struct cifs_ses **sessions) { - char *rpath; - int rc; - struct cache_entry *ce; - struct dfs_info3_param ref = {0}; - char *mdata = NULL, *devname = NULL; - struct TCP_Server_Info *server; + int i; struct cifs_ses *ses; - struct smb3_fs_context ctx = {NULL}; + unsigned int xid; + char *ref_paths[CACHE_MAX_ENTRIES]; + int count = 0; + struct cache_entry *ce; - rpath = get_dfs_root(path); - if (IS_ERR(rpath)) - return ERR_CAST(rpath); + /* + * Refresh all cached entries. Get all new referrals outside critical section to avoid + * starvation while performing SMB2 IOCTL on broken or slow connections. + * The cache entries may cover more paths than the active mounts + * (e.g. domain-based DFS referrals or multi tier DFS setups). + */ down_read(&htable_rw_lock); + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { + struct hlist_head *l = &cache_htable[i]; - ce = lookup_cache_entry(rpath, NULL); - if (IS_ERR(ce)) { - up_read(&htable_rw_lock); - ses = ERR_CAST(ce); - goto out; - } - - rc = setup_referral(path, ce, &ref, get_tgt_name(ce)); - if (rc) { - up_read(&htable_rw_lock); - ses = ERR_PTR(rc); - goto out; + hlist_for_each_entry(ce, l, hlist) { + if (count == ARRAY_SIZE(ref_paths)) + goto out_unlock; + if (hlist_unhashed(&ce->hlist) || !cache_entry_expired(ce) || + IS_ERR(find_ipc_from_server_path(sessions, ce->path))) + continue; + ref_paths[count++] = kstrdup(ce->path, GFP_ATOMIC); + } } +out_unlock: up_read(&htable_rw_lock); - mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref, - &devname); - free_dfs_info_param(&ref); - - if (IS_ERR(mdata)) { - ses = ERR_CAST(mdata); - mdata = NULL; - goto out; - } - - rc = cifs_setup_volume_info(&ctx, NULL, devname); - - if (rc) { - ses = ERR_PTR(rc); - goto out; - } - - server = get_tcp_server(&ctx); - if (!server) { - ses = ERR_PTR(-EHOSTDOWN); - goto out; - } - - ses = cifs_get_smb_ses(server, &ctx); - -out: - smb3_cleanup_fs_context_contents(&ctx); - kfree(mdata); - kfree(rpath); - kfree(devname); - - return ses; -} - -/* Refresh DFS cache entry from a given tcon */ -static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon) -{ - int rc = 0; - unsigned int xid; - const char *path, *npath; - struct cache_entry *ce; - struct cifs_ses *root_ses = NULL, *ses; - struct dfs_info3_param *refs = NULL; - int numrefs = 0; - - xid = get_xid(); - - path = tcon->dfs_path + 1; + for (i = 0; i < count; i++) { + char *path = ref_paths[i]; + struct dfs_info3_param *refs = NULL; + int numrefs = 0; + int rc = 0; - rc = get_normalized_path(path, &npath); - if (rc) - goto out_free_xid; - - down_read(&htable_rw_lock); - - ce = lookup_cache_entry(npath, NULL); - if (IS_ERR(ce)) { - rc = PTR_ERR(ce); - up_read(&htable_rw_lock); - goto out_free_path; - } + if (!path) + continue; - if (!cache_entry_expired(ce)) { - up_read(&htable_rw_lock); - goto out_free_path; - } + ses = find_ipc_from_server_path(sessions, path); + if (IS_ERR(ses)) + goto next_referral; - up_read(&htable_rw_lock); + xid = get_xid(); + rc = get_dfs_referral(xid, ses, path, &refs, &numrefs); + free_xid(xid); - /* If it's a DFS Link, then use root SMB session for refreshing it */ - if (is_dfs_link(npath)) { - ses = root_ses = find_root_ses(vi, tcon, npath); - if (IS_ERR(ses)) { - rc = PTR_ERR(ses); - root_ses = NULL; - goto out_free_path; + if (!rc) { + down_write(&htable_rw_lock); + ce = lookup_cache_entry(path); + /* + * We need to re-check it because other tasks might have it deleted or + * updated. + */ + if (!IS_ERR(ce) && cache_entry_expired(ce)) + update_cache_entry_locked(ce, refs, numrefs); + up_write(&htable_rw_lock); } - } else { - ses = tcon->ses; - } - rc = get_dfs_referral(xid, ses, cache_nlsc, tcon->remap, npath, &refs, - &numrefs); - if (!rc) { - dump_refs(refs, numrefs); - rc = update_cache_entry(npath, refs, numrefs); +next_referral: + kfree(path); free_dfs_info_array(refs, numrefs); } - - if (root_ses) - cifs_put_smb_ses(root_ses); - -out_free_path: - free_normalized_path(path, npath); - -out_free_xid: - free_xid(xid); - return rc; } /* - * Worker that will refresh DFS cache based on lowest TTL value from a DFS + * Worker that will refresh DFS cache and active mounts based on lowest TTL value from a DFS * referral. */ static void refresh_cache_worker(struct work_struct *work) { - struct vol_info *vi, *nvi; - struct TCP_Server_Info *server; - LIST_HEAD(vols); - LIST_HEAD(tcons); - struct cifs_tcon *tcon, *ntcon; - int rc; - - /* - * Find SMB volumes that are eligible (server->tcpStatus == CifsGood) - * for refreshing. - */ - spin_lock(&vol_list_lock); - list_for_each_entry(vi, &vol_list, list) { - server = get_tcp_server(&vi->ctx); - if (!server) - continue; - - kref_get(&vi->refcnt); - list_add_tail(&vi->rlist, &vols); - put_tcp_server(server); + struct list_head mglist; + struct mount_group *mg, *tmp_mg; + struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL}; + int max_sessions = ARRAY_SIZE(sessions) - 1; + int i = 0, count; + + INIT_LIST_HEAD(&mglist); + + /* Get refereces of mount groups */ + mutex_lock(&mount_group_list_lock); + list_for_each_entry(mg, &mount_group_list, list) { + kref_get(&mg->refcount); + list_add(&mg->refresh_list, &mglist); } - spin_unlock(&vol_list_lock); - - /* Walk through all TCONs and refresh any expired cache entry */ - list_for_each_entry_safe(vi, nvi, &vols, rlist) { - spin_lock(&vi->ctx_lock); - server = get_tcp_server(&vi->ctx); - spin_unlock(&vi->ctx_lock); + mutex_unlock(&mount_group_list_lock); - if (!server) - goto next_vol; - - get_tcons(server, &tcons); - rc = 0; - - list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) { - /* - * Skip tcp server if any of its tcons failed to refresh - * (possibily due to reconnects). - */ - if (!rc) - rc = refresh_tcon(vi, tcon); + /* Fill in local array with an NULL-terminated list of all referral server sessions */ + list_for_each_entry(mg, &mglist, refresh_list) { + if (i >= max_sessions) + break; - list_del_init(&tcon->ulist); - cifs_put_tcon(tcon); - } + spin_lock(&mg->lock); + if (i + mg->num_sessions > max_sessions) + count = max_sessions - i; + else + count = mg->num_sessions; + memcpy(&sessions[i], mg->sessions, count * sizeof(mg->sessions[0])); + spin_unlock(&mg->lock); + i += count; + } - put_tcp_server(server); + if (sessions[0]) { + /* Refresh all active mounts and cached entries */ + refresh_mounts(sessions); + refresh_cache(sessions); + } -next_vol: - list_del_init(&vi->rlist); - kref_put(&vi->refcnt, vol_release); + list_for_each_entry_safe(mg, tmp_mg, &mglist, refresh_list) { + list_del_init(&mg->refresh_list); + kref_put(&mg->refcount, mount_group_release); } spin_lock(&cache_ttl_lock); diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h index 1afc4f590c47..b29d3ae64829 100644 --- a/fs/cifs/dfs_cache.h +++ b/fs/cifs/dfs_cache.h @@ -10,6 +10,7 @@ #include <linux/nls.h> #include <linux/list.h> +#include <linux/uuid.h> #include "cifsglob.h" struct dfs_cache_tgt_list { @@ -23,34 +24,26 @@ struct dfs_cache_tgt_iterator { struct list_head it_list; }; -extern int dfs_cache_init(void); -extern void dfs_cache_destroy(void); +int dfs_cache_init(void); +void dfs_cache_destroy(void); extern const struct proc_ops dfscache_proc_ops; -extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, struct dfs_info3_param *ref, - struct dfs_cache_tgt_list *tgt_list); -extern int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, - struct dfs_cache_tgt_list *tgt_list); -extern int dfs_cache_update_tgthint(const unsigned int xid, - struct cifs_ses *ses, - const struct nls_table *nls_codepage, - int remap, const char *path, - const struct dfs_cache_tgt_iterator *it); -extern int -dfs_cache_noreq_update_tgthint(const char *path, - const struct dfs_cache_tgt_iterator *it); -extern int dfs_cache_get_tgt_referral(const char *path, - const struct dfs_cache_tgt_iterator *it, - struct dfs_info3_param *ref); -extern int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx, - const char *fullpath); -extern int dfs_cache_update_vol(const char *fullpath, - struct TCP_Server_Info *server); -extern void dfs_cache_del_vol(const char *fullpath); -extern int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, - char **share, char **prefix); +int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *cp, + int remap, const char *path, struct dfs_info3_param *ref, + struct dfs_cache_tgt_list *tgt_list); +int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, + struct dfs_cache_tgt_list *tgt_list); +int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *cp, int remap, const char *path, + const struct dfs_cache_tgt_iterator *it); +int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it); +int dfs_cache_get_tgt_referral(const char *path, const struct dfs_cache_tgt_iterator *it, + struct dfs_info3_param *ref); +int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share, + char **prefix); +void dfs_cache_put_refsrv_sessions(const uuid_t *mount_id); +void dfs_cache_add_refsrv_session(const uuid_t *mount_id, struct cifs_ses *ses); +char *dfs_cache_canonical_path(const char *path, const struct nls_table *cp, int remap); static inline struct dfs_cache_tgt_iterator * dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 6bcd3e8f7cda..79402ca0ddfa 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/dir.c * @@ -6,19 +7,6 @@ * Copyright (C) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> @@ -396,10 +384,11 @@ cifs_create_set_dentry: goto out_err; } - if (S_ISDIR(newinode->i_mode)) { - rc = -EISDIR; - goto out_err; - } + if (newinode) + if (S_ISDIR(newinode->i_mode)) { + rc = -EISDIR; + goto out_err; + } d_drop(direntry); d_add(direntry, newinode); @@ -630,6 +619,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct inode *newInode = NULL; const char *full_path; void *page; + int retry_count = 0; xid = get_xid(); @@ -673,6 +663,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, d_inode(direntry)); +again: if (pTcon->posix_extensions) rc = smb311_posix_get_inode_info(&newInode, full_path, parent_dir_inode->i_sb, xid); else if (pTcon->unix_ext) { @@ -687,6 +678,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, /* since paths are not looked up by component - the parent directories are presumed to be good here */ renew_parental_timestamps(direntry); + } else if (rc == -EAGAIN && retry_count++ < 10) { + goto again; } else if (rc == -ENOENT) { cifs_set_time(direntry, jiffies); newInode = NULL; diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 534cbba72789..d15b82d569ef 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/dns_resolve.c * @@ -10,19 +11,6 @@ * Contains the CIFS DFS upcall routines used for hostname to * IP address translation. * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/slab.h> diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index d3f5d27f4d06..5be060b82b13 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS * Handles host name to IP address resolution @@ -5,19 +6,6 @@ * Copyright (c) International Business Machines Corp., 2008 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _DNS_RESOLVE_H diff --git a/fs/cifs/export.c b/fs/cifs/export.c index eb0bb8ca8e63..747a540db954 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/export.c * @@ -8,19 +9,6 @@ * * Operations related to support for exporting files via NFSD * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 379a427f3c2f..cd108607a070 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/file.c * @@ -7,19 +8,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Jeremy Allison (jra@samba.org) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/backing-dev.h> diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 20d24af33ee2..dd625033cd6b 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/fscache.c - CIFS filesystem cache interface * * Copyright (c) 2010 Novell, Inc. * Author(s): Suresh Jayaraman <sjayaraman@suse.de> * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "fscache.h" #include "cifsglob.h" diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index e811f2dd7619..3d55cb2ef055 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/fscache.h - CIFS filesystem cache interface definitions * * Copyright (c) 2010 Novell, Inc. * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFS_FSCACHE_H #define _CIFS_FSCACHE_H diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 1dfa57982522..b96b253e7635 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/inode.c * * Copyright (C) International Business Machines Corp., 2002,2010 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> @@ -367,9 +355,12 @@ cifs_get_file_info_unix(struct file *filp) } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, inode->i_sb); rc = 0; - } + } else + goto cifs_gfiunix_out; rc = cifs_fattr_to_inode(inode, &fattr); + +cifs_gfiunix_out: free_xid(xid); return rc; } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index d67d281ab863..42c6a0bac6c8 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/ioctl.c * @@ -6,19 +7,6 @@ * Copyright (C) International Business Machines Corp., 2005,2013 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 970fcf2adb08..f0a6d63bc08c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/link.c * * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 7207a63819cb..184138b4eb8c 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/misc.c * * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/slab.h> diff --git a/fs/cifs/netlink.c b/fs/cifs/netlink.c index 5aaabe4cc0a7..291cb606f149 100644 --- a/fs/cifs/netlink.c +++ b/fs/cifs/netlink.c @@ -30,7 +30,7 @@ static const struct nla_policy cifs_genl_policy[CIFS_GENL_ATTR_MAX + 1] = { [CIFS_GENL_ATTR_SWN_RESOURCE_NAME] = { .type = NLA_STRING}, }; -static struct genl_ops cifs_genl_ops[] = { +static const struct genl_ops cifs_genl_ops[] = { { .cmd = CIFS_GENL_CMD_SWN_NOTIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 3079b38f0afb..378133ce8869 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/ntlmssp.h * * Copyright (c) International Business Machines Corp., 2002,2007 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define NTLMSSP_SIGNATURE "NTLMSSP" diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 63bfc533c9fb..bfee176b901d 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/readdir.c * @@ -7,19 +8,6 @@ * Copyright (C) Red Hat, Inc., 2011 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/pagemap.h> @@ -321,7 +309,7 @@ static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr, { __dir_info_to_fattr(fattr, info); - /* See MS-FSCC 2.4.18 FileIdFullDirectoryInformation */ + /* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */ if (fattr->cf_cifsattrs & ATTR_REPARSE) fattr->cf_cifstag = le32_to_cpu(info->EaSize); cifs_fill_common_info(fattr, cifs_sb); diff --git a/fs/cifs/rfc1002pdu.h b/fs/cifs/rfc1002pdu.h index 8b69fcceb597..137f7c95afd6 100644 --- a/fs/cifs/rfc1002pdu.h +++ b/fs/cifs/rfc1002pdu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/rfc1002pdu.h * @@ -6,19 +7,6 @@ * Copyright (c) International Business Machines Corp., 2004 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* NB: unlike smb/cifs packets, the RFC1002 structures are big endian */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index a92a1fb7cb52..c5785fd3f52e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/sess.c * @@ -6,19 +7,6 @@ * Copyright (c) International Business Machines Corp., 2006, 2009 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "cifspdu.h" @@ -195,7 +183,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, ses, iface->speed, iface->rdma_capable ? "yes" : "no", &ipv4->sin_addr); else - cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI4)\n", + cifs_dbg(FYI, "adding channel to ses %p (speed:%zu bps rdma:%s ip:%pI6)\n", ses, iface->speed, iface->rdma_capable ? "yes" : "no", &ipv6->sin6_addr); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 2fa3ba354cc9..c9d8a50062b8 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2file.c * @@ -5,19 +6,6 @@ * Author(s): Steve French (sfrench@us.ibm.com), * Pavel Shilovsky ((pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index d9a990c99121..d0e9f3782bd9 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smb2glob.h * @@ -9,16 +10,6 @@ * Jeremy Allison (jra@samba.org) * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * */ #ifndef _SMB2_GLOB_H #define _SMB2_GLOB_H diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 9a61209a283e..957b2594f02e 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2inode.c * @@ -6,19 +7,6 @@ * Author(s): Pavel Shilovsky (pshilovsky@samba.org), * Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> #include <linux/stat.h> diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index c775682ee973..cea39bcecbab 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/smb2/smb2maperror.c * @@ -6,19 +7,6 @@ * Copyright (C) International Business Machines Corp., 2009 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/errno.h> #include "cifsglob.h" diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 06d555d4da9a..668f77108831 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2misc.c * @@ -6,19 +7,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/ctype.h> #include "smb2pdu.h" @@ -164,19 +152,16 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) struct smb2_transform_hdr *thdr = (struct smb2_transform_hdr *)buf; struct cifs_ses *ses = NULL; - struct list_head *tmp; /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &srvr->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) { if (ses->Suid == thdr->SessionId) break; - - ses = NULL; } spin_unlock(&cifs_tcp_ses_lock); - if (ses == NULL) { + if (list_entry_is_head(ses, &srvr->smb_ses_list, + smb_ses_list)) { cifs_dbg(VFS, "no decryption - session id not found\n"); return 1; } @@ -548,7 +533,6 @@ static bool smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp) { __u8 lease_state; - struct list_head *tmp; struct cifsFileInfo *cfile; struct cifsInodeInfo *cinode; int ack_req = le32_to_cpu(rsp->Flags & @@ -556,8 +540,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp) lease_state = le32_to_cpu(rsp->NewLeaseState); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, tlist); + list_for_each_entry(cfile, &tcon->openFileList, tlist) { cinode = CIFS_I(d_inode(cfile->dentry)); if (memcmp(cinode->lease_key, rsp->LeaseKey, @@ -618,7 +601,6 @@ static bool smb2_is_valid_lease_break(char *buffer) { struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer; - struct list_head *tmp, *tmp1, *tmp2; struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -628,15 +610,9 @@ smb2_is_valid_lease_break(char *buffer) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &cifs_tcp_ses_list) { - server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list); - - list_for_each(tmp1, &server->smb_ses_list) { - ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); - - list_for_each(tmp2, &ses->tcon_list) { - tcon = list_entry(tmp2, struct cifs_tcon, - tcon_list); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); cifs_stats_inc( &tcon->stats.cifs_stats.num_oplock_brks); @@ -687,7 +663,6 @@ bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) { struct smb2_oplock_break *rsp = (struct smb2_oplock_break *)buffer; - struct list_head *tmp, *tmp1, *tmp2; struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsInodeInfo *cinode; @@ -710,16 +685,11 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) /* look up tcon based on tid & uid */ spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - - list_for_each(tmp1, &ses->tcon_list) { - tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { spin_lock(&tcon->open_file_lock); - list_for_each(tmp2, &tcon->openFileList) { - cfile = list_entry(tmp2, struct cifsFileInfo, - tlist); + list_for_each_entry(cfile, &tcon->openFileList, tlist) { if (rsp->PersistentFid != cfile->fid.persistent_fid || rsp->VolatileFid != diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 21ef51d338e0..e4c8f603dd58 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -388,7 +388,9 @@ smb2_negotiate(const unsigned int xid, struct cifs_ses *ses) { int rc; + spin_lock(&GlobalMid_Lock); cifs_ses_server(ses)->CurrentMid = 0; + spin_unlock(&GlobalMid_Lock); rc = SMB2_negotiate(xid, ses); /* BB we probably don't need to retry with modern servers */ if (rc == -EAGAIN) @@ -2325,6 +2327,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct smb2_query_directory_rsp *qd_rsp = NULL; struct smb2_create_rsp *op_rsp = NULL; struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses); + int retry_count = 0; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) @@ -2372,10 +2375,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, smb2_set_related(&rqst[1]); +again: rc = compound_send_recv(xid, tcon->ses, server, flags, 2, rqst, resp_buftype, rsp_iov); + if (rc == -EAGAIN && retry_count++ < 10) + goto again; + /* If the open failed there is nothing to do */ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) { @@ -3601,6 +3608,119 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, return rc; } +static int smb3_simple_fallocate_write_range(unsigned int xid, + struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, + loff_t off, loff_t len, + char *buf) +{ + struct cifs_io_parms io_parms = {0}; + int nbytes; + struct kvec iov[2]; + + io_parms.netfid = cfile->fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.persistent_fid = cfile->fid.persistent_fid; + io_parms.volatile_fid = cfile->fid.volatile_fid; + io_parms.offset = off; + io_parms.length = len; + + /* iov[0] is reserved for smb header */ + iov[1].iov_base = buf; + iov[1].iov_len = io_parms.length; + return SMB2_write(xid, &io_parms, &nbytes, iov, 1); +} + +static int smb3_simple_fallocate_range(unsigned int xid, + struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, + loff_t off, loff_t len) +{ + struct file_allocated_range_buffer in_data, *out_data = NULL, *tmp_data; + u32 out_data_len; + char *buf = NULL; + loff_t l; + int rc; + + in_data.file_offset = cpu_to_le64(off); + in_data.length = cpu_to_le64(len); + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_QUERY_ALLOCATED_RANGES, true, + (char *)&in_data, sizeof(in_data), + 1024 * sizeof(struct file_allocated_range_buffer), + (char **)&out_data, &out_data_len); + if (rc) + goto out; + /* + * It is already all allocated + */ + if (out_data_len == 0) + goto out; + + buf = kzalloc(1024 * 1024, GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + tmp_data = out_data; + while (len) { + /* + * The rest of the region is unmapped so write it all. + */ + if (out_data_len == 0) { + rc = smb3_simple_fallocate_write_range(xid, tcon, + cfile, off, len, buf); + goto out; + } + + if (out_data_len < sizeof(struct file_allocated_range_buffer)) { + rc = -EINVAL; + goto out; + } + + if (off < le64_to_cpu(tmp_data->file_offset)) { + /* + * We are at a hole. Write until the end of the region + * or until the next allocated data, + * whichever comes next. + */ + l = le64_to_cpu(tmp_data->file_offset) - off; + if (len < l) + l = len; + rc = smb3_simple_fallocate_write_range(xid, tcon, + cfile, off, l, buf); + if (rc) + goto out; + off = off + l; + len = len - l; + if (len == 0) + goto out; + } + /* + * We are at a section of allocated data, just skip forward + * until the end of the data or the end of the region + * we are supposed to fallocate, whichever comes first. + */ + l = le64_to_cpu(tmp_data->length); + if (len < l) + l = len; + off += l; + len -= l; + + tmp_data = &tmp_data[1]; + out_data_len -= sizeof(struct file_allocated_range_buffer); + } + + out: + kfree(out_data); + kfree(buf); + return rc; +} + + static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, loff_t off, loff_t len, bool keep_size) { @@ -3662,6 +3782,26 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, if ((keep_size == true) || (i_size_read(inode) >= off + len)) { /* + * At this point, we are trying to fallocate an internal + * regions of a sparse file. Since smb2 does not have a + * fallocate command we have two otions on how to emulate this. + * We can either turn the entire file to become non-sparse + * which we only do if the fallocate is for virtually + * the whole file, or we can overwrite the region with zeroes + * using SMB2_write, which could be prohibitevly expensive + * if len is large. + */ + /* + * We are only trying to fallocate a small region so + * just write it with zero. + */ + if (len <= 1024 * 1024) { + rc = smb3_simple_fallocate_range(xid, tcon, cfile, + off, len); + goto out; + } + + /* * Check if falloc starts within first few pages of file * and ends within a few pages of the end of file to * ensure that most of file is being forced to be diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index c205f93e0a10..962826dc3316 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2pdu.c * @@ -8,19 +9,6 @@ * * Contains the routines for constructing the SMB2 PDUs themselves * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* SMB2 PDU handling routines here - except for leftovers (eg session setup) */ @@ -1791,10 +1779,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; trace_smb3_tcon(xid, tcon->tid, ses->Suid, tree, rc); if (rc != 0) { - if (tcon) { - cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE); - tcon->need_reconnect = true; - } + cifs_stats_fail_inc(tcon, SMB2_TREE_CONNECT_HE); + tcon->need_reconnect = true; goto tcon_error_exit; } @@ -2906,7 +2892,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, #endif /* CIFS_DEBUG2 */ if (buf) { - memcpy(buf, &rsp->CreationTime, 32); + buf->CreationTime = rsp->CreationTime; + buf->LastAccessTime = rsp->LastAccessTime; + buf->LastWriteTime = rsp->LastWriteTime; + buf->ChangeTime = rsp->ChangeTime; buf->AllocationSize = rsp->AllocationSize; buf->EndOfFile = rsp->EndofFile; buf->Attributes = rsp->FileAttributes; @@ -3484,6 +3473,8 @@ int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, NULL); } +#if 0 +/* currently unused, as now we are doing compounding instead (see smb311_posix_query_path_info) */ int SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct smb311_posix_qinfo *data, u32 *plen) @@ -3495,7 +3486,9 @@ SMB311_posix_query_info(const unsigned int xid, struct cifs_tcon *tcon, return query_info(xid, tcon, persistent_fid, volatile_fid, SMB_FIND_FILE_POSIX_INFO, SMB2_O_INFO_FILE, 0, output_len, sizeof(struct smb311_posix_qinfo), (void **)&data, plen); + /* Note caller must free "data" (passed in above). It may be allocated in query_info call */ } +#endif int SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, @@ -4498,7 +4491,7 @@ int posix_info_parse(const void *beg, const void *end, { int total_len = 0; - int sid_len; + int owner_len, group_len; int name_len; const void *owner_sid; const void *group_sid; @@ -4521,17 +4514,17 @@ int posix_info_parse(const void *beg, const void *end, /* check owner sid */ owner_sid = beg + total_len; - sid_len = posix_info_sid_size(owner_sid, end); - if (sid_len < 0) + owner_len = posix_info_sid_size(owner_sid, end); + if (owner_len < 0) return -1; - total_len += sid_len; + total_len += owner_len; /* check group sid */ group_sid = beg + total_len; - sid_len = posix_info_sid_size(group_sid, end); - if (sid_len < 0) + group_len = posix_info_sid_size(group_sid, end); + if (group_len < 0) return -1; - total_len += sid_len; + total_len += group_len; /* check name len */ if (beg + total_len + 4 > end) @@ -4552,10 +4545,8 @@ int posix_info_parse(const void *beg, const void *end, out->size = total_len; out->name_len = name_len; out->name = name; - memcpy(&out->owner, owner_sid, - posix_info_sid_size(owner_sid, end)); - memcpy(&out->group, group_sid, - posix_info_sid_size(group_sid, end)); + memcpy(&out->owner, owner_sid, owner_len); + memcpy(&out->group, group_sid, group_len); } return total_len; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 6442dc1c292b..a5c48b85549a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smb2pdu.h * @@ -6,19 +7,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _SMB2PDU_H @@ -276,7 +264,7 @@ struct share_redirect_error_context_rsp { __le32 NotificationType; __le32 ResourceNameOffset; __le32 ResourceNameLength; - __le16 Flags; + __le16 Reserved; __le16 TargetType; __le32 IPAddrCount; struct move_dst_ipaddr IpAddrMoveList[]; @@ -1460,6 +1448,22 @@ struct smb2_echo_rsp { #define SMB2_QUERY_DIRECTORY_IOV_SIZE 2 +/* + * Valid FileInformation classes. + * + * Note that these are a subset of the (file) QUERY_INFO levels defined + * later in this file (but since QUERY_DIRECTORY uses equivalent numbers + * we do not redefine them here) + * + * FileDirectoryInfomation 0x01 + * FileFullDirectoryInformation 0x02 + * FileIdFullDirectoryInformation 0x26 + * FileBothDirectoryInformation 0x03 + * FileIdBothDirectoryInformation 0x25 + * FileNamesInformation 0x0C + * FileIdExtdDirectoryInformation 0x3C + */ + struct smb2_query_directory_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ @@ -1696,6 +1700,7 @@ struct smb3_fs_vol_info { #define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 #define FILE_STANDARD_LINK_INFORMATION 54 #define FILE_ID_INFORMATION 59 +#define FILE_ID_EXTD_DIRECTORY_INFORMATION 60 struct smb2_file_internal_info { __le64 IndexNumber; @@ -1776,13 +1781,31 @@ struct smb2_file_network_open_info { __le32 Reserved; } __packed; /* level 34 Query also similar returned in close rsp and open rsp */ -/* See MS-FSCC 2.4.43 */ +/* See MS-FSCC 2.4.21 */ struct smb2_file_id_information { __le64 VolumeSerialNumber; __u64 PersistentFileId; /* opaque endianness */ __u64 VolatileFileId; /* opaque endianness */ } __packed; /* level 59 */ +/* See MS-FSCC 2.4.18 */ +struct smb2_file_id_extd_directory_info { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 FileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* EA size */ + __le32 ReparsePointTag; /* valid if FILE_ATTR_REPARSE_POINT set in FileAttributes */ + __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit */ + char FileName[1]; +} __packed; /* level 60 */ + extern char smb2_padding[7]; /* equivalent of the contents of SMB3.1.1 POSIX open context response */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index a5f87b02cfaf..263767f644f8 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smb2proto.h * @@ -6,19 +7,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _SMB2PROTO_H #define _SMB2PROTO_H @@ -64,8 +52,6 @@ extern void smb2_echo_request(struct work_struct *work); extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); extern bool smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv); -extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, - __u64 ses_id); extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h index 7505056e9580..0215ef36e240 100644 --- a/fs/cifs/smb2status.h +++ b/fs/cifs/smb2status.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smb2status.h * @@ -7,19 +8,6 @@ * Copyright (c) International Business Machines Corp., 2009,2011 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index e6fa76ab70be..6f7952ea4941 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/smb2transport.c * @@ -7,19 +8,6 @@ * Jeremy Allison (jra@samba.org) 2006 * Pavel Shilovsky (pshilovsky@samba.org) 2012 * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> @@ -154,6 +142,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { if (ses->Suid != ses_id) continue; + ++ses->ses_count; return ses; } @@ -205,7 +194,14 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid) return NULL; } tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid); + if (!tcon) { + cifs_put_smb_ses(ses); + spin_unlock(&cifs_tcp_ses_lock); + return NULL; + } spin_unlock(&cifs_tcp_ses_lock); + /* tcon already has a ref to ses, so we don't need ses anymore */ + cifs_put_smb_ses(ses); return tcon; } @@ -239,7 +235,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, if (rc) { cifs_server_dbg(VFS, "%s: sha256 alloc failed\n", __func__); - return rc; + goto out; } shash = &sdesc->shash; } else { @@ -290,6 +286,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, out: if (allocate_crypto) cifs_free_hash(&hash, &sdesc); + if (ses) + cifs_put_smb_ses(ses); return rc; } diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 10dfe5006792..31ef64eb7fbb 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -572,8 +572,13 @@ static struct rdma_cm_id *smbd_create_id( log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); goto out; } - wait_for_completion_interruptible_timeout( + rc = wait_for_completion_interruptible_timeout( &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + /* e.g. if interrupted returns -ERESTARTSYS */ + if (rc < 0) { + log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); + goto out; + } rc = info->ri_rc; if (rc) { log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); @@ -586,8 +591,13 @@ static struct rdma_cm_id *smbd_create_id( log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); goto out; } - wait_for_completion_interruptible_timeout( + rc = wait_for_completion_interruptible_timeout( &info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT)); + /* e.g. if interrupted returns -ERESTARTSYS */ + if (rc < 0) { + log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); + goto out; + } rc = info->ri_rc; if (rc) { log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h index 7f16cb825fe5..60189efb3236 100644 --- a/fs/cifs/smberr.h +++ b/fs/cifs/smberr.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smberr.h * @@ -7,19 +8,6 @@ * See Error Codes section of the SNIA CIFS Specification * for more information * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define SUCCESS 0x00 /* The request was successful. */ diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index a0e84747f567..d0fc42061f49 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -1,22 +1,10 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions * * Copyright (c) International Business Machines Corp., 2002,2013 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* IOCTL information */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index c1725b55f364..f65f9a692ca2 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/transport.c * @@ -5,19 +6,6 @@ * Author(s): Steve French (sfrench@us.ibm.com) * Jeremy Allison (jra@samba.org) 2006. * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index aa3e8ca0457c..9ed481e79ce0 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * fs/cifs/xattr.c * * Copyright (c) International Business Machines Corp., 2003, 2007 * Author(s): Steve French (sfrench@us.ibm.com) * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6ca7d16593ff..d00455440d08 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -344,13 +344,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, offsetof(struct fscrypt_nokey_name, sha256)); BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX); - if (hash) { - nokey_name.dirhash[0] = hash; - nokey_name.dirhash[1] = minor_hash; - } else { - nokey_name.dirhash[0] = 0; - nokey_name.dirhash[1] = 0; - } + nokey_name.dirhash[0] = hash; + nokey_name.dirhash[1] = minor_hash; + if (iname->len <= sizeof(nokey_name.bytes)) { memcpy(nokey_name.bytes, iname->name, iname->len); size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]); diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 261293fb7097..bca9c6658a7c 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -210,15 +210,40 @@ out_unlock: return err; } +/* + * Derive a SipHash key from the given fscrypt master key and the given + * application-specific information string. + * + * Note that the KDF produces a byte array, but the SipHash APIs expect the key + * as a pair of 64-bit words. Therefore, on big endian CPUs we have to do an + * endianness swap in order to get the same results as on little endian CPUs. + */ +static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, + u8 context, const u8 *info, + unsigned int infolen, siphash_key_t *key) +{ + int err; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, context, info, infolen, + (u8 *)key, sizeof(*key)); + if (err) + return err; + + BUILD_BUG_ON(sizeof(*key) != 16); + BUILD_BUG_ON(ARRAY_SIZE(key->key) != 2); + le64_to_cpus(&key->key[0]); + le64_to_cpus(&key->key[1]); + return 0; +} + int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, const struct fscrypt_master_key *mk) { int err; - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY, - ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, - (u8 *)&ci->ci_dirhash_key, - sizeof(ci->ci_dirhash_key)); + err = fscrypt_derive_siphash_key(mk, HKDF_CONTEXT_DIRHASH_KEY, + ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE, + &ci->ci_dirhash_key); if (err) return err; ci->ci_dirhash_key_initialized = true; @@ -253,10 +278,9 @@ static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, if (mk->mk_ino_hash_key_initialized) goto unlock; - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, - (u8 *)&mk->mk_ino_hash_key, - sizeof(mk->mk_ino_hash_key)); + err = fscrypt_derive_siphash_key(mk, + HKDF_CONTEXT_INODE_HASH_KEY, + NULL, 0, &mk->mk_ino_hash_key); if (err) goto unlock; /* pairs with smp_load_acquire() above */ diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 88d95d96e36c..42eee2783756 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -20,6 +20,7 @@ #include <net/sock.h> #include "config.h" +#include "midcomms.h" #include "lowcomms.h" /* @@ -79,6 +80,9 @@ struct dlm_cluster { unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; + + struct dlm_spaces *sps; + struct dlm_comms *cms; }; static struct dlm_cluster *config_item_to_cluster(struct config_item *i) @@ -204,7 +208,7 @@ static int dlm_check_zero(unsigned int x) static int dlm_check_buffer_size(unsigned int x) { - if (x < DEFAULT_BUFFER_SIZE) + if (x < DLM_MAX_SOCKET_BUFSIZE) return -EINVAL; return 0; @@ -409,6 +413,9 @@ static struct config_group *make_cluster(struct config_group *g, if (!cl || !sps || !cms) goto fail; + cl->sps = sps; + cl->cms = cms; + config_group_init_type_name(&cl->group, name, &cluster_type); config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); config_group_init_type_name(&cms->cs_group, "comms", &comms_type); @@ -458,6 +465,9 @@ static void drop_cluster(struct config_group *g, struct config_item *i) static void release_cluster(struct config_item *i) { struct dlm_cluster *cl = config_item_to_cluster(i); + + kfree(cl->sps); + kfree(cl->cms); kfree(cl); } @@ -532,7 +542,7 @@ static void drop_comm(struct config_group *g, struct config_item *i) struct dlm_comm *cm = config_item_to_comm(i); if (local_comm == cm) local_comm = NULL; - dlm_lowcomms_close(cm->nodeid); + dlm_midcomms_close(cm->nodeid); while (cm->addr_count--) kfree(cm->addr[cm->addr_count]); config_item_put(i); @@ -942,7 +952,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_SCAN_SECS 5 #define DEFAULT_LOG_DEBUG 0 #define DEFAULT_LOG_INFO 1 -#define DEFAULT_PROTOCOL 0 +#define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 @@ -952,7 +962,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, - .ci_buffer_size = DEFAULT_BUFFER_SIZE, + .ci_buffer_size = DLM_MAX_SOCKET_BUFSIZE, .ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE, .ci_recover_timer = DEFAULT_RECOVER_TIMER, .ci_toss_secs = DEFAULT_TOSS_SECS, diff --git a/fs/dlm/config.h b/fs/dlm/config.h index d2cd4bd20313..df92b0a07fc6 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -12,7 +12,7 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ -#define DEFAULT_BUFFER_SIZE 4096 +#define DLM_MAX_SOCKET_BUFSIZE 4096 struct dlm_config_node { int nodeid; @@ -23,6 +23,9 @@ struct dlm_config_node { #define DLM_MAX_ADDR_COUNT 3 +#define DLM_PROTO_TCP 0 +#define DLM_PROTO_SCTP 1 + struct dlm_config_info { int ci_tcp_port; int ci_buffer_size; diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index d5bd990bcab8..47e9d57e4cae 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include "dlm_internal.h" +#include "midcomms.h" #include "lock.h" #define DLM_DEBUG_BUF_LEN 4096 @@ -23,6 +24,7 @@ static char debug_buf[DLM_DEBUG_BUF_LEN]; static struct mutex debug_buf_lock; static struct dentry *dlm_root; +static struct dentry *dlm_comms; static char *print_lockmode(int mode) { @@ -738,6 +740,57 @@ void dlm_delete_debug_file(struct dlm_ls *ls) debugfs_remove(ls->ls_debug_toss_dentry); } +static int dlm_state_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%s\n", dlm_midcomms_state(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_state); + +static int dlm_flags_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%lu\n", dlm_midcomms_flags(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_flags); + +static int dlm_send_queue_cnt_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "%d\n", dlm_midcomms_send_queue_cnt(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_send_queue_cnt); + +static int dlm_version_show(struct seq_file *file, void *offset) +{ + seq_printf(file, "0x%08x\n", dlm_midcomms_version(file->private)); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(dlm_version); + +void *dlm_create_debug_comms_file(int nodeid, void *data) +{ + struct dentry *d_node; + char name[256]; + + memset(name, 0, sizeof(name)); + snprintf(name, 256, "%d", nodeid); + + d_node = debugfs_create_dir(name, dlm_comms); + debugfs_create_file("state", 0444, d_node, data, &dlm_state_fops); + debugfs_create_file("flags", 0444, d_node, data, &dlm_flags_fops); + debugfs_create_file("send_queue_count", 0444, d_node, data, + &dlm_send_queue_cnt_fops); + debugfs_create_file("version", 0444, d_node, data, &dlm_version_fops); + + return d_node; +} + +void dlm_delete_debug_comms_file(void *ctx) +{ + debugfs_remove(ctx); +} + void dlm_create_debug_file(struct dlm_ls *ls) { char name[DLM_LOCKSPACE_LEN + 8]; @@ -797,6 +850,7 @@ void __init dlm_register_debugfs(void) { mutex_init(&debug_buf_lock); dlm_root = debugfs_create_dir("dlm", NULL); + dlm_comms = debugfs_create_dir("comms", dlm_root); } void dlm_unregister_debugfs(void) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 04fe9f525ac7..91d1ca3a121a 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -57,9 +57,12 @@ struct dlm_header; struct dlm_message; struct dlm_rcom; struct dlm_mhandle; +struct dlm_msg; #define log_print(fmt, args...) \ printk(KERN_ERR "dlm: "fmt"\n" , ##args) +#define log_print_ratelimited(fmt, args...) \ + printk_ratelimited(KERN_ERR "dlm: "fmt"\n", ##args) #define log_error(ls, fmt, args...) \ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) @@ -368,23 +371,33 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag) /* dlm_header is first element of all structs sent between nodes */ #define DLM_HEADER_MAJOR 0x00030000 -#define DLM_HEADER_MINOR 0x00000001 +#define DLM_HEADER_MINOR 0x00000002 + +#define DLM_VERSION_3_1 0x00030001 +#define DLM_VERSION_3_2 0x00030002 #define DLM_HEADER_SLOTS 0x00000001 #define DLM_MSG 1 #define DLM_RCOM 2 +#define DLM_OPTS 3 +#define DLM_ACK 4 +#define DLM_FIN 5 struct dlm_header { uint32_t h_version; - uint32_t h_lockspace; + union { + /* for DLM_MSG and DLM_RCOM */ + uint32_t h_lockspace; + /* for DLM_ACK and DLM_OPTS */ + uint32_t h_seq; + } u; uint32_t h_nodeid; /* nodeid of sender */ uint16_t h_length; uint8_t h_cmd; /* DLM_MSG, DLM_RCOM */ uint8_t h_pad; }; - #define DLM_MSG_REQUEST 1 #define DLM_MSG_CONVERT 2 #define DLM_MSG_UNLOCK 3 @@ -452,10 +465,29 @@ struct dlm_rcom { char rc_buf[]; }; +struct dlm_opt_header { + uint16_t t_type; + uint16_t t_length; + uint32_t o_pad; + /* need to be 8 byte aligned */ + char t_value[]; +}; + +/* encapsulation header */ +struct dlm_opts { + struct dlm_header o_header; + uint8_t o_nextcmd; + uint8_t o_pad; + uint16_t o_optlen; + uint32_t o_pad2; + char o_opts[]; +}; + union dlm_packet { struct dlm_header header; /* common to other two */ struct dlm_message message; struct dlm_rcom rcom; + struct dlm_opts opts; }; #define DLM_RSF_NEED_SLOTS 0x00000001 @@ -722,11 +754,15 @@ void dlm_register_debugfs(void); void dlm_unregister_debugfs(void); void dlm_create_debug_file(struct dlm_ls *ls); void dlm_delete_debug_file(struct dlm_ls *ls); +void *dlm_create_debug_comms_file(int nodeid, void *data); +void dlm_delete_debug_comms_file(void *ctx); #else static inline void dlm_register_debugfs(void) { } static inline void dlm_unregister_debugfs(void) { } static inline void dlm_create_debug_file(struct dlm_ls *ls) { } static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } +static inline void *dlm_create_debug_comms_file(int nodeid, void *data) { return NULL; } +static inline void dlm_delete_debug_comms_file(void *ctx) { } #endif #endif /* __DLM_INTERNAL_DOT_H__ */ diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index b93df39d0915..c502c065d007 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -59,7 +59,7 @@ #include "dlm_internal.h" #include <linux/dlm_device.h> #include "memory.h" -#include "lowcomms.h" +#include "midcomms.h" #include "requestqueue.h" #include "util.h" #include "dir.h" @@ -3534,17 +3534,17 @@ static int _create_message(struct dlm_ls *ls, int mb_len, char *mb; /* get_buffer gives us a message handle (mh) that we need to - pass into lowcomms_commit and a message buffer (mb) that we + pass into midcomms_commit and a message buffer (mb) that we write our data into */ - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; ms = (struct dlm_message *) mb; ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - ms->m_header.h_lockspace = ls->ls_global_id; + ms->m_header.u.h_lockspace = ls->ls_global_id; ms->m_header.h_nodeid = dlm_our_nodeid(); ms->m_header.h_length = mb_len; ms->m_header.h_cmd = DLM_MSG; @@ -3589,7 +3589,7 @@ static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb, static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms) { dlm_message_out(ms); - dlm_lowcomms_commit_buffer(mh); + dlm_midcomms_commit_mhandle(mh); return 0; } @@ -5038,16 +5038,16 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) if (hd->h_nodeid != nodeid) { log_print("invalid h_nodeid %d from %d lockspace %x", - hd->h_nodeid, nodeid, hd->h_lockspace); + hd->h_nodeid, nodeid, hd->u.h_lockspace); return; } - ls = dlm_find_lockspace_global(hd->h_lockspace); + ls = dlm_find_lockspace_global(hd->u.h_lockspace); if (!ls) { if (dlm_config.ci_log_debug) { printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace " "%u from %d cmd %d type %d\n", - hd->h_lockspace, nodeid, hd->h_cmd, type); + hd->u.h_lockspace, nodeid, hd->h_cmd, type); } if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index c14cf2b7faab..d71aba8c3e64 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -16,6 +16,7 @@ #include "member.h" #include "recoverd.h" #include "dir.h" +#include "midcomms.h" #include "lowcomms.h" #include "config.h" #include "memory.h" @@ -390,7 +391,7 @@ static int threads_start(void) } /* Thread for sending/receiving messages for all lockspace's */ - error = dlm_lowcomms_start(); + error = dlm_midcomms_start(); if (error) { log_print("cannot start dlm lowcomms %d", error); goto scand_fail; @@ -566,7 +567,12 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_requestqueue_mutex); mutex_init(&ls->ls_clear_proc_locks); - ls->ls_recover_buf = kmalloc(LOWCOMMS_MAX_TX_BUFFER_LEN, GFP_NOFS); + /* Due backwards compatibility with 3.1 we need to use maximum + * possible dlm message size to be sure the message will fit and + * not having out of bounds issues. However on sending side 3.2 + * might send less. + */ + ls->ls_recover_buf = kmalloc(DLM_MAX_SOCKET_BUFSIZE, GFP_NOFS); if (!ls->ls_recover_buf) goto out_lkbidr; @@ -698,7 +704,7 @@ int dlm_new_lockspace(const char *name, const char *cluster, error = 0; if (!ls_count) { dlm_scand_stop(); - dlm_lowcomms_shutdown(); + dlm_midcomms_shutdown(); dlm_lowcomms_stop(); } out: @@ -787,7 +793,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) if (ls_count == 1) { dlm_scand_stop(); - dlm_lowcomms_shutdown(); + dlm_midcomms_shutdown(); } dlm_callback_stop(ls); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 166e36fcf3e4..0ea9ae35da0b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -59,7 +59,6 @@ #include "config.h" #define NEEDED_RMEM (4*1024*1024) -#define CONN_HASH_SIZE 32 /* Number of messages to send before rescheduling */ #define MAX_SEND_MSG_COUNT 25 @@ -79,14 +78,20 @@ struct connection { #define CF_CLOSING 8 #define CF_SHUTDOWN 9 #define CF_CONNECTED 10 +#define CF_RECONNECT 11 +#define CF_DELAY_CONNECT 12 +#define CF_EOF 13 struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; + atomic_t writequeue_cnt; void (*connect_action) (struct connection *); /* What to do to connect */ void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ + bool (*eof_condition)(struct connection *con); /* What to do to eof check */ int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; struct connection *othercon; + struct connection *sendcon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ @@ -113,7 +118,22 @@ struct writequeue_entry { int len; int end; int users; + bool dirty; struct connection *con; + struct list_head msgs; + struct kref ref; +}; + +struct dlm_msg { + struct writequeue_entry *entry; + struct dlm_msg *orig_msg; + bool retransmit; + void *ppc; + int len; + int idx; /* new()/commit() idx exchange */ + + struct list_head list; + struct kref ref; }; struct dlm_node_addr { @@ -155,33 +175,23 @@ static void sctp_connect_to_sock(struct connection *con); static void tcp_connect_to_sock(struct connection *con); static void dlm_tcp_shutdown(struct connection *con); -/* This is deliberately very simple because most clusters have simple - sequential nodeids, so we should be able to go straight to a connection - struct in the array */ -static inline int nodeid_hash(int nodeid) +static struct connection *__find_con(int nodeid, int r) { - return nodeid & (CONN_HASH_SIZE-1); -} - -static struct connection *__find_con(int nodeid) -{ - int r, idx; struct connection *con; - r = nodeid_hash(nodeid); - - idx = srcu_read_lock(&connections_srcu); hlist_for_each_entry_rcu(con, &connection_hash[r], list) { - if (con->nodeid == nodeid) { - srcu_read_unlock(&connections_srcu, idx); + if (con->nodeid == nodeid) return con; - } } - srcu_read_unlock(&connections_srcu, idx); return NULL; } +static bool tcp_eof_condition(struct connection *con) +{ + return atomic_read(&con->writequeue_cnt); +} + static int dlm_con_init(struct connection *con, int nodeid) { con->rx_buflen = dlm_config.ci_buffer_size; @@ -193,15 +203,23 @@ static int dlm_con_init(struct connection *con, int nodeid) mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); spin_lock_init(&con->writequeue_lock); + atomic_set(&con->writequeue_cnt, 0); INIT_WORK(&con->swork, process_send_sockets); INIT_WORK(&con->rwork, process_recv_sockets); init_waitqueue_head(&con->shutdown_wait); - if (dlm_config.ci_protocol == 0) { + switch (dlm_config.ci_protocol) { + case DLM_PROTO_TCP: con->connect_action = tcp_connect_to_sock; con->shutdown_action = dlm_tcp_shutdown; - } else { + con->eof_condition = tcp_eof_condition; + break; + case DLM_PROTO_SCTP: con->connect_action = sctp_connect_to_sock; + break; + default: + kfree(con->rx_buf); + return -EINVAL; } return 0; @@ -216,7 +234,8 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) struct connection *con, *tmp; int r, ret; - con = __find_con(nodeid); + r = nodeid_hash(nodeid); + con = __find_con(nodeid, r); if (con || !alloc) return con; @@ -230,8 +249,6 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return NULL; } - r = nodeid_hash(nodeid); - spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. Instead of locking hot path __find_con() @@ -239,7 +256,7 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) * under protection of connections_lock. If this is the case we * abort our connection creation and return the existing connection. */ - tmp = __find_con(nodeid); + tmp = __find_con(nodeid, r); if (tmp) { spin_unlock(&connections_lock); kfree(con->rx_buf); @@ -256,15 +273,13 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) /* Loop round all connections */ static void foreach_conn(void (*conn_func)(struct connection *c)) { - int i, idx; + int i; struct connection *con; - idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) conn_func(con); } - srcu_read_unlock(&connections_srcu, idx); } static struct dlm_node_addr *find_node_addr(int nodeid) @@ -462,6 +477,9 @@ static void lowcomms_data_ready(struct sock *sk) static void lowcomms_listen_data_ready(struct sock *sk) { + if (!dlm_allow_conn) + return; + queue_work(recv_workqueue, &listen_con.rwork); } @@ -518,14 +536,21 @@ static void lowcomms_state_change(struct sock *sk) int dlm_lowcomms_connect_node(int nodeid) { struct connection *con; + int idx; if (nodeid == dlm_our_nodeid()) return 0; + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, GFP_NOFS); - if (!con) + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return -ENOMEM; + } + lowcomms_connect_sock(con); + srcu_read_unlock(&connections_srcu, idx); + return 0; } @@ -587,6 +612,22 @@ static void lowcomms_error_report(struct sock *sk) dlm_config.ci_tcp_port, sk->sk_err, sk->sk_err_soft); } + + /* below sendcon only handling */ + if (test_bit(CF_IS_OTHERCON, &con->flags)) + con = con->sendcon; + + switch (sk->sk_err) { + case ECONNREFUSED: + set_bit(CF_DELAY_CONNECT, &con->flags); + break; + default: + break; + } + + if (!test_and_set_bit(CF_RECONNECT, &con->flags)) + queue_work(send_workqueue, &con->swork); + out: read_unlock_bh(&sk->sk_callback_lock); if (orig_report) @@ -669,6 +710,42 @@ static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, memset((char *)saddr + *addr_len, 0, sizeof(struct sockaddr_storage) - *addr_len); } +static void dlm_page_release(struct kref *kref) +{ + struct writequeue_entry *e = container_of(kref, struct writequeue_entry, + ref); + + __free_page(e->page); + kfree(e); +} + +static void dlm_msg_release(struct kref *kref) +{ + struct dlm_msg *msg = container_of(kref, struct dlm_msg, ref); + + kref_put(&msg->entry->ref, dlm_page_release); + kfree(msg); +} + +static void free_entry(struct writequeue_entry *e) +{ + struct dlm_msg *msg, *tmp; + + list_for_each_entry_safe(msg, tmp, &e->msgs, list) { + if (msg->orig_msg) { + msg->orig_msg->retransmit = false; + kref_put(&msg->orig_msg->ref, dlm_msg_release); + } + + list_del(&msg->list); + kref_put(&msg->ref, dlm_msg_release); + } + + list_del(&e->list); + atomic_dec(&e->con->writequeue_cnt); + kref_put(&e->ref, dlm_page_release); +} + static void dlm_close_sock(struct socket **sock) { if (*sock) { @@ -683,6 +760,7 @@ static void close_connection(struct connection *con, bool and_other, bool tx, bool rx) { bool closing = test_and_set_bit(CF_CLOSING, &con->flags); + struct writequeue_entry *e; if (tx && !closing && cancel_work_sync(&con->swork)) { log_print("canceled swork for node %d", con->nodeid); @@ -698,12 +776,35 @@ static void close_connection(struct connection *con, bool and_other, if (con->othercon && and_other) { /* Will only re-enter once. */ - close_connection(con->othercon, false, true, true); + close_connection(con->othercon, false, tx, rx); + } + + /* if we send a writequeue entry only a half way, we drop the + * whole entry because reconnection and that we not start of the + * middle of a msg which will confuse the other end. + * + * we can always drop messages because retransmits, but what we + * cannot allow is to transmit half messages which may be processed + * at the other side. + * + * our policy is to start on a clean state when disconnects, we don't + * know what's send/received on transport layer in this case. + */ + spin_lock(&con->writequeue_lock); + if (!list_empty(&con->writequeue)) { + e = list_first_entry(&con->writequeue, struct writequeue_entry, + list); + if (e->dirty) + free_entry(e); } + spin_unlock(&con->writequeue_lock); con->rx_leftover = 0; con->retries = 0; clear_bit(CF_CONNECTED, &con->flags); + clear_bit(CF_DELAY_CONNECT, &con->flags); + clear_bit(CF_RECONNECT, &con->flags); + clear_bit(CF_EOF, &con->flags); mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); } @@ -841,19 +942,26 @@ out_resched: return -EAGAIN; out_close: - mutex_unlock(&con->sock_mutex); - if (ret != -EAGAIN) { - /* Reconnect when there is something to send */ - close_connection(con, false, true, false); - if (ret == 0) { - log_print("connection %p got EOF from %d", - con, con->nodeid); + if (ret == 0) { + log_print("connection %p got EOF from %d", + con, con->nodeid); + + if (con->eof_condition && con->eof_condition(con)) { + set_bit(CF_EOF, &con->flags); + mutex_unlock(&con->sock_mutex); + } else { + mutex_unlock(&con->sock_mutex); + close_connection(con, false, true, false); + /* handling for tcp shutdown */ clear_bit(CF_SHUTDOWN, &con->flags); wake_up(&con->shutdown_wait); - /* signal to breaking receive worker */ - ret = -1; } + + /* signal to breaking receive worker */ + ret = -1; + } else { + mutex_unlock(&con->sock_mutex); } return ret; } @@ -864,16 +972,12 @@ static int accept_from_sock(struct listen_connection *con) int result; struct sockaddr_storage peeraddr; struct socket *newsock; - int len; + int len, idx; int nodeid; struct connection *newcon; struct connection *addcon; unsigned int mark; - if (!dlm_allow_conn) { - return -1; - } - if (!con->sock) return -ENOTCONN; @@ -907,8 +1011,10 @@ static int accept_from_sock(struct listen_connection *con) * the same time and the connections cross on the wire. * In this case we store the incoming one in "othercon" */ + idx = srcu_read_lock(&connections_srcu); newcon = nodeid2con(nodeid, GFP_NOFS); if (!newcon) { + srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } @@ -924,6 +1030,7 @@ static int accept_from_sock(struct listen_connection *con) if (!othercon) { log_print("failed to allocate incoming socket"); mutex_unlock(&newcon->sock_mutex); + srcu_read_unlock(&connections_srcu, idx); result = -ENOMEM; goto accept_err; } @@ -932,11 +1039,14 @@ static int accept_from_sock(struct listen_connection *con) if (result < 0) { kfree(othercon); mutex_unlock(&newcon->sock_mutex); + srcu_read_unlock(&connections_srcu, idx); goto accept_err; } lockdep_set_subclass(&othercon->sock_mutex, 1); + set_bit(CF_IS_OTHERCON, &othercon->flags); newcon->othercon = othercon; + othercon->sendcon = newcon; } else { /* close other sock con if we have something new */ close_connection(othercon, false, true, false); @@ -966,6 +1076,8 @@ static int accept_from_sock(struct listen_connection *con) if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) queue_work(recv_workqueue, &addcon->rwork); + srcu_read_unlock(&connections_srcu, idx); + return 0; accept_err: @@ -977,12 +1089,6 @@ accept_err: return result; } -static void free_entry(struct writequeue_entry *e) -{ - __free_page(e->page); - kfree(e); -} - /* * writequeue_entry_complete - try to delete and free write queue entry * @e: write queue entry to try to delete @@ -994,11 +1100,11 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) { e->offset += completed; e->len -= completed; + /* signal that page was half way transmitted */ + e->dirty = true; - if (e->len == 0 && e->users == 0) { - list_del(&e->list); + if (e->len == 0 && e->users == 0) free_entry(e); - } } /* @@ -1075,7 +1181,7 @@ static void sctp_connect_to_sock(struct connection *con) make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len); - log_print("connecting to %d", con->nodeid); + log_print_ratelimited("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ sctp_sock_set_nodelay(sock->sk); @@ -1171,7 +1277,7 @@ static void tcp_connect_to_sock(struct connection *con) make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); - log_print("connecting to %d", con->nodeid); + log_print_ratelimited("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ tcp_sock_set_nodelay(sock->sk); @@ -1364,12 +1470,16 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con, entry->con = con; entry->users = 1; + kref_init(&entry->ref); + INIT_LIST_HEAD(&entry->msgs); return entry; } static struct writequeue_entry *new_wq_entry(struct connection *con, int len, - gfp_t allocation, char **ppc) + gfp_t allocation, char **ppc, + void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh) { struct writequeue_entry *e; @@ -1377,7 +1487,12 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, if (!list_empty(&con->writequeue)) { e = list_last_entry(&con->writequeue, struct writequeue_entry, list); if (DLM_WQ_REMAIN_BYTES(e) >= len) { + kref_get(&e->ref); + *ppc = page_address(e->page) + e->end; + if (cb) + cb(mh); + e->end += len; e->users++; spin_unlock(&con->writequeue_lock); @@ -1391,42 +1506,92 @@ static struct writequeue_entry *new_wq_entry(struct connection *con, int len, if (!e) return NULL; + kref_get(&e->ref); *ppc = page_address(e->page); e->end += len; + atomic_inc(&con->writequeue_cnt); spin_lock(&con->writequeue_lock); + if (cb) + cb(mh); + list_add_tail(&e->list, &con->writequeue); spin_unlock(&con->writequeue_lock); return e; }; -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, + gfp_t allocation, char **ppc, + void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh) +{ + struct writequeue_entry *e; + struct dlm_msg *msg; + + msg = kzalloc(sizeof(*msg), allocation); + if (!msg) + return NULL; + + kref_init(&msg->ref); + + e = new_wq_entry(con, len, allocation, ppc, cb, mh); + if (!e) { + kfree(msg); + return NULL; + } + + msg->ppc = *ppc; + msg->len = len; + msg->entry = e; + + return msg; +} + +struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, + char **ppc, void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh) { struct connection *con; + struct dlm_msg *msg; + int idx; - if (len > DEFAULT_BUFFER_SIZE || + if (len > DLM_MAX_SOCKET_BUFSIZE || len < sizeof(struct dlm_header)) { - BUILD_BUG_ON(PAGE_SIZE < DEFAULT_BUFFER_SIZE); + BUILD_BUG_ON(PAGE_SIZE < DLM_MAX_SOCKET_BUFSIZE); log_print("failed to allocate a buffer of size %d", len); WARN_ON(1); return NULL; } + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, allocation); - if (!con) + if (!con) { + srcu_read_unlock(&connections_srcu, idx); return NULL; + } - return new_wq_entry(con, len, allocation, ppc); + msg = dlm_lowcomms_new_msg_con(con, len, allocation, ppc, cb, mh); + if (!msg) { + srcu_read_unlock(&connections_srcu, idx); + return NULL; + } + + /* we assume if successful commit must called */ + msg->idx = idx; + return msg; } -void dlm_lowcomms_commit_buffer(void *mh) +static void _dlm_lowcomms_commit_msg(struct dlm_msg *msg) { - struct writequeue_entry *e = (struct writequeue_entry *)mh; + struct writequeue_entry *e = msg->entry; struct connection *con = e->con; int users; spin_lock(&con->writequeue_lock); + kref_get(&msg->ref); + list_add(&msg->list, &e->msgs); + users = --e->users; if (users) goto out; @@ -1442,6 +1607,42 @@ out: return; } +void dlm_lowcomms_commit_msg(struct dlm_msg *msg) +{ + _dlm_lowcomms_commit_msg(msg); + srcu_read_unlock(&connections_srcu, msg->idx); +} + +void dlm_lowcomms_put_msg(struct dlm_msg *msg) +{ + kref_put(&msg->ref, dlm_msg_release); +} + +/* does not held connections_srcu, usage workqueue only */ +int dlm_lowcomms_resend_msg(struct dlm_msg *msg) +{ + struct dlm_msg *msg_resend; + char *ppc; + + if (msg->retransmit) + return 1; + + msg_resend = dlm_lowcomms_new_msg_con(msg->entry->con, msg->len, + GFP_ATOMIC, &ppc, NULL, NULL); + if (!msg_resend) + return -ENOMEM; + + msg->retransmit = true; + kref_get(&msg->ref); + msg_resend->orig_msg = msg; + + memcpy(ppc, msg->ppc, msg->len); + _dlm_lowcomms_commit_msg(msg_resend); + dlm_lowcomms_put_msg(msg_resend); + + return 0; +} + /* Send a message */ static void send_to_sock(struct connection *con) { @@ -1483,7 +1684,7 @@ static void send_to_sock(struct connection *con) cond_resched(); goto out; } else if (ret < 0) - goto send_error; + goto out; } /* Don't starve people filling buffers */ @@ -1496,16 +1697,23 @@ static void send_to_sock(struct connection *con) writequeue_entry_complete(e, ret); } spin_unlock(&con->writequeue_lock); -out: - mutex_unlock(&con->sock_mutex); + + /* close if we got EOF */ + if (test_and_clear_bit(CF_EOF, &con->flags)) { + mutex_unlock(&con->sock_mutex); + close_connection(con, false, false, true); + + /* handling for tcp shutdown */ + clear_bit(CF_SHUTDOWN, &con->flags); + wake_up(&con->shutdown_wait); + } else { + mutex_unlock(&con->sock_mutex); + } + return; -send_error: +out: mutex_unlock(&con->sock_mutex); - close_connection(con, false, false, true); - /* Requeue the send work. When the work daemon runs again, it will try - a new connection, then call this function again. */ - queue_work(send_workqueue, &con->swork); return; out_connect: @@ -1520,7 +1728,6 @@ static void clean_one_writequeue(struct connection *con) spin_lock(&con->writequeue_lock); list_for_each_entry_safe(e, safe, &con->writequeue, list) { - list_del(&e->list); free_entry(e); } spin_unlock(&con->writequeue_lock); @@ -1532,8 +1739,10 @@ int dlm_lowcomms_close(int nodeid) { struct connection *con; struct dlm_node_addr *na; + int idx; log_print("closing connection to node %d", nodeid); + idx = srcu_read_lock(&connections_srcu); con = nodeid2con(nodeid, 0); if (con) { set_bit(CF_CLOSE, &con->flags); @@ -1542,6 +1751,7 @@ int dlm_lowcomms_close(int nodeid) if (con->othercon) clean_one_writequeue(con->othercon); } + srcu_read_unlock(&connections_srcu, idx); spin_lock(&dlm_node_addrs_spin); na = find_node_addr(nodeid); @@ -1578,35 +1788,50 @@ static void process_send_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, swork); + WARN_ON(test_bit(CF_IS_OTHERCON, &con->flags)); + clear_bit(CF_WRITE_PENDING, &con->flags); - if (con->sock == NULL) /* not mutex protected so check it inside too */ + + if (test_and_clear_bit(CF_RECONNECT, &con->flags)) { + close_connection(con, false, false, true); + dlm_midcomms_unack_msg_resend(con->nodeid); + } + + if (con->sock == NULL) { /* not mutex protected so check it inside too */ + if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) + msleep(1000); con->connect_action(con); + } if (!list_empty(&con->writequeue)) send_to_sock(con); } static void work_stop(void) { - if (recv_workqueue) + if (recv_workqueue) { destroy_workqueue(recv_workqueue); - if (send_workqueue) + recv_workqueue = NULL; + } + + if (send_workqueue) { destroy_workqueue(send_workqueue); + send_workqueue = NULL; + } } static int work_start(void) { - recv_workqueue = alloc_workqueue("dlm_recv", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + recv_workqueue = alloc_ordered_workqueue("dlm_recv", WQ_MEM_RECLAIM); if (!recv_workqueue) { log_print("can't start dlm_recv"); return -ENOMEM; } - send_workqueue = alloc_workqueue("dlm_send", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + send_workqueue = alloc_ordered_workqueue("dlm_send", WQ_MEM_RECLAIM); if (!send_workqueue) { log_print("can't start dlm_send"); destroy_workqueue(recv_workqueue); + recv_workqueue = NULL; return -ENOMEM; } @@ -1621,6 +1846,8 @@ static void shutdown_conn(struct connection *con) void dlm_lowcomms_shutdown(void) { + int idx; + /* Set all the flags to prevent any * socket activity. */ @@ -1633,7 +1860,9 @@ void dlm_lowcomms_shutdown(void) dlm_close_sock(&listen_con.sock); + idx = srcu_read_lock(&connections_srcu); foreach_conn(shutdown_conn); + srcu_read_unlock(&connections_srcu, idx); } static void _stop_conn(struct connection *con, bool and_other) @@ -1682,7 +1911,7 @@ static void free_conn(struct connection *con) static void work_flush(void) { - int ok, idx; + int ok; int i; struct connection *con; @@ -1693,7 +1922,6 @@ static void work_flush(void) flush_workqueue(recv_workqueue); if (send_workqueue) flush_workqueue(send_workqueue); - idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE && ok; i++) { hlist_for_each_entry_rcu(con, &connection_hash[i], list) { @@ -1707,14 +1935,17 @@ static void work_flush(void) } } } - srcu_read_unlock(&connections_srcu, idx); } while (!ok); } void dlm_lowcomms_stop(void) { + int idx; + + idx = srcu_read_lock(&connections_srcu); work_flush(); foreach_conn(free_conn); + srcu_read_unlock(&connections_srcu, idx); work_stop(); deinit_local(); } @@ -1738,15 +1969,24 @@ int dlm_lowcomms_start(void) error = work_start(); if (error) - goto fail; + goto fail_local; dlm_allow_conn = 1; /* Start listening */ - if (dlm_config.ci_protocol == 0) + switch (dlm_config.ci_protocol) { + case DLM_PROTO_TCP: error = tcp_listen_for_all(); - else + break; + case DLM_PROTO_SCTP: error = sctp_listen_for_all(&listen_con); + break; + default: + log_print("Invalid protocol identifier %d set", + dlm_config.ci_protocol); + error = -EINVAL; + break; + } if (error) goto fail_unlisten; @@ -1755,6 +1995,9 @@ int dlm_lowcomms_start(void) fail_unlisten: dlm_allow_conn = 0; dlm_close_sock(&listen_con.sock); + work_stop(); +fail_local: + deinit_local(); fail: return error; } diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 48bbc4e18761..aaae7115c00d 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -12,7 +12,22 @@ #ifndef __LOWCOMMS_DOT_H__ #define __LOWCOMMS_DOT_H__ -#define LOWCOMMS_MAX_TX_BUFFER_LEN 4096 +#include "dlm_internal.h" + +#define DLM_MIDCOMMS_OPT_LEN sizeof(struct dlm_opts) +#define DLM_MAX_APP_BUFSIZE (DLM_MAX_SOCKET_BUFSIZE - \ + DLM_MIDCOMMS_OPT_LEN) + +#define CONN_HASH_SIZE 32 + +/* This is deliberately very simple because most clusters have simple + * sequential nodeids, so we should be able to go straight to a connection + * struct in the array + */ +static inline int nodeid_hash(int nodeid) +{ + return nodeid & (CONN_HASH_SIZE-1); +} /* switch to check if dlm is running */ extern int dlm_allow_conn; @@ -22,8 +37,12 @@ void dlm_lowcomms_shutdown(void); void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); -void dlm_lowcomms_commit_buffer(void *mh); +struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation, + char **ppc, void (*cb)(struct dlm_mhandle *mh), + struct dlm_mhandle *mh); +void dlm_lowcomms_commit_msg(struct dlm_msg *msg); +void dlm_lowcomms_put_msg(struct dlm_msg *msg); +int dlm_lowcomms_resend_msg(struct dlm_msg *msg); int dlm_lowcomms_connect_node(int nodeid); int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index ceef3f2074ff..d9e1e4170eb1 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -15,6 +15,7 @@ #include "recover.h" #include "rcom.h" #include "config.h" +#include "midcomms.h" #include "lowcomms.h" int dlm_slots_version(struct dlm_header *h) @@ -270,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, log_slots(ls, gen, num, NULL, array, array_size); - max_slots = (LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom) - + max_slots = (DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom) - sizeof(struct rcom_config)) / sizeof(struct rcom_slot); if (num > max_slots) { @@ -329,6 +330,7 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) memb->nodeid = node->nodeid; memb->weight = node->weight; memb->comm_seq = node->comm_seq; + dlm_midcomms_add_member(node->nodeid); add_ordered_member(ls, memb); ls->ls_num_nodes++; return 0; @@ -359,26 +361,34 @@ int dlm_is_removed(struct dlm_ls *ls, int nodeid) return 0; } -static void clear_memb_list(struct list_head *head) +static void clear_memb_list(struct list_head *head, + void (*after_del)(int nodeid)) { struct dlm_member *memb; while (!list_empty(head)) { memb = list_entry(head->next, struct dlm_member, list); list_del(&memb->list); + if (after_del) + after_del(memb->nodeid); kfree(memb); } } +static void clear_members_cb(int nodeid) +{ + dlm_midcomms_remove_member(nodeid); +} + void dlm_clear_members(struct dlm_ls *ls) { - clear_memb_list(&ls->ls_nodes); + clear_memb_list(&ls->ls_nodes, clear_members_cb); ls->ls_num_nodes = 0; } void dlm_clear_members_gone(struct dlm_ls *ls) { - clear_memb_list(&ls->ls_nodes_gone); + clear_memb_list(&ls->ls_nodes_gone, NULL); } static void make_member_array(struct dlm_ls *ls) @@ -552,6 +562,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) neg++; list_move(&memb->list, &ls->ls_nodes_gone); + dlm_midcomms_remove_member(memb->nodeid); ls->ls_num_nodes--; dlm_lsop_recover_slot(ls, memb); } @@ -576,12 +587,18 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) *neg_out = neg; error = ping_members(ls); - if (!error || error == -EPROTO) { - /* new_lockspace() may be waiting to know if the config - is good or bad */ - ls->ls_members_result = error; - complete(&ls->ls_members_done); - } + /* error -EINTR means that a new recovery action is triggered. + * We ignore this recovery action and let run the new one which might + * have new member configuration. + */ + if (error == -EINTR) + error = 0; + + /* new_lockspace() may be waiting to know if the config + * is good or bad + */ + ls->ls_members_result = error; + complete(&ls->ls_members_done); log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes); return error; diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 1c6654a21ec4..e3de268898ed 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -3,7 +3,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2021 Red Hat, Inc. All rights reserved. ** ** ******************************************************************************* @@ -12,22 +12,866 @@ /* * midcomms.c * - * This is the appallingly named "mid-level" comms layer. + * This is the appallingly named "mid-level" comms layer. It takes care about + * deliver an on application layer "reliable" communication above the used + * lowcomms transport layer. * - * Its purpose is to take packets from the "real" comms layer, - * split them up into packets and pass them to the interested - * part of the locking mechanism. + * How it works: * - * It also takes messages from the locking layer, formats them - * into packets and sends them to the comms layer. + * Each nodes keeps track of all send DLM messages in send_queue with a sequence + * number. The receive will send an DLM_ACK message back for every DLM message + * received at the other side. If a reconnect happens in lowcomms we will send + * all unacknowledged dlm messages again. The receiving side might drop any already + * received message by comparing sequence numbers. + * + * How version detection works: + * + * Due the fact that dlm has pre-configured node addresses on every side + * it is in it's nature that every side connects at starts to transmit + * dlm messages which ends in a race. However DLM_RCOM_NAMES, DLM_RCOM_STATUS + * and their replies are the first messages which are exchanges. Due backwards + * compatibility these messages are not covered by the midcomms re-transmission + * layer. These messages have their own re-transmission handling in the dlm + * application layer. The version field of every node will be set on these RCOM + * messages as soon as they arrived and the node isn't yet part of the nodes + * hash. There exists also logic to detect version mismatched if something weird + * going on or the first messages isn't an expected one. + * + * Termination: + * + * The midcomms layer does a 4 way handshake for termination on DLM protocol + * like TCP supports it with half-closed socket support. SCTP doesn't support + * half-closed socket, so we do it on DLM layer. Also socket shutdown() can be + * interrupted by .e.g. tcp reset itself. Additional there exists the othercon + * paradigm in lowcomms which cannot be easily without breaking backwards + * compatibility. A node cannot send anything to another node when a DLM_FIN + * message was send. There exists additional logic to print a warning if + * DLM wants to do it. There exists a state handling like RFC 793 but reduced + * to termination only. The event "member removal event" describes the cluster + * manager removed the node from internal lists, at this point DLM does not + * send any message to the other node. There exists two cases: + * + * 1. The cluster member was removed and we received a FIN + * OR + * 2. We received a FIN but the member was not removed yet + * + * One of these cases will do the CLOSE_WAIT to LAST_ACK change. + * + * + * +---------+ + * | CLOSED | + * +---------+ + * | add member/receive RCOM version + * | detection msg + * V + * +---------+ + * | ESTAB | + * +---------+ + * CLOSE | | rcv FIN + * ------- | | ------- + * +---------+ snd FIN / \ snd ACK +---------+ + * | FIN |<----------------- ------------------>| CLOSE | + * | WAIT-1 |------------------ | WAIT | + * +---------+ rcv FIN \ +---------+ + * | rcv ACK of FIN ------- | CLOSE | member + * | -------------- snd ACK | ------- | removal + * V x V snd FIN V event + * +---------+ +---------+ +---------+ + * |FINWAIT-2| | CLOSING | | LAST-ACK| + * +---------+ +---------+ +---------+ + * | rcv ACK of FIN | rcv ACK of FIN | + * | rcv FIN -------------- | -------------- | + * | ------- x V x V + * \ snd ACK +---------+ +---------+ + * ------------------------>| CLOSED | | CLOSED | + * +---------+ +---------+ + * + * NOTE: any state can interrupted by midcomms_close() and state will be + * switched to CLOSED in case of fencing. There exists also some timeout + * handling when we receive the version detection RCOM messages which is + * made by observation. + * + * Future improvements: + * + * There exists some known issues/improvements of the dlm handling. Some + * of them should be done in a next major dlm version bump which makes + * it incompatible with previous versions. + * + * Unaligned memory access: + * + * There exists cases when the dlm message buffer length is not aligned + * to 8 byte. However seems nobody detected any problem with it. This + * can be fixed in the next major version bump of dlm. + * + * Version detection: + * + * The version detection and how it's done is related to backwards + * compatibility. There exists better ways to make a better handling. + * However this should be changed in the next major version bump of dlm. + * + * Ack handling: + * + * Currently we send an ack message for every dlm message. However we + * can ack multiple dlm messages with one ack by just delaying the ack + * message. Will reduce some traffic but makes the drop detection slower. + * + * Tail Size checking: + * + * There exists a message tail payload in e.g. DLM_MSG however we don't + * check it against the message length yet regarding to the receive buffer + * length. That need to be validated. + * + * Fencing bad nodes: + * + * At timeout places or weird sequence number behaviours we should send + * a fencing request to the cluster manager. + */ + +/* Debug switch to enable a 5 seconds sleep waiting of a termination. + * This can be useful to test fencing while termination is running. + * This requires a setup with only gfs2 as dlm user, so that the + * last umount will terminate the connection. + * + * However it became useful to test, while the 5 seconds block in umount + * just press the reset button. In a lot of dropping the termination + * process can could take several seconds. */ +#define DLM_DEBUG_FENCE_TERMINATION 0 + +#include <net/tcp.h> #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" #include "lock.h" +#include "util.h" #include "midcomms.h" +/* init value for sequence numbers for testing purpose only e.g. overflows */ +#define DLM_SEQ_INIT 0 +/* 3 minutes wait to sync ending of dlm */ +#define DLM_SHUTDOWN_TIMEOUT msecs_to_jiffies(3 * 60 * 1000) +#define DLM_VERSION_NOT_SET 0 + +struct midcomms_node { + int nodeid; + uint32_t version; + uint32_t seq_send; + uint32_t seq_next; + /* These queues are unbound because we cannot drop any message in dlm. + * We could send a fence signal for a specific node to the cluster + * manager if queues hits some maximum value, however this handling + * not supported yet. + */ + struct list_head send_queue; + spinlock_t send_queue_lock; + atomic_t send_queue_cnt; +#define DLM_NODE_FLAG_CLOSE 1 +#define DLM_NODE_FLAG_STOP_TX 2 +#define DLM_NODE_FLAG_STOP_RX 3 + unsigned long flags; + wait_queue_head_t shutdown_wait; + + /* dlm tcp termination state */ +#define DLM_CLOSED 1 +#define DLM_ESTABLISHED 2 +#define DLM_FIN_WAIT1 3 +#define DLM_FIN_WAIT2 4 +#define DLM_CLOSE_WAIT 5 +#define DLM_LAST_ACK 6 +#define DLM_CLOSING 7 + int state; + spinlock_t state_lock; + + /* counts how many lockspaces are using this node + * this refcount is necessary to determine if the + * node wants to disconnect. + */ + int users; + + /* not protected by srcu, node_hash lifetime */ + void *debugfs; + + struct hlist_node hlist; + struct rcu_head rcu; +}; + +struct dlm_mhandle { + const struct dlm_header *inner_hd; + struct midcomms_node *node; + struct dlm_opts *opts; + struct dlm_msg *msg; + bool committed; + uint32_t seq; + + void (*ack_rcv)(struct midcomms_node *node); + + /* get_mhandle/commit srcu idx exchange */ + int idx; + + struct list_head list; + struct rcu_head rcu; +}; + +static struct hlist_head node_hash[CONN_HASH_SIZE]; +static DEFINE_SPINLOCK(nodes_lock); +DEFINE_STATIC_SRCU(nodes_srcu); + +/* This mutex prevents that midcomms_close() is running while + * stop() or remove(). As I experienced invalid memory access + * behaviours when DLM_DEBUG_FENCE_TERMINATION is enabled and + * resetting machines. I will end in some double deletion in nodes + * datastructure. + */ +static DEFINE_MUTEX(close_lock); + +static inline const char *dlm_state_str(int state) +{ + switch (state) { + case DLM_CLOSED: + return "CLOSED"; + case DLM_ESTABLISHED: + return "ESTABLISHED"; + case DLM_FIN_WAIT1: + return "FIN_WAIT1"; + case DLM_FIN_WAIT2: + return "FIN_WAIT2"; + case DLM_CLOSE_WAIT: + return "CLOSE_WAIT"; + case DLM_LAST_ACK: + return "LAST_ACK"; + case DLM_CLOSING: + return "CLOSING"; + default: + return "UNKNOWN"; + } +} + +const char *dlm_midcomms_state(struct midcomms_node *node) +{ + return dlm_state_str(node->state); +} + +unsigned long dlm_midcomms_flags(struct midcomms_node *node) +{ + return node->flags; +} + +int dlm_midcomms_send_queue_cnt(struct midcomms_node *node) +{ + return atomic_read(&node->send_queue_cnt); +} + +uint32_t dlm_midcomms_version(struct midcomms_node *node) +{ + return node->version; +} + +static struct midcomms_node *__find_node(int nodeid, int r) +{ + struct midcomms_node *node; + + hlist_for_each_entry_rcu(node, &node_hash[r], hlist) { + if (node->nodeid == nodeid) + return node; + } + + return NULL; +} + +static void dlm_mhandle_release(struct rcu_head *rcu) +{ + struct dlm_mhandle *mh = container_of(rcu, struct dlm_mhandle, rcu); + + dlm_lowcomms_put_msg(mh->msg); + kfree(mh); +} + +static void dlm_mhandle_delete(struct midcomms_node *node, + struct dlm_mhandle *mh) +{ + list_del_rcu(&mh->list); + atomic_dec(&node->send_queue_cnt); + call_rcu(&mh->rcu, dlm_mhandle_release); +} + +static void dlm_send_queue_flush(struct midcomms_node *node) +{ + struct dlm_mhandle *mh; + + pr_debug("flush midcomms send queue of node %d\n", node->nodeid); + + rcu_read_lock(); + spin_lock(&node->send_queue_lock); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + dlm_mhandle_delete(node, mh); + } + spin_unlock(&node->send_queue_lock); + rcu_read_unlock(); +} + +static void midcomms_node_reset(struct midcomms_node *node) +{ + pr_debug("reset node %d\n", node->nodeid); + + node->seq_next = DLM_SEQ_INIT; + node->seq_send = DLM_SEQ_INIT; + node->version = DLM_VERSION_NOT_SET; + node->flags = 0; + + dlm_send_queue_flush(node); + node->state = DLM_CLOSED; + wake_up(&node->shutdown_wait); +} + +static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc) +{ + struct midcomms_node *node, *tmp; + int r = nodeid_hash(nodeid); + + node = __find_node(nodeid, r); + if (node || !alloc) + return node; + + node = kmalloc(sizeof(*node), alloc); + if (!node) + return NULL; + + node->nodeid = nodeid; + spin_lock_init(&node->state_lock); + spin_lock_init(&node->send_queue_lock); + atomic_set(&node->send_queue_cnt, 0); + INIT_LIST_HEAD(&node->send_queue); + init_waitqueue_head(&node->shutdown_wait); + node->users = 0; + midcomms_node_reset(node); + + spin_lock(&nodes_lock); + /* check again if there was somebody else + * earlier here to add the node + */ + tmp = __find_node(nodeid, r); + if (tmp) { + spin_unlock(&nodes_lock); + kfree(node); + return tmp; + } + + hlist_add_head_rcu(&node->hlist, &node_hash[r]); + spin_unlock(&nodes_lock); + + node->debugfs = dlm_create_debug_comms_file(nodeid, node); + return node; +} + +static int dlm_send_ack(int nodeid, uint32_t seq) +{ + int mb_len = sizeof(struct dlm_header); + struct dlm_header *m_header; + struct dlm_msg *msg; + char *ppc; + + msg = dlm_lowcomms_new_msg(nodeid, mb_len, GFP_NOFS, &ppc, + NULL, NULL); + if (!msg) + return -ENOMEM; + + m_header = (struct dlm_header *)ppc; + + m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = dlm_our_nodeid(); + m_header->h_length = mb_len; + m_header->h_cmd = DLM_ACK; + m_header->u.h_seq = seq; + + header_out(m_header); + dlm_lowcomms_commit_msg(msg); + dlm_lowcomms_put_msg(msg); + + return 0; +} + +static int dlm_send_fin(struct midcomms_node *node, + void (*ack_rcv)(struct midcomms_node *node)) +{ + int mb_len = sizeof(struct dlm_header); + struct dlm_header *m_header; + struct dlm_mhandle *mh; + char *ppc; + + mh = dlm_midcomms_get_mhandle(node->nodeid, mb_len, GFP_NOFS, &ppc); + if (!mh) + return -ENOMEM; + + mh->ack_rcv = ack_rcv; + + m_header = (struct dlm_header *)ppc; + + m_header->h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + m_header->h_nodeid = dlm_our_nodeid(); + m_header->h_length = mb_len; + m_header->h_cmd = DLM_FIN; + + header_out(m_header); + + pr_debug("sending fin msg to node %d\n", node->nodeid); + dlm_midcomms_commit_mhandle(mh); + set_bit(DLM_NODE_FLAG_STOP_TX, &node->flags); + + return 0; +} + +static void dlm_receive_ack(struct midcomms_node *node, uint32_t seq) +{ + struct dlm_mhandle *mh; + + rcu_read_lock(); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (before(mh->seq, seq)) { + if (mh->ack_rcv) + mh->ack_rcv(node); + } else { + /* send queue should be ordered */ + break; + } + } + + spin_lock(&node->send_queue_lock); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (before(mh->seq, seq)) { + dlm_mhandle_delete(node, mh); + } else { + /* send queue should be ordered */ + break; + } + } + spin_unlock(&node->send_queue_lock); + rcu_read_unlock(); +} + +static void dlm_pas_fin_ack_rcv(struct midcomms_node *node) +{ + spin_lock(&node->state_lock); + pr_debug("receive passive fin ack from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_LAST_ACK: + /* DLM_CLOSED */ + midcomms_node_reset(node); + break; + case DLM_CLOSED: + /* not valid but somehow we got what we want */ + wake_up(&node->shutdown_wait); + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d\n", + __func__, node->state); + WARN_ON(1); + return; + } + spin_unlock(&node->state_lock); +} + +static void dlm_midcomms_receive_buffer(union dlm_packet *p, + struct midcomms_node *node, + uint32_t seq) +{ + if (seq == node->seq_next) { + node->seq_next++; + /* send ack before fin */ + dlm_send_ack(node->nodeid, node->seq_next); + + switch (p->header.h_cmd) { + case DLM_FIN: + spin_lock(&node->state_lock); + pr_debug("receive fin msg from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_ESTABLISHED: + node->state = DLM_CLOSE_WAIT; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + /* passive shutdown DLM_LAST_ACK case 1 + * additional we check if the node is used by + * cluster manager events at all. + */ + if (node->users == 0) { + node->state = DLM_LAST_ACK; + pr_debug("switch node %d to state %s case 1\n", + node->nodeid, dlm_state_str(node->state)); + spin_unlock(&node->state_lock); + goto send_fin; + } + break; + case DLM_FIN_WAIT1: + node->state = DLM_CLOSING; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_FIN_WAIT2: + midcomms_node_reset(node); + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + wake_up(&node->shutdown_wait); + break; + case DLM_LAST_ACK: + /* probably remove_member caught it, do nothing */ + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d\n", + __func__, node->state); + WARN_ON(1); + return; + } + spin_unlock(&node->state_lock); + + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + break; + default: + WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer(p, node->nodeid); + break; + } + } else { + /* retry to ack message which we already have by sending back + * current node->seq_next number as ack. + */ + if (seq < node->seq_next) + dlm_send_ack(node->nodeid, node->seq_next); + + log_print_ratelimited("ignore dlm msg because seq mismatch, seq: %u, expected: %u, nodeid: %d", + seq, node->seq_next, node->nodeid); + } + + return; + +send_fin: + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); +} + +static struct midcomms_node * +dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p, + uint16_t msglen, int (*cb)(struct midcomms_node *node)) +{ + struct midcomms_node *node = NULL; + gfp_t allocation = 0; + int ret; + + switch (p->header.h_cmd) { + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("rcom msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return NULL; + } + + switch (le32_to_cpu(p->rcom.rc_type)) { + case DLM_RCOM_NAMES: + fallthrough; + case DLM_RCOM_NAMES_REPLY: + fallthrough; + case DLM_RCOM_STATUS: + fallthrough; + case DLM_RCOM_STATUS_REPLY: + node = nodeid2node(nodeid, 0); + if (node) { + spin_lock(&node->state_lock); + if (node->state != DLM_ESTABLISHED) + pr_debug("receive begin RCOM msg from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_ESTABLISHED: + break; + default: + /* some invalid state passive shutdown + * was failed, we try to reset and + * hope it will go on. + */ + log_print("reset node %d because shutdown stuck", + node->nodeid); + + midcomms_node_reset(node); + node->state = DLM_ESTABLISHED; + break; + } + spin_unlock(&node->state_lock); + } + + allocation = GFP_NOFS; + break; + default: + break; + } + + break; + default: + break; + } + + node = nodeid2node(nodeid, allocation); + if (!node) { + switch (p->header.h_cmd) { + case DLM_OPTS: + if (msglen < sizeof(struct dlm_opts)) { + log_print("opts msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return NULL; + } + + log_print_ratelimited("received dlm opts message nextcmd %d from node %d in an invalid sequence", + p->opts.o_nextcmd, nodeid); + break; + default: + log_print_ratelimited("received dlm message cmd %d from node %d in an invalid sequence", + p->header.h_cmd, nodeid); + break; + } + + return NULL; + } + + ret = cb(node); + if (ret < 0) + return NULL; + + return node; +} + +static int dlm_midcomms_version_check_3_2(struct midcomms_node *node) +{ + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_2; + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2, + node->nodeid); + break; + case DLM_VERSION_3_2: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_2, node->nodeid, node->version); + return -1; + } + + return 0; +} + +static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodeid) +{ + int len = msglen; + + /* we only trust outer header msglen because + * it's checked against receive buffer length. + */ + if (len < sizeof(struct dlm_opts)) + return -1; + len -= sizeof(struct dlm_opts); + + if (len < le16_to_cpu(p->opts.o_optlen)) + return -1; + len -= le16_to_cpu(p->opts.o_optlen); + + switch (p->opts.o_nextcmd) { + case DLM_FIN: + if (len < sizeof(struct dlm_header)) { + log_print("fin too small: %d, will skip this message from node %d", + len, nodeid); + return -1; + } + + break; + case DLM_MSG: + if (len < sizeof(struct dlm_message)) { + log_print("msg too small: %d, will skip this message from node %d", + msglen, nodeid); + return -1; + } + + break; + case DLM_RCOM: + if (len < sizeof(struct dlm_rcom)) { + log_print("rcom msg too small: %d, will skip this message from node %d", + len, nodeid); + return -1; + } + + break; + default: + log_print("unsupported o_nextcmd received: %u, will skip this message from node %d", + p->opts.o_nextcmd, nodeid); + return -1; + } + + return 0; +} + +static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid) +{ + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + uint32_t seq; + int ret, idx; + + idx = srcu_read_lock(&nodes_srcu); + node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, + dlm_midcomms_version_check_3_2); + if (!node) + goto out; + + switch (p->header.h_cmd) { + case DLM_RCOM: + /* these rcom message we use to determine version. + * they have their own retransmission handling and + * are the first messages of dlm. + * + * length already checked. + */ + switch (le32_to_cpu(p->rcom.rc_type)) { + case DLM_RCOM_NAMES: + fallthrough; + case DLM_RCOM_NAMES_REPLY: + fallthrough; + case DLM_RCOM_STATUS: + fallthrough; + case DLM_RCOM_STATUS_REPLY: + break; + default: + log_print("unsupported rcom type received: %u, will skip this message from node %d", + le32_to_cpu(p->rcom.rc_type), nodeid); + goto out; + } + + WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); + dlm_receive_buffer(p, nodeid); + break; + case DLM_OPTS: + seq = le32_to_cpu(p->header.u.h_seq); + + ret = dlm_opts_check_msglen(p, msglen, nodeid); + if (ret < 0) { + log_print("opts msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + p = (union dlm_packet *)((unsigned char *)p->opts.o_opts + + le16_to_cpu(p->opts.o_optlen)); + + /* recheck inner msglen just if it's not garbage */ + msglen = le16_to_cpu(p->header.h_length); + switch (p->header.h_cmd) { + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("inner rcom msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("inner msg too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + case DLM_FIN: + if (msglen < sizeof(struct dlm_header)) { + log_print("inner fin too small: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + break; + default: + log_print("unsupported inner h_cmd received: %u, will skip this message from node %d", + msglen, nodeid); + goto out; + } + + dlm_midcomms_receive_buffer(p, node, seq); + break; + case DLM_ACK: + seq = le32_to_cpu(p->header.u.h_seq); + dlm_receive_ack(node, seq); + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message from node %d", + p->header.h_cmd, nodeid); + break; + } + +out: + srcu_read_unlock(&nodes_srcu, idx); +} + +static int dlm_midcomms_version_check_3_1(struct midcomms_node *node) +{ + switch (node->version) { + case DLM_VERSION_NOT_SET: + node->version = DLM_VERSION_3_1; + log_print("version 0x%08x for node %d detected", DLM_VERSION_3_1, + node->nodeid); + break; + case DLM_VERSION_3_1: + break; + default: + log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x", + DLM_VERSION_3_1, node->nodeid, node->version); + return -1; + } + + return 0; +} + +static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid) +{ + uint16_t msglen = le16_to_cpu(p->header.h_length); + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen, + dlm_midcomms_version_check_3_1); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + srcu_read_unlock(&nodes_srcu, idx); + + switch (p->header.h_cmd) { + case DLM_RCOM: + /* length already checked */ + break; + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("msg too small: %u, will skip this message from node %d", + msglen, nodeid); + return; + } + + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message from node %d", + p->header.h_cmd, nodeid); + return; + } + + dlm_receive_buffer(p, nodeid); +} + /* * Called from the low-level comms layer to process a buffer of * commands. @@ -43,7 +887,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) while (len >= sizeof(struct dlm_header)) { hd = (struct dlm_header *)ptr; - /* no message should be more than DEFAULT_BUFFER_SIZE or + /* no message should be more than DLM_MAX_SOCKET_BUFSIZE or * less than dlm_header size. * * Some messages does not have a 8 byte length boundary yet @@ -55,7 +899,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) * the next major version bump. */ msglen = le16_to_cpu(hd->h_length); - if (msglen > DEFAULT_BUFFER_SIZE || + if (msglen > DLM_MAX_SOCKET_BUFSIZE || msglen < sizeof(struct dlm_header)) { log_print("received invalid length header: %u from node %d, will abort message parsing", msglen, nodeid); @@ -68,32 +912,19 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) if (msglen > len) break; - switch (hd->h_cmd) { - case DLM_MSG: - if (msglen < sizeof(struct dlm_message)) { - log_print("dlm msg too small: %u, will skip this message", - msglen); - goto skip; - } - + switch (le32_to_cpu(hd->h_version)) { + case DLM_VERSION_3_1: + dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid); break; - case DLM_RCOM: - if (msglen < sizeof(struct dlm_rcom)) { - log_print("dlm rcom msg too small: %u, will skip this message", - msglen); - goto skip; - } - + case DLM_VERSION_3_2: + dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid); break; default: - log_print("unsupported h_cmd received: %u, will skip this message", - hd->h_cmd); - goto skip; + log_print("received invalid version header: %u from node %d, will skip this message", + le32_to_cpu(hd->h_version), nodeid); + break; } - dlm_receive_buffer((union dlm_packet *)ptr, nodeid); - -skip: ret += msglen; len -= msglen; ptr += msglen; @@ -102,3 +933,455 @@ skip: return ret; } +void dlm_midcomms_unack_msg_resend(int nodeid) +{ + struct midcomms_node *node; + struct dlm_mhandle *mh; + int idx, ret; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + /* old protocol, we don't support to retransmit on failure */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + rcu_read_lock(); + list_for_each_entry_rcu(mh, &node->send_queue, list) { + if (!mh->committed) + continue; + + ret = dlm_lowcomms_resend_msg(mh->msg); + if (!ret) + log_print_ratelimited("retransmit dlm msg, seq %u, nodeid %d", + mh->seq, node->nodeid); + } + rcu_read_unlock(); + srcu_read_unlock(&nodes_srcu, idx); +} + +static void dlm_fill_opts_header(struct dlm_opts *opts, uint16_t inner_len, + uint32_t seq) +{ + opts->o_header.h_cmd = DLM_OPTS; + opts->o_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); + opts->o_header.h_nodeid = dlm_our_nodeid(); + opts->o_header.h_length = DLM_MIDCOMMS_OPT_LEN + inner_len; + opts->o_header.u.h_seq = seq; + header_out(&opts->o_header); +} + +static void midcomms_new_msg_cb(struct dlm_mhandle *mh) +{ + atomic_inc(&mh->node->send_queue_cnt); + + spin_lock(&mh->node->send_queue_lock); + list_add_tail_rcu(&mh->list, &mh->node->send_queue); + spin_unlock(&mh->node->send_queue_lock); + + mh->seq = mh->node->seq_send++; +} + +static struct dlm_msg *dlm_midcomms_get_msg_3_2(struct dlm_mhandle *mh, int nodeid, + int len, gfp_t allocation, char **ppc) +{ + struct dlm_opts *opts; + struct dlm_msg *msg; + + msg = dlm_lowcomms_new_msg(nodeid, len + DLM_MIDCOMMS_OPT_LEN, + allocation, ppc, midcomms_new_msg_cb, mh); + if (!msg) + return NULL; + + opts = (struct dlm_opts *)*ppc; + mh->opts = opts; + + /* add possible options here */ + dlm_fill_opts_header(opts, len, mh->seq); + + *ppc += sizeof(*opts); + mh->inner_hd = (const struct dlm_header *)*ppc; + return msg; +} + +struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, + gfp_t allocation, char **ppc) +{ + struct midcomms_node *node; + struct dlm_mhandle *mh; + struct dlm_msg *msg; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + WARN_ON_ONCE(1); + goto err; + } + + /* this is a bug, however we going on and hope it will be resolved */ + WARN_ON(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags)); + + mh = kzalloc(sizeof(*mh), GFP_NOFS); + if (!mh) + goto err; + + mh->idx = idx; + mh->node = node; + + switch (node->version) { + case DLM_VERSION_3_1: + msg = dlm_lowcomms_new_msg(nodeid, len, allocation, ppc, + NULL, NULL); + if (!msg) { + kfree(mh); + goto err; + } + + break; + case DLM_VERSION_3_2: + msg = dlm_midcomms_get_msg_3_2(mh, nodeid, len, allocation, + ppc); + if (!msg) { + kfree(mh); + goto err; + } + + break; + default: + kfree(mh); + WARN_ON(1); + goto err; + } + + mh->msg = msg; + + /* keep in mind that is a must to call + * dlm_midcomms_commit_msg() which releases + * nodes_srcu using mh->idx which is assumed + * here that the application will call it. + */ + return mh; + +err: + srcu_read_unlock(&nodes_srcu, idx); + return NULL; +} + +static void dlm_midcomms_commit_msg_3_2(struct dlm_mhandle *mh) +{ + /* nexthdr chain for fast lookup */ + mh->opts->o_nextcmd = mh->inner_hd->h_cmd; + mh->committed = true; + dlm_lowcomms_commit_msg(mh->msg); +} + +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh) +{ + switch (mh->node->version) { + case DLM_VERSION_3_1: + srcu_read_unlock(&nodes_srcu, mh->idx); + + dlm_lowcomms_commit_msg(mh->msg); + dlm_lowcomms_put_msg(mh->msg); + /* mh is not part of rcu list in this case */ + kfree(mh); + break; + case DLM_VERSION_3_2: + dlm_midcomms_commit_msg_3_2(mh); + srcu_read_unlock(&nodes_srcu, mh->idx); + break; + default: + srcu_read_unlock(&nodes_srcu, mh->idx); + WARN_ON(1); + break; + } +} + +int dlm_midcomms_start(void) +{ + int i; + + for (i = 0; i < CONN_HASH_SIZE; i++) + INIT_HLIST_HEAD(&node_hash[i]); + + return dlm_lowcomms_start(); +} + +static void dlm_act_fin_ack_rcv(struct midcomms_node *node) +{ + spin_lock(&node->state_lock); + pr_debug("receive active fin ack from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + + switch (node->state) { + case DLM_FIN_WAIT1: + node->state = DLM_FIN_WAIT2; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_CLOSING: + midcomms_node_reset(node); + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + wake_up(&node->shutdown_wait); + break; + case DLM_CLOSED: + /* not valid but somehow we got what we want */ + wake_up(&node->shutdown_wait); + break; + default: + spin_unlock(&node->state_lock); + log_print("%s: unexpected state: %d\n", + __func__, node->state); + WARN_ON(1); + return; + } + spin_unlock(&node->state_lock); +} + +void dlm_midcomms_add_member(int nodeid) +{ + struct midcomms_node *node; + int idx; + + if (nodeid == dlm_our_nodeid()) + return; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, GFP_NOFS); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + if (!node->users) { + pr_debug("receive add member from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + break; + case DLM_CLOSED: + node->state = DLM_ESTABLISHED; + pr_debug("switch node %d to state %s\n", + node->nodeid, dlm_state_str(node->state)); + break; + default: + /* some invalid state passive shutdown + * was failed, we try to reset and + * hope it will go on. + */ + log_print("reset node %d because shutdown stuck", + node->nodeid); + + midcomms_node_reset(node); + node->state = DLM_ESTABLISHED; + break; + } + } + + node->users++; + pr_debug("users inc count %d\n", node->users); + spin_unlock(&node->state_lock); + + srcu_read_unlock(&nodes_srcu, idx); +} + +void dlm_midcomms_remove_member(int nodeid) +{ + struct midcomms_node *node; + int idx; + + if (nodeid == dlm_our_nodeid()) + return; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + node->users--; + pr_debug("users dec count %d\n", node->users); + + /* hitting users count to zero means the + * other side is running dlm_midcomms_stop() + * we meet us to have a clean disconnect. + */ + if (node->users == 0) { + pr_debug("receive remove member from node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + break; + case DLM_CLOSE_WAIT: + /* passive shutdown DLM_LAST_ACK case 2 */ + node->state = DLM_LAST_ACK; + spin_unlock(&node->state_lock); + + pr_debug("switch node %d to state %s case 2\n", + node->nodeid, dlm_state_str(node->state)); + goto send_fin; + case DLM_LAST_ACK: + /* probably receive fin caught it, do nothing */ + break; + case DLM_CLOSED: + /* already gone, do nothing */ + break; + default: + log_print("%s: unexpected state: %d\n", + __func__, node->state); + break; + } + } + spin_unlock(&node->state_lock); + + srcu_read_unlock(&nodes_srcu, idx); + return; + +send_fin: + set_bit(DLM_NODE_FLAG_STOP_RX, &node->flags); + dlm_send_fin(node, dlm_pas_fin_ack_rcv); + srcu_read_unlock(&nodes_srcu, idx); +} + +static void midcomms_node_release(struct rcu_head *rcu) +{ + struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu); + + WARN_ON(atomic_read(&node->send_queue_cnt)); + kfree(node); +} + +static void midcomms_shutdown(struct midcomms_node *node) +{ + int ret; + + /* old protocol, we don't wait for pending operations */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + return; + } + + spin_lock(&node->state_lock); + pr_debug("receive active shutdown for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + switch (node->state) { + case DLM_ESTABLISHED: + node->state = DLM_FIN_WAIT1; + pr_debug("switch node %d to state %s case 2\n", + node->nodeid, dlm_state_str(node->state)); + break; + case DLM_CLOSED: + /* we have what we want */ + spin_unlock(&node->state_lock); + return; + default: + /* busy to enter DLM_FIN_WAIT1, wait until passive + * done in shutdown_wait to enter DLM_CLOSED. + */ + break; + } + spin_unlock(&node->state_lock); + + if (node->state == DLM_FIN_WAIT1) { + dlm_send_fin(node, dlm_act_fin_ack_rcv); + + if (DLM_DEBUG_FENCE_TERMINATION) + msleep(5000); + } + + /* wait for other side dlm + fin */ + ret = wait_event_timeout(node->shutdown_wait, + node->state == DLM_CLOSED || + test_bit(DLM_NODE_FLAG_CLOSE, &node->flags), + DLM_SHUTDOWN_TIMEOUT); + if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags)) { + pr_debug("active shutdown timed out for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); + midcomms_node_reset(node); + return; + } + + pr_debug("active shutdown done for node %d with state %s\n", + node->nodeid, dlm_state_str(node->state)); +} + +void dlm_midcomms_shutdown(void) +{ + struct midcomms_node *node; + int i, idx; + + mutex_lock(&close_lock); + idx = srcu_read_lock(&nodes_srcu); + for (i = 0; i < CONN_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(node, &node_hash[i], hlist) { + midcomms_shutdown(node); + + dlm_delete_debug_comms_file(node->debugfs); + + spin_lock(&nodes_lock); + hlist_del_rcu(&node->hlist); + spin_unlock(&nodes_lock); + + call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release); + } + } + srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); + + dlm_lowcomms_shutdown(); +} + +int dlm_midcomms_close(int nodeid) +{ + struct midcomms_node *node; + int idx, ret; + + if (nodeid == dlm_our_nodeid()) + return 0; + + idx = srcu_read_lock(&nodes_srcu); + /* Abort pending close/remove operation */ + node = nodeid2node(nodeid, 0); + if (node) { + /* let shutdown waiters leave */ + set_bit(DLM_NODE_FLAG_CLOSE, &node->flags); + wake_up(&node->shutdown_wait); + } + srcu_read_unlock(&nodes_srcu, idx); + + synchronize_srcu(&nodes_srcu); + + idx = srcu_read_lock(&nodes_srcu); + mutex_lock(&close_lock); + node = nodeid2node(nodeid, 0); + if (!node) { + mutex_unlock(&close_lock); + srcu_read_unlock(&nodes_srcu, idx); + return dlm_lowcomms_close(nodeid); + } + + ret = dlm_lowcomms_close(nodeid); + spin_lock(&node->state_lock); + midcomms_node_reset(node); + spin_unlock(&node->state_lock); + srcu_read_unlock(&nodes_srcu, idx); + mutex_unlock(&close_lock); + + return ret; +} diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 61e90a921849..579abc6929be 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -12,7 +12,22 @@ #ifndef __MIDCOMMS_DOT_H__ #define __MIDCOMMS_DOT_H__ +struct midcomms_node; + int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); +struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len, + gfp_t allocation, char **ppc); +void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh); +int dlm_midcomms_close(int nodeid); +int dlm_midcomms_start(void); +void dlm_midcomms_shutdown(void); +void dlm_midcomms_add_member(int nodeid); +void dlm_midcomms_remove_member(int nodeid); +void dlm_midcomms_unack_msg_resend(int nodeid); +const char *dlm_midcomms_state(struct midcomms_node *node); +unsigned long dlm_midcomms_flags(struct midcomms_node *node); +int dlm_midcomms_send_queue_cnt(struct midcomms_node *node); +uint32_t dlm_midcomms_version(struct midcomms_node *node); #endif /* __MIDCOMMS_DOT_H__ */ diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index f5b1bd65728d..5651933f54a4 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -27,25 +27,15 @@ static int rcom_response(struct dlm_ls *ls) return test_bit(LSFL_RCOM_READY, &ls->ls_flags); } -static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, - struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) +static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, char *mb, int mb_len) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; - char *mb; - int mb_len = sizeof(struct dlm_rcom) + len; - - mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); - if (!mh) { - log_print("create_rcom to %d type %d len %d ENOBUFS", - to_nodeid, type, len); - return -ENOBUFS; - } rc = (struct dlm_rcom *) mb; rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - rc->rc_header.h_lockspace = ls->ls_global_id; + rc->rc_header.u.h_lockspace = ls->ls_global_id; rc->rc_header.h_nodeid = dlm_our_nodeid(); rc->rc_header.h_length = mb_len; rc->rc_header.h_cmd = DLM_RCOM; @@ -56,16 +46,67 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, rc->rc_seq = ls->ls_recover_seq; spin_unlock(&ls->ls_recover_lock); - *mh_ret = mh; *rc_ret = rc; +} + +static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) +{ + int mb_len = sizeof(struct dlm_rcom) + len; + struct dlm_mhandle *mh; + char *mb; + + mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) { + log_print("%s to %d type %d len %d ENOBUFS", + __func__, to_nodeid, type, len); + return -ENOBUFS; + } + + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + *mh_ret = mh; + return 0; +} + +static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, + int len, struct dlm_rcom **rc_ret, + struct dlm_msg **msg_ret) +{ + int mb_len = sizeof(struct dlm_rcom) + len; + struct dlm_msg *msg; + char *mb; + + msg = dlm_lowcomms_new_msg(to_nodeid, mb_len, GFP_NOFS, &mb, + NULL, NULL); + if (!msg) { + log_print("create_rcom to %d type %d len %d ENOBUFS", + to_nodeid, type, len); + return -ENOBUFS; + } + + _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len); + *msg_ret = msg; return 0; } +static void _send_rcom(struct dlm_ls *ls, struct dlm_rcom *rc) +{ + dlm_rcom_out(rc); +} + static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, struct dlm_rcom *rc) { - dlm_rcom_out(rc); - dlm_lowcomms_commit_buffer(mh); + _send_rcom(ls, rc); + dlm_midcomms_commit_mhandle(mh); +} + +static void send_rcom_stateless(struct dlm_ls *ls, struct dlm_msg *msg, + struct dlm_rcom *rc) +{ + _send_rcom(ls, rc); + dlm_lowcomms_commit_msg(msg); + dlm_lowcomms_put_msg(msg); } static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs, @@ -141,7 +182,7 @@ static void disallow_sync_reply(struct dlm_ls *ls) int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; + struct dlm_msg *msg; int error = 0; ls->ls_recover_nodeid = nodeid; @@ -153,17 +194,17 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags) } retry: - error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, - sizeof(struct rcom_status), &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS, + sizeof(struct rcom_status), &rc, &msg); if (error) goto out; set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); + memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -191,11 +232,11 @@ retry: static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; struct rcom_status *rs; uint32_t status; int nodeid = rc_in->rc_header.h_nodeid; int len = sizeof(struct rcom_config); + struct dlm_msg *msg; int num_slots = 0; int error; @@ -218,8 +259,8 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) len += num_slots * sizeof(struct rcom_slot); do_create: - error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, - len, &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY, + len, &rc, &msg); if (error) return; @@ -246,7 +287,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_recover_lock); do_send: - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); } static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -271,21 +312,22 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; + struct dlm_msg *msg; int error = 0; ls->ls_recover_nodeid = nodeid; retry: - error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES, last_len, + &rc, &msg); if (error) goto out; memcpy(rc->rc_buf, last_name, last_len); allow_sync_reply(ls, &rc->rc_id); - memset(ls->ls_recover_buf, 0, LOWCOMMS_MAX_TX_BUFFER_LEN); + memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -298,14 +340,15 @@ retry: static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; - struct dlm_mhandle *mh; int error, inlen, outlen, nodeid; + struct dlm_msg *msg; nodeid = rc_in->rc_header.h_nodeid; inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); - outlen = LOWCOMMS_MAX_TX_BUFFER_LEN - sizeof(struct dlm_rcom); + outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom); - error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh); + error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, + &rc, &msg); if (error) return; rc->rc_id = rc_in->rc_id; @@ -313,7 +356,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); - send_rcom(ls, mh, rc); + send_rcom_stateless(ls, msg, rc); } int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) @@ -342,10 +385,6 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid; int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom); - error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); - if (error) - return; - /* Old code would send this special id to trigger a debug dump. */ if (rc_in->rc_id == 0xFFFFFFFF) { log_error(ls, "receive_rcom_lookup dump from %d", nodeid); @@ -353,6 +392,10 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) return; } + error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh); + if (error) + return; + error = dlm_master_lookup(ls, nodeid, rc_in->rc_buf, len, DLM_LU_RECOVER_MASTER, &ret_nodeid, NULL); if (error) @@ -458,14 +501,14 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) char *mb; int mb_len = sizeof(struct dlm_rcom) + sizeof(struct rcom_config); - mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb); + mh = dlm_midcomms_get_mhandle(nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; rc = (struct dlm_rcom *) mb; rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); - rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace; + rc->rc_header.u.h_lockspace = rc_in->rc_header.u.h_lockspace; rc->rc_header.h_nodeid = dlm_our_nodeid(); rc->rc_header.h_length = mb_len; rc->rc_header.h_cmd = DLM_RCOM; @@ -479,7 +522,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) rf->rf_lvblen = cpu_to_le32(~0U); dlm_rcom_out(rc); - dlm_lowcomms_commit_buffer(mh); + dlm_midcomms_commit_mhandle(mh); return 0; } diff --git a/fs/dlm/util.c b/fs/dlm/util.c index cfd0d00b19ae..58acbcc2081a 100644 --- a/fs/dlm/util.c +++ b/fs/dlm/util.c @@ -20,18 +20,20 @@ #define DLM_ERRNO_ETIMEDOUT 110 #define DLM_ERRNO_EINPROGRESS 115 -static void header_out(struct dlm_header *hd) +void header_out(struct dlm_header *hd) { hd->h_version = cpu_to_le32(hd->h_version); - hd->h_lockspace = cpu_to_le32(hd->h_lockspace); + /* does it for others u32 in union as well */ + hd->u.h_lockspace = cpu_to_le32(hd->u.h_lockspace); hd->h_nodeid = cpu_to_le32(hd->h_nodeid); hd->h_length = cpu_to_le16(hd->h_length); } -static void header_in(struct dlm_header *hd) +void header_in(struct dlm_header *hd) { hd->h_version = le32_to_cpu(hd->h_version); - hd->h_lockspace = le32_to_cpu(hd->h_lockspace); + /* does it for others u32 in union as well */ + hd->u.h_lockspace = le32_to_cpu(hd->u.h_lockspace); hd->h_nodeid = le32_to_cpu(hd->h_nodeid); hd->h_length = le16_to_cpu(hd->h_length); } diff --git a/fs/dlm/util.h b/fs/dlm/util.h index cc719ca9397e..d46f23c7a6a0 100644 --- a/fs/dlm/util.h +++ b/fs/dlm/util.h @@ -15,6 +15,8 @@ void dlm_message_out(struct dlm_message *ms); void dlm_message_in(struct dlm_message *ms); void dlm_rcom_out(struct dlm_rcom *rc); void dlm_rcom_in(struct dlm_rcom *rc); +void header_out(struct dlm_header *hd); +void header_in(struct dlm_header *hd); #endif diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 858b3339f381..906af0c1998c 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -75,4 +75,3 @@ config EROFS_FS_ZIP Enable fixed-sized output compression for EROFS. If you don't want to enable compression feature, say N. - diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index aea129ddda74..3701c72bacb2 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_COMPRESS_H #define __EROFS_FS_COMPRESS_H @@ -85,4 +84,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq, struct list_head *pagepool); #endif - diff --git a/fs/erofs/data.c b/fs/erofs/data.c index ebac756cb2a3..3787a5fb0a42 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" #include <linux/prefetch.h> @@ -315,4 +314,3 @@ const struct address_space_operations erofs_raw_access_aops = { .readahead = erofs_raw_access_readahead, .bmap = erofs_bmap, }; - diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 88e33addf229..a5bc4b1b7813 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "compress.h" #include <linux/module.h> @@ -407,4 +406,3 @@ int z_erofs_decompress(struct z_erofs_decompress_req *rq, return z_erofs_shifted_transform(rq, pagepool); return z_erofs_decompress_generic(rq, pagepool); } - diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2776bb832127..eee9b0b31b63 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" @@ -139,4 +138,3 @@ const struct file_operations erofs_dir_fops = { .read = generic_read_dir, .iterate_shared = erofs_readdir, }; - diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 8739d3adf51f..0f8da74570b4 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -4,7 +4,6 @@ * * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_H #define __EROFS_FS_H @@ -348,4 +347,3 @@ static inline void erofs_check_ondisk_layout_definitions(void) } #endif - diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 7ed2d7391692..aa8a0d770ba3 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "xattr.h" @@ -374,4 +373,3 @@ const struct inode_operations erofs_fast_symlink_iops = { .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, }; - diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index f92e3e32b9f4..543c2ff97d30 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_INTERNAL_H #define __EROFS_INTERNAL_H @@ -469,4 +468,3 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ - diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 3a81e1f7fc06..a8271ce5e13f 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "xattr.h" @@ -247,4 +246,3 @@ const struct inode_operations erofs_dir_iops = { .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, }; - diff --git a/fs/erofs/super.c b/fs/erofs/super.c index bbf3bbd908e0..8fc6c04b54f4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include <linux/module.h> #include <linux/buffer_head.h> @@ -285,6 +284,7 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } + ret = -EINVAL; blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ if (blkszbits != LOG_BLOCK_SIZE) { @@ -751,4 +751,3 @@ module_exit(erofs_module_exit); MODULE_DESCRIPTION("Enhanced ROM File System"); MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); MODULE_LICENSE("GPL"); - diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h index a72897c86744..64ceb7270b5c 100644 --- a/fs/erofs/tagptr.h +++ b/fs/erofs/tagptr.h @@ -1,8 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * A tagged pointer implementation - * - * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_TAGPTR_H #define __EROFS_FS_TAGPTR_H @@ -107,4 +105,3 @@ tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) *ptptr; }) #endif /* __EROFS_FS_TAGPTR_H */ - diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 6758c5b19f7c..bd86067a63f7 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" #include <linux/pagevec.h> @@ -278,4 +277,3 @@ void erofs_exit_shrinker(void) unregister_shrinker(&erofs_shrinker_info); } #endif /* !CONFIG_EROFS_FS_ZIP */ - diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 47314a26767a..8dd54b420a1d 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include <linux/security.h> #include "xattr.h" @@ -709,4 +708,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type) return acl; } #endif - diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 815304bd335f..366dcb400525 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_XATTR_H #define __EROFS_XATTR_H diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 78e4b598ecca..cb4d0889eca9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "zdata.h" #include "compress.h" @@ -380,7 +379,6 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, enum z_erofs_page_type type) { int ret; - bool occupied; /* give priority for inplaceio */ if (clt->mode >= COLLECT_PRIMARY && @@ -388,8 +386,7 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, z_erofs_try_inplace_io(clt, page)) return 0; - ret = z_erofs_pagevec_enqueue(&clt->vector, - page, type, &occupied); + ret = z_erofs_pagevec_enqueue(&clt->vector, page, type); clt->cl->vcnt += (unsigned int)ret; return ret ? 0 : -EAGAIN; @@ -1471,4 +1468,3 @@ const struct address_space_operations z_erofs_aops = { .readpage = z_erofs_readpage, .readahead = z_erofs_readahead, }; - diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 942ee69dff6a..3a008f1b9f78 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_ZDATA_H #define __EROFS_FS_ZDATA_H diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index efaf32596b97..f68aea4baed7 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -2,7 +2,6 @@ /* * Copyright (C) 2018-2019 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #include "internal.h" #include <asm/unaligned.h> @@ -597,4 +596,3 @@ out: DBG_BUGON(err < 0 && err != -ENOMEM); return err; } - diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h index 1d67cbd38704..dfd7fe0503bb 100644 --- a/fs/erofs/zpvec.h +++ b/fs/erofs/zpvec.h @@ -2,7 +2,6 @@ /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> */ #ifndef __EROFS_FS_ZPVEC_H #define __EROFS_FS_ZPVEC_H @@ -107,10 +106,8 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, struct page *page, - enum z_erofs_page_type type, - bool *occupied) + enum z_erofs_page_type type) { - *occupied = false; if (!ctor->next && type) if (ctor->index + 1 == ctor->nr) return false; @@ -125,7 +122,6 @@ static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, /* should remind that collector->next never equal to 1, 2 */ if (type == (uintptr_t)ctor->next) { ctor->next = page; - *occupied = true; } ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); return true; @@ -154,4 +150,3 @@ z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, return tagptr_unfold_ptr(t); } #endif - diff --git a/fs/exec.c b/fs/exec.c index 379afcf5d106..38f63451b928 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1357,6 +1357,10 @@ int begin_new_exec(struct linux_binprm * bprm) WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); + retval = set_cred_ucounts(bprm->cred); + if (retval < 0) + goto out_unlock; + /* * install the new credentials for this executable */ @@ -1871,7 +1875,7 @@ static int do_execveat_common(int fd, struct filename *filename, * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && - atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { + is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 37002663d521..3c51e243450d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -720,6 +720,7 @@ enum { #define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) +#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) @@ -741,6 +742,14 @@ enum { #define EXT4_STATE_FLAG_NEWENTRY 0x00000004 #define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 +/* flags for ioctl EXT4_IOC_CHECKPOINT */ +#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1 +#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2 +#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4 +#define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \ + EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ + EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -1477,7 +1486,7 @@ struct ext4_sb_info { unsigned int s_inode_goal; u32 s_hash_seed[4]; int s_def_hash_version; - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; @@ -1488,6 +1497,7 @@ struct ext4_sb_info { struct kobject s_kobj; struct completion s_kobj_unregister; struct super_block *s_sb; + struct buffer_head *s_mmp_bh; /* Journaling */ struct journal_s *s_journal; @@ -3614,6 +3624,7 @@ extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; /* sysfs.c */ +extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); extern int ext4_register_sysfs(struct super_block *sb); extern void ext4_unregister_sysfs(struct super_block *sb); extern int __init ext4_init_sysfs(void); @@ -3720,6 +3731,9 @@ extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); +/* mmp.c */ +extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); + /* verity.c */ extern const struct fsverity_operations ext4_verityops; @@ -3784,7 +3798,7 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) * have to read the block because we may read the old data * successfully. */ - if (!buffer_uptodate(bh) && buffer_write_io_error(bh)) + if (buffer_write_io_error(bh)) set_buffer_uptodate(bh); return buffer_uptodate(bh); } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index cbf37b2cf871..92ad64b89d9b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -825,6 +825,7 @@ void ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_entries = 0; eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); + eh->eh_generation = 0; ext4_mark_inode_dirty(handle, inode); } @@ -1090,6 +1091,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_depth = 0; + neh->eh_generation = 0; /* move remainder of path[depth] to the new leaf */ if (unlikely(path[depth].p_hdr->eh_entries != @@ -1167,6 +1169,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, neh->eh_magic = EXT4_EXT_MAGIC; neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); neh->eh_depth = cpu_to_le16(depth - i); + neh->eh_generation = 0; fidx = EXT_FIRST_INDEX(neh); fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); @@ -1306,6 +1309,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, neh->eh_magic = EXT4_EXT_MAGIC; ext4_extent_block_csum_set(inode, neh); set_buffer_uptodate(bh); + set_buffer_verified(bh); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, inode, bh); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0a729027322d..9a3a8996aacf 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1574,11 +1574,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); - if (!nr_to_scan) - return ret; - nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; } diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h index 68c8001fee85..ac642be2302e 100644 --- a/fs/ext4/fsmap.h +++ b/fs/ext4/fsmap.h @@ -50,7 +50,7 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, #define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ #define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */ #define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */ -#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */ -#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */ +#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* block bitmap */ +#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* inode bitmap */ #endif /* __EXT4_FSMAP_H__ */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 9bab7fd4ccd5..e89fc0f770b0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -402,7 +402,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, * * We always try to spread first-level directories. * - * If there are blockgroups with both free inodes and free blocks counts + * If there are blockgroups with both free inodes and free clusters counts * not worse than average we return one with smallest directory count. * Otherwise we simply return a random group. * @@ -411,7 +411,7 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g, * It's OK to put directory into a group unless * it has too many directories already (max_dirs) or * it has too few free inodes left (min_inodes) or - * it has too few free blocks left (min_blocks) or + * it has too few free clusters left (min_clusters) or * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more @@ -427,7 +427,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei, grp_free; - ext4_fsblk_t freeb, avefreec; + ext4_fsblk_t freec, avefreec; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_clusters; @@ -446,9 +446,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); avefreei = freei / ngroups; - freeb = EXT4_C2B(sbi, - percpu_counter_read_positive(&sbi->s_freeclusters_counter)); - avefreec = freeb; + freec = percpu_counter_read_positive(&sbi->s_freeclusters_counter); + avefreec = freec; do_div(avefreec, ngroups); ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3cf01629010d..70cb64db33f7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -204,7 +204,7 @@ out: /* * write the buffer to the inline inode. * If 'create' is set, we don't need to do the extra copy in the xattr - * value since it is already handled by ext4_xattr_ibody_inline_set. + * value since it is already handled by ext4_xattr_ibody_set. * That saves us one memcpy. */ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, @@ -286,7 +286,7 @@ static int ext4_create_inline_data(handle_t *handle, BUG_ON(!is.s.not_found); - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) { if (error == -ENOSPC) ext4_clear_inode_state(inode, @@ -358,7 +358,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, i.value = value; i.value_len = len; - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; @@ -431,7 +431,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, if (error) goto out; - error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (error) goto out; @@ -1925,8 +1925,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) i.value = value; i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ? i_size - EXT4_MIN_INLINE_DATA_SIZE : 0; - err = ext4_xattr_ibody_inline_set(handle, inode, - &i, &is); + err = ext4_xattr_ibody_set(handle, inode, &i, &is); if (err) goto out_error; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b8170a008590..d8de607849df 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -374,7 +374,7 @@ void ext4_da_update_reserve_space(struct inode *inode, ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) @@ -3223,7 +3223,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) ext4_clear_inode_state(inode, EXT4_STATE_JDATA); journal = EXT4_JOURNAL(inode); jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal); + err = jbd2_journal_flush(journal, 0); jbd2_journal_unlock_updates(journal); if (err) @@ -3418,7 +3418,7 @@ retry: * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ - else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode)) + else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; @@ -6005,7 +6005,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { - err = jbd2_journal_flush(journal); + err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_writepages_rwsem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 31627f7dc5cd..e27f34bceb8d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -659,10 +659,9 @@ static int ext4_ioc_getfsmap(struct super_block *sb, info.gi_sb = sb; info.gi_data = arg; error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); - if (error == EXT4_QUERY_RANGE_ABORT) { - error = 0; + if (error == EXT4_QUERY_RANGE_ABORT) aborted = true; - } else if (error) + else if (error) return error; /* If we didn't abort, set the "last" flag in the last fmx */ @@ -693,13 +692,6 @@ static long ext4_ioctl_group_add(struct file *file, if (err) return err; - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not supported with bigalloc"); - err = -EOPNOTSUPP; - goto group_add_out; - } - err = mnt_want_write_file(file); if (err) goto group_add_out; @@ -707,7 +699,7 @@ static long ext4_ioctl_group_add(struct file *file, err = ext4_group_add(sb, input); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) @@ -800,6 +792,57 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) return error; } +static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) +{ + int err = 0; + __u32 flags = 0; + unsigned int flush_flags = 0; + struct super_block *sb = file_inode(filp)->i_sb; + struct request_queue *q; + + if (copy_from_user(&flags, (__u32 __user *)arg, + sizeof(__u32))) + return -EFAULT; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* check for invalid bits set */ + if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) || + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) + return -EINVAL; + + if (!EXT4_SB(sb)->s_journal) + return -ENODEV; + + if (flags & ~JBD2_JOURNAL_FLUSH_VALID) + return -EINVAL; + + q = bdev_get_queue(EXT4_SB(sb)->s_journal->j_dev); + if (!q) + return -ENXIO; + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) + return 0; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD) + flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD; + + if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) { + flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT; + pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow"); + } + + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + + return err; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -871,13 +914,6 @@ setversion_out: goto group_extend_out; } - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not supported with bigalloc"); - err = -EOPNOTSUPP; - goto group_extend_out; - } - err = mnt_want_write_file(filp); if (err) goto group_extend_out; @@ -885,7 +921,7 @@ setversion_out: err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) @@ -1028,7 +1064,7 @@ mext_out: if (EXT4_SB(sb)->s_journal) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) @@ -1211,6 +1247,9 @@ resizefs_out: return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); + case EXT4_IOC_CHECKPOINT: + return ext4_ioctl_checkpoint(filp, arg); + default: return -ENOTTY; } @@ -1291,6 +1330,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case EXT4_IOC_GET_ES_CACHE: + case EXT4_IOC_CHECKPOINT: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 68fbeedd627b..6cb598b549ca 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -127,9 +127,9 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, */ static int kmmpd(void *data) { - struct super_block *sb = ((struct mmpd_data *) data)->sb; - struct buffer_head *bh = ((struct mmpd_data *) data)->bh; + struct super_block *sb = (struct super_block *) data; struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh; struct mmp_struct *mmp; ext4_fsblk_t mmp_block; u32 seq = 0; @@ -245,12 +245,18 @@ static int kmmpd(void *data) retval = write_mmp_block(sb, bh); exit_thread: - EXT4_SB(sb)->s_mmp_tsk = NULL; - kfree(data); - brelse(bh); return retval; } +void ext4_stop_mmpd(struct ext4_sb_info *sbi) +{ + if (sbi->s_mmp_tsk) { + kthread_stop(sbi->s_mmp_tsk); + brelse(sbi->s_mmp_bh); + sbi->s_mmp_tsk = NULL; + } +} + /* * Get a random new sequence number but make sure it is not greater than * EXT4_MMP_SEQ_MAX. @@ -275,7 +281,6 @@ int ext4_multi_mount_protect(struct super_block *sb, struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *bh = NULL; struct mmp_struct *mmp = NULL; - struct mmpd_data *mmpd_data; u32 seq; unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval); unsigned int wait_time = 0; @@ -364,24 +369,17 @@ skip: goto failed; } - mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL); - if (!mmpd_data) { - ext4_warning(sb, "not enough memory for mmpd_data"); - goto failed; - } - mmpd_data->sb = sb; - mmpd_data->bh = bh; + EXT4_SB(sb)->s_mmp_bh = bh; /* * Start a kernel thread to update the MMP block periodically. */ - EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s", + EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, sb, "kmmpd-%.*s", (int)sizeof(mmp->mmp_bdevname), bdevname(bh->b_bdev, mmp->mmp_bdevname)); if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) { EXT4_SB(sb)->s_mmp_tsk = NULL; - kfree(mmpd_data); ext4_warning(sb, "Unable to create kmmpd thread for %s.", sb->s_id); goto failed; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a4af26d4459a..5fd56f616cf0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2499,7 +2499,7 @@ again: /* Which index block gets the new entry? */ if (at - entries >= icount1) { - frame->at = at = at - entries - icount1 + entries2; + frame->at = at - entries - icount1 + entries2; frame->entries = entries = entries2; swap(frame->bh, bh2); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bd0d185654f3..fc885914c88a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -74,6 +74,15 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; } + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); + return -EOPNOTSUPP; + } + if (ext4_has_feature_sparse_super2(sb)) { + ext4_msg(sb, KERN_ERR, "Online resizing not supported with sparse_super2"); + return -EOPNOTSUPP; + } + if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags)) ret = -EBUSY; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d29f6aa7d96e..20344633bdd9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -718,6 +718,7 @@ static void flush_stashed_error_work(struct work_struct *work) goto write_directly; } jbd2_journal_stop(handle); + ext4_notify_error_sysfs(sbi); return; } write_directly: @@ -726,6 +727,7 @@ write_directly: * out and hope for the best. */ ext4_commit_super(sbi->s_sb); + ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ @@ -1174,6 +1176,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_sysfs(sb); if (sbi->s_journal) { + jbd2_journal_unregister_shrinker(sbi->s_journal); aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -1245,8 +1248,8 @@ static void ext4_put_super(struct super_block *sb) ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); + ext4_stop_mmpd(sbi); + brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* @@ -1441,26 +1444,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode) return ext4_write_inode(inode, &wbc); } -/* - * Try to release metadata pages (indirect blocks, directories) which are - * mapped via the block device. Since these pages could have journal heads - * which would prevent try_to_free_buffers() from freeing them, we must use - * jbd2 layer's try_to_free_buffers() function to release them. - */ -static int bdev_try_to_free_page(struct super_block *sb, struct page *page, - gfp_t wait) -{ - journal_t *journal = EXT4_SB(sb)->s_journal; - - WARN_ON(PageChecked(page)); - if (!page_has_buffers(page)) - return 0; - if (journal) - return jbd2_journal_try_to_free_buffers(journal, page); - - return try_to_free_buffers(page); -} - #ifdef CONFIG_FS_ENCRYPTION static int ext4_get_context(struct inode *inode, void *ctx, size_t len) { @@ -1655,7 +1638,6 @@ static const struct super_operations ext4_sops = { .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif - .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext4_export_ops = { @@ -3101,8 +3083,15 @@ static void ext4_orphan_cleanup(struct super_block *sb, inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); - if (ret) + if (ret) { + /* + * We need to clean up the in-core orphan list + * manually if ext4_truncate() failed to get a + * transaction handle. + */ + ext4_orphan_del(NULL, inode); ext4_std_error(inode->i_sb, ret); + } inode_unlock(inode); nr_truncates++; } else { @@ -5058,6 +5047,7 @@ no_journal: ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); + ret = -ENOMEM; goto failed_mount6; } @@ -5178,6 +5168,7 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { + jbd2_journal_unregister_shrinker(sbi->s_journal); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } @@ -5186,8 +5177,7 @@ failed_mount3a: failed_mount3: flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); + ext4_stop_mmpd(sbi); failed_mount2: rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); @@ -5504,6 +5494,12 @@ static int ext4_load_journal(struct super_block *sb, ext4_commit_super(sb); } + err = jbd2_journal_register_shrinker(journal); + if (err) { + EXT4_SB(sb)->s_journal = NULL; + goto err_out; + } + return 0; err_out: @@ -5646,7 +5642,7 @@ static int ext4_mark_recovery_complete(struct super_block *sb, return 0; } jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal); + err = jbd2_journal_flush(journal, 0); if (err < 0) goto out; @@ -5788,7 +5784,7 @@ static int ext4_freeze(struct super_block *sb) * Don't clear the needs_recovery flag if we failed to * flush the journal. */ - error = jbd2_journal_flush(journal); + error = jbd2_journal_flush(journal, 0); if (error < 0) goto out; @@ -5989,8 +5985,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) */ ext4_mark_recovery_complete(sb, es); } - if (sbi->s_mmp_tsk) - kthread_stop(sbi->s_mmp_tsk); + ext4_stop_mmpd(sbi); } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || @@ -6383,7 +6378,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * otherwise be livelocked... */ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); if (err) return err; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 55fcab60a59a..2314f7446592 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -506,6 +506,11 @@ static struct kobj_type ext4_feat_ktype = { .release = (void (*)(struct kobject *))kfree, }; +void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) +{ + sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); +} + static struct kobject *ext4_root; static struct kobject *ext4_feat; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 10ba4b24a0aa..6dd5c05c444a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2190,31 +2190,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return 0; } -int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is) -{ - struct ext4_xattr_ibody_header *header; - struct ext4_xattr_search *s = &is->s; - int error; - - if (EXT4_I(inode)->i_extra_isize == 0) - return -ENOSPC; - error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); - if (error) - return error; - header = IHDR(inode, ext4_raw_inode(&is->iloc)); - if (!IS_LAST_ENTRY(s->first)) { - header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); - ext4_set_inode_state(inode, EXT4_STATE_XATTR); - } else { - header->h_magic = cpu_to_le32(0); - ext4_clear_inode_state(inode, EXT4_STATE_XATTR); - } - return 0; -} - -static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, +int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 730b91fa0dd7..77efb9a627ad 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -186,9 +186,9 @@ extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size); -extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, - struct ext4_xattr_info *i, - struct ext4_xattr_ibody_find *is); +extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_info *i, + struct ext4_xattr_ibody_find *is); extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 62193106683d..06d04a74ab6c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -567,9 +567,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); - if (memcg_css) - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + if (memcg_css && !css_tryget(memcg_css)) + memcg_css = NULL; rcu_read_unlock(); + if (!memcg_css) + goto out_free; + + isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); if (!isw->new_wb) goto out_free; @@ -2343,28 +2348,6 @@ int dirtytime_interval_handler(struct ctl_table *table, int write, return ret; } -static noinline void block_dump___mark_inode_dirty(struct inode *inode) -{ - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { - struct dentry *dentry; - const char *name = "?"; - - dentry = d_find_alias(inode); - if (dentry) { - spin_lock(&dentry->d_lock); - name = (const char *) dentry->d_name.name; - } - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - if (dentry) { - spin_unlock(&dentry->d_lock); - dput(dentry); - } - } -} - /** * __mark_inode_dirty - internal function to mark an inode dirty * @@ -2434,9 +2417,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) (dirtytime && (inode->i_state & I_DIRTY_INODE))) return; - if (unlikely(block_dump)) - block_dump___mark_inode_dirty(inode); - spin_lock(&inode->i_lock); if (dirtytime && (inode->i_state & I_DIRTY_INODE)) goto out_unlock_inode; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 0bcf11a9987b..ed8b67b21718 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -56,14 +56,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, u64 block, struct page *page) { struct inode *inode = &ip->i_inode; - int release = 0; - - if (!page || page->index) { - page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); - if (!page) - return -ENOMEM; - release = 1; - } if (!PageUptodate(page)) { void *kaddr = kmap(page); @@ -97,26 +89,10 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, gfs2_ordered_add_inode(ip); } - if (release) { - unlock_page(page); - put_page(page); - } - return 0; } -/** - * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big - * @ip: The GFS2 inode to unstuff - * @page: The (optional) page. This is looked up if the @page is NULL - * - * This routine unstuffs a dinode and returns it to a "normal" state such - * that the height can be grown in the traditional way. - * - * Returns: errno - */ - -int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) +static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) { struct buffer_head *bh, *dibh; struct gfs2_dinode *di; @@ -124,11 +100,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) int isdir = gfs2_is_dir(ip); int error; - down_write(&ip->i_rw_mutex); - error = gfs2_meta_inode_buffer(ip, &dibh); if (error) - goto out; + return error; if (i_size_read(&ip->i_inode)) { /* Get a free block, fill it with the stuffed data, @@ -170,12 +144,38 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) out_brelse: brelse(dibh); + return error; +} + +/** + * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big + * @ip: The GFS2 inode to unstuff + * + * This routine unstuffs a dinode and returns it to a "normal" state such + * that the height can be grown in the traditional way. + * + * Returns: errno + */ + +int gfs2_unstuff_dinode(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct page *page; + int error; + + down_write(&ip->i_rw_mutex); + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); + error = -ENOMEM; + if (!page) + goto out; + error = __gfs2_unstuff_inode(ip, page); + unlock_page(page); + put_page(page); out: up_write(&ip->i_rw_mutex); return error; } - /** * find_metapath - Find path through the metadata tree * @sdp: The superblock @@ -1079,7 +1079,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, goto out_trans_fail; if (unstuff) { - ret = gfs2_unstuff_dinode(ip, NULL); + ret = gfs2_unstuff_dinode(ip); if (ret) goto out_trans_end; release_metapath(mp); @@ -2143,7 +2143,7 @@ static int do_grow(struct inode *inode, u64 size) goto do_grow_release; if (unstuff) { - error = gfs2_unstuff_dinode(ip, NULL); + error = gfs2_unstuff_dinode(ip); if (error) goto do_end_trans; } diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 6676d863faef..53cce6c08e81 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -46,7 +46,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, extern const struct iomap_ops gfs2_iomap_ops; extern const struct iomap_writeback_ops gfs2_writeback_ops; -extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); +extern int gfs2_unstuff_dinode(struct gfs2_inode *ip); extern int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 18f67b37d6f8..42b7dfffb5e7 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -172,7 +172,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, return -EINVAL; if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); + error = gfs2_unstuff_dinode(ip); if (error) return error; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 493a83e3f590..84ec053d43b4 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -210,7 +210,7 @@ void gfs2_set_inode_flags(struct inode *inode) /** * do_gfs2_set_flags - set flags on an inode - * @filp: file pointer + * @inode: The inode * @reqflags: The flags to set * @mask: Indicates which flags are valid * @fsflags: The FS_* inode flags passed in @@ -427,22 +427,25 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) struct gfs2_alloc_parms ap = { .aflags = 0, }; u64 offset = page_offset(page); unsigned int data_blocks, ind_blocks, rblocks; + vm_fault_t ret = VM_FAULT_LOCKED; struct gfs2_holder gh; unsigned int length; loff_t size; - int ret; + int err; sb_start_pagefault(inode->i_sb); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (ret) + err = gfs2_glock_nq(&gh); + if (err) { + ret = block_page_mkwrite_return(err); goto out_uninit; + } /* Check page index against inode size */ size = i_size_read(inode); if (offset >= size) { - ret = -EINVAL; + ret = VM_FAULT_SIGBUS; goto out_unlock; } @@ -450,8 +453,8 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) file_update_time(vmf->vma->vm_file); /* page is wholly or partially inside EOF */ - if (offset > size - PAGE_SIZE) - length = offset_in_page(size); + if (size - offset < PAGE_SIZE) + length = size - offset; else length = PAGE_SIZE; @@ -469,24 +472,30 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) !gfs2_write_alloc_required(ip, offset, length)) { lock_page(page); if (!PageUptodate(page) || page->mapping != inode->i_mapping) { - ret = -EAGAIN; + ret = VM_FAULT_NOPAGE; unlock_page(page); } goto out_unlock; } - ret = gfs2_rindex_update(sdp); - if (ret) + err = gfs2_rindex_update(sdp); + if (err) { + ret = block_page_mkwrite_return(err); goto out_unlock; + } gfs2_write_calc_reserv(ip, length, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; - ret = gfs2_quota_lock_check(ip, &ap); - if (ret) + err = gfs2_quota_lock_check(ip, &ap); + if (err) { + ret = block_page_mkwrite_return(err); goto out_unlock; - ret = gfs2_inplace_reserve(ip, &ap); - if (ret) + } + err = gfs2_inplace_reserve(ip, &ap); + if (err) { + ret = block_page_mkwrite_return(err); goto out_quota_unlock; + } rblocks = RES_DINODE + ind_blocks; if (gfs2_is_jdata(ip)) @@ -495,28 +504,38 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) rblocks += RES_STATFS + RES_QUOTA; rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); } - ret = gfs2_trans_begin(sdp, rblocks, 0); - if (ret) + err = gfs2_trans_begin(sdp, rblocks, 0); + if (err) { + ret = block_page_mkwrite_return(err); goto out_trans_fail; + } + + /* Unstuff, if required, and allocate backing blocks for page */ + if (gfs2_is_stuffed(ip)) { + err = gfs2_unstuff_dinode(ip); + if (err) { + ret = block_page_mkwrite_return(err); + goto out_trans_end; + } + } lock_page(page); - ret = -EAGAIN; /* If truncated, we must retry the operation, we may have raced * with the glock demotion code. */ - if (!PageUptodate(page) || page->mapping != inode->i_mapping) - goto out_trans_end; + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = VM_FAULT_NOPAGE; + goto out_page_locked; + } - /* Unstuff, if required, and allocate backing blocks for page */ - ret = 0; - if (gfs2_is_stuffed(ip)) - ret = gfs2_unstuff_dinode(ip, page); - if (ret == 0) - ret = gfs2_allocate_page_backing(page, length); + err = gfs2_allocate_page_backing(page, length); + if (err) + ret = block_page_mkwrite_return(err); -out_trans_end: - if (ret) +out_page_locked: + if (ret != VM_FAULT_LOCKED) unlock_page(page); +out_trans_end: gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); @@ -526,12 +545,12 @@ out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (ret == 0) { + if (ret == VM_FAULT_LOCKED) { set_page_dirty(page); wait_for_stable_page(page); } sb_end_pagefault(inode->i_sb); - return block_page_mkwrite_return(ret); + return ret; } static vm_fault_t gfs2_fault(struct vm_fault *vmf) @@ -962,7 +981,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, gfs2_trans_add_meta(ip->i_gl, dibh); if (gfs2_is_stuffed(ip)) { - error = gfs2_unstuff_dinode(ip, NULL); + error = gfs2_unstuff_dinode(ip); if (unlikely(error)) goto out; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d9cb261f55b0..1f3902ecdded 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -212,8 +212,7 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) spin_lock(&lru_lock); - list_del(&gl->gl_lru); - list_add_tail(&gl->gl_lru, &lru_list); + list_move_tail(&gl->gl_lru, &lru_list); if (!test_bit(GLF_LRU, &gl->gl_flags)) { set_bit(GLF_LRU, &gl->gl_flags); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 826f77d9cff5..5f4504dd0875 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -687,6 +687,7 @@ static int init_statfs(struct gfs2_sbd *sdp) } iput(pn); + pn = NULL; ip = GFS2_I(sdp->sd_sc_inode); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_sc_gh); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 94637c307cc8..be0997e24d60 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -825,7 +825,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, u64 size; if (gfs2_is_stuffed(ip)) { - err = gfs2_unstuff_dinode(ip, NULL); + err = gfs2_unstuff_dinode(ip); if (err) return err; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 30dee68458c7..926eeb9bf4eb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1446,7 +1446,7 @@ static int get_hstate_idx(int page_size_log) * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, struct user_struct **user, + vm_flags_t acctflag, struct ucounts **ucounts, int creat_flags, int page_size_log) { struct inode *inode; @@ -1458,20 +1458,20 @@ struct file *hugetlb_file_setup(const char *name, size_t size, if (hstate_idx < 0) return ERR_PTR(-ENODEV); - *user = NULL; + *ucounts = NULL; mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { - *user = current_user(); - if (user_shm_lock(size, *user)) { + *ucounts = current_ucounts(); + if (user_shm_lock(size, *ucounts)) { task_lock(current); pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", current->comm, current->pid); task_unlock(current); } else { - *user = NULL; + *ucounts = NULL; return ERR_PTR(-EPERM); } } @@ -1498,9 +1498,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, iput(inode); out: - if (*user) { - user_shm_unlock(size, *user); - *user = NULL; + if (*ucounts) { + user_shm_unlock(size, *ucounts); + *ucounts = NULL; } return file; } diff --git a/fs/io-wq.c b/fs/io-wq.c index b3e8624a37d0..843d4a7bcd6e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -9,8 +9,6 @@ #include <linux/init.h> #include <linux/errno.h> #include <linux/sched/signal.h> -#include <linux/mm.h> -#include <linux/sched/mm.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/rculist_nulls.h> @@ -96,13 +94,14 @@ struct io_wqe { struct io_wq *wq; struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; + + cpumask_var_t cpu_mask; }; /* * Per io_wq state */ struct io_wq { - struct io_wqe **wqes; unsigned long state; free_work_fn *free_work; @@ -110,14 +109,14 @@ struct io_wq { struct io_wq_hash *hash; - refcount_t refs; - atomic_t worker_refs; struct completion worker_done; struct hlist_node cpuhp_node; struct task_struct *task; + + struct io_wqe *wqes[]; }; static enum cpuhp_state io_wq_online; @@ -241,7 +240,8 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) * Most likely an attempt to queue unbounded work on an io_wq that * wasn't setup with any unbounded workers. */ - WARN_ON_ONCE(!acct->max_workers); + if (unlikely(!acct->max_workers)) + pr_warn_once("io-wq is not configured for unbound workers"); rcu_read_lock(); ret = io_wqe_activate_free_worker(wqe); @@ -560,17 +560,13 @@ loop: if (ret) continue; /* timed out, exit unless we're the fixed worker */ - if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || - !(worker->flags & IO_WORKER_F_FIXED)) + if (!(worker->flags & IO_WORKER_F_FIXED)) break; } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { raw_spin_lock_irq(&wqe->lock); - if (!wq_list_empty(&wqe->work_list)) - io_worker_handle_work(worker); - else - raw_spin_unlock_irq(&wqe->lock); + io_worker_handle_work(worker); } io_worker_exit(worker); @@ -645,7 +641,7 @@ fail: tsk->pf_io_worker = worker; worker->task = tsk; - set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node)); + set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; raw_spin_lock_irq(&wqe->lock); @@ -901,23 +897,20 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { - int ret = -ENOMEM, node; + int ret, node; struct io_wq *wq; if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); + if (WARN_ON_ONCE(!bounded)) + return ERR_PTR(-EINVAL); - wq = kzalloc(sizeof(*wq), GFP_KERNEL); + wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); - - wq->wqes = kcalloc(nr_node_ids, sizeof(struct io_wqe *), GFP_KERNEL); - if (!wq->wqes) - goto err_wq; - ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); if (ret) - goto err_wqes; + goto err_wq; refcount_inc(&data->hash->refs); wq->hash = data->hash; @@ -934,6 +927,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); if (!wqe) goto err; + if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) + goto err; + cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); wq->wqes[node] = wqe; wqe->node = alloc_node; wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND; @@ -953,17 +949,18 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) } wq->task = get_task_struct(data->task); - refcount_set(&wq->refs, 1); atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); return wq; err: io_wq_put_hash(data->hash); cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - for_each_node(node) + for_each_node(node) { + if (!wq->wqes[node]) + continue; + free_cpumask_var(wq->wqes[node]->cpu_mask); kfree(wq->wqes[node]); -err_wqes: - kfree(wq->wqes); + } err_wq: kfree(wq); return ERR_PTR(ret); @@ -1033,10 +1030,10 @@ static void io_wq_destroy(struct io_wq *wq) .cancel_all = true, }; io_wqe_cancel_pending_work(wqe, &match); + free_cpumask_var(wqe->cpu_mask); kfree(wqe); } io_wq_put_hash(wq->hash); - kfree(wq->wqes); kfree(wq); } @@ -1045,25 +1042,67 @@ void io_wq_put_and_exit(struct io_wq *wq) WARN_ON_ONCE(!test_bit(IO_WQ_BIT_EXIT, &wq->state)); io_wq_exit_workers(wq); - if (refcount_dec_and_test(&wq->refs)) - io_wq_destroy(wq); + io_wq_destroy(wq); } +struct online_data { + unsigned int cpu; + bool online; +}; + static bool io_wq_worker_affinity(struct io_worker *worker, void *data) { - set_cpus_allowed_ptr(worker->task, cpumask_of_node(worker->wqe->node)); + struct online_data *od = data; + if (od->online) + cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask); + else + cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask); return false; } +static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) +{ + struct online_data od = { + .cpu = cpu, + .online = online + }; + int i; + + rcu_read_lock(); + for_each_node(i) + io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od); + rcu_read_unlock(); + return 0; +} + static int io_wq_cpu_online(unsigned int cpu, struct hlist_node *node) { struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); + + return __io_wq_cpu_online(wq, cpu, true); +} + +static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) +{ + struct io_wq *wq = hlist_entry_safe(node, struct io_wq, cpuhp_node); + + return __io_wq_cpu_online(wq, cpu, false); +} + +int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) +{ int i; rcu_read_lock(); - for_each_node(i) - io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, NULL); + for_each_node(i) { + struct io_wqe *wqe = wq->wqes[i]; + + if (mask) + cpumask_copy(wqe->cpu_mask, mask); + else + cpumask_copy(wqe->cpu_mask, cpumask_of_node(i)); + } rcu_read_unlock(); return 0; } @@ -1073,7 +1112,7 @@ static __init int io_wq_init(void) int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "io-wq/online", - io_wq_cpu_online, NULL); + io_wq_cpu_online, io_wq_cpu_offline); if (ret < 0) return ret; io_wq_online = ret; diff --git a/fs/io-wq.h b/fs/io-wq.h index af2df0680ee2..3999ee58ff26 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -87,7 +87,6 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - const struct cred *creds; unsigned flags; }; @@ -128,6 +127,8 @@ void io_wq_put_and_exit(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); +int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); + static inline bool io_wq_is_hashed(struct io_wq_work *work) { return work->flags & IO_WQ_WORK_HASHED; diff --git a/fs/io_uring.c b/fs/io_uring.c index fa8794c61af7..e55b21fc0ab2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -11,7 +11,7 @@ * before writing the tail (using smp_load_acquire to read the tail will * do). It also needs a smp_mb() before updating CQ head (ordering the * entry load(s) with the head store), pairing with an implicit barrier - * through a control-dependency in io_get_cqring (smp_store_release to + * through a control-dependency in io_get_cqe (smp_store_release to * store head will do). Failure to do so could lead to reading invalid * CQ entries. * @@ -89,6 +89,7 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) +#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* * Shift of 9 is 512 entries, or exactly one page on 64-bit archs @@ -100,11 +101,19 @@ #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) +#define IO_RSRC_TAG_TABLE_SHIFT 9 +#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) +#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) + #define IORING_MAX_REG_BUFFERS (1U << 14) #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ IOSQE_BUFFER_SELECT) +#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ + REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS) + +#define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -164,7 +173,7 @@ struct io_rings { * Written by the application, shouldn't be modified by the * kernel. */ - u32 cq_flags; + u32 cq_flags; /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure @@ -243,7 +252,8 @@ typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); struct io_rsrc_data { struct io_ring_ctx *ctx; - u64 *tags; + u64 **tags; + unsigned int nr; rsrc_put_fn *do_put; atomic_t refs; struct completion done; @@ -288,7 +298,6 @@ struct io_sq_data { unsigned long state; struct completion exited; - struct callback_head *park_task_work; }; #define IO_IOPOLL_BATCH 8 @@ -299,11 +308,8 @@ struct io_sq_data { struct io_comp_state { struct io_kiocb *reqs[IO_COMPL_BATCH]; unsigned int nr; - unsigned int locked_free_nr; /* inline/task_work completion list, under ->uring_lock */ struct list_head free_list; - /* IRQ completion list, under ->completion_lock */ - struct list_head locked_free_list; }; struct io_submit_link { @@ -338,16 +344,23 @@ struct io_submit_state { }; struct io_ring_ctx { + /* const or read-mostly hot data */ struct { struct percpu_ref refs; - } ____cacheline_aligned_in_smp; - struct { + struct io_rings *rings; unsigned int flags; unsigned int compat: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; + unsigned int off_timeout_used: 1; + unsigned int drain_active: 1; + } ____cacheline_aligned_in_smp; + + /* submission data */ + struct { + struct mutex uring_lock; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -361,35 +374,33 @@ struct io_ring_ctx { * array. */ u32 *sq_array; + struct io_uring_sqe *sq_sqes; unsigned cached_sq_head; unsigned sq_entries; - unsigned sq_mask; - unsigned sq_thread_idle; - unsigned cached_sq_dropped; - unsigned cached_cq_overflow; - unsigned long sq_check_overflow; + struct list_head defer_list; - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; + /* + * Fixed resources fast path, should be accessed only under + * uring_lock, and updated through io_uring_register(2) + */ + struct io_rsrc_node *rsrc_node; + struct io_file_table file_table; + unsigned nr_user_files; + unsigned nr_user_bufs; + struct io_mapped_ubuf **user_bufs; - struct list_head defer_list; + struct io_submit_state submit_state; struct list_head timeout_list; struct list_head cq_overflow_list; - - struct io_uring_sqe *sq_sqes; - } ____cacheline_aligned_in_smp; - - struct { - struct mutex uring_lock; - wait_queue_head_t wait; + struct xarray io_buffers; + struct xarray personalities; + u32 pers_next; + unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; - struct io_submit_state submit_state; - - struct io_rings *rings; - - /* Only used for accounting purposes */ - struct mm_struct *mm_account; + /* IRQ completion list, under ->completion_lock */ + struct list_head locked_free_list; + unsigned int locked_free_nr; const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ @@ -397,44 +408,18 @@ struct io_ring_ctx { struct wait_queue_head sqo_sq_wait; struct list_head sqd_list; - /* - * If used, fixed file set. Writers must ensure that ->refs is dead, - * readers must ensure that ->refs is alive as long as the file* is - * used. Only updated through io_uring_register(2). - */ - struct io_rsrc_data *file_data; - struct io_file_table file_table; - unsigned nr_user_files; - - /* if used, fixed mapped user buffers */ - struct io_rsrc_data *buf_data; - unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; - - struct user_struct *user; - - struct completion ref_comp; - -#if defined(CONFIG_UNIX) - struct socket *ring_sock; -#endif - - struct xarray io_buffers; - - struct xarray personalities; - u32 pers_next; + unsigned long check_cq_overflow; struct { unsigned cached_cq_tail; unsigned cq_entries; - unsigned cq_mask; - atomic_t cq_timeouts; - unsigned cq_last_tm_flush; - unsigned cq_extra; - unsigned long cq_check_overflow; + struct eventfd_ctx *cq_ev_fd; + struct wait_queue_head poll_wait; struct wait_queue_head cq_wait; + unsigned cq_extra; + atomic_t cq_timeouts; struct fasync_struct *cq_fasync; - struct eventfd_ctx *cq_ev_fd; + unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; struct { @@ -449,29 +434,47 @@ struct io_ring_ctx { struct list_head iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; - bool poll_multi_file; + bool poll_multi_queue; } ____cacheline_aligned_in_smp; - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - struct io_rsrc_node *rsrc_node; - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_restriction restrictions; - /* exit task_work */ - struct callback_head *exit_task_work; + /* slow path rsrc auxilary data, used by update/register */ + struct { + struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; + struct io_rsrc_data *file_data; + struct io_rsrc_data *buf_data; + + struct delayed_work rsrc_put_work; + struct llist_head rsrc_put_llist; + struct list_head rsrc_ref_list; + spinlock_t rsrc_ref_lock; + }; /* Keep this last, we don't need it for the fast path */ - struct work_struct exit_work; - struct list_head tctx_list; + struct { + #if defined(CONFIG_UNIX) + struct socket *ring_sock; + #endif + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + + /* Only used for accounting purposes */ + struct user_struct *user; + struct mm_struct *mm_account; + + /* ctx exit and cancelation */ + struct callback_head *exit_task_work; + struct work_struct exit_work; + struct list_head tctx_list; + struct completion ref_comp; + }; }; struct io_uring_task { /* submission side */ + int cached_refs; struct xarray xa; struct wait_queue_head wait; const struct io_ring_ctx *last; @@ -706,7 +709,7 @@ enum { REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, /* first byte is taken by user flags, shift it to not overlap */ - REQ_F_FAIL_LINK_BIT = 8, + REQ_F_FAIL_BIT = 8, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, @@ -718,6 +721,7 @@ enum { REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, + REQ_F_CREDS_BIT, /* keep async read/write and isreg together and in order */ REQ_F_ASYNC_READ_BIT, REQ_F_ASYNC_WRITE_BIT, @@ -742,7 +746,7 @@ enum { REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), /* fail rest of links */ - REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), + REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), /* on inflight list, should be cancelled and waited on exit reliably */ REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), /* read/write uses file position */ @@ -771,6 +775,8 @@ enum { REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* has creds assigned */ + REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), }; struct async_poll { @@ -851,6 +857,8 @@ struct io_kiocb { struct hlist_node hash_node; struct async_poll *apoll; struct io_wq_work work; + const struct cred *creds; + /* store used ubuf, so we can prevent reloading */ struct io_mapped_ubuf *imu; }; @@ -1034,11 +1042,11 @@ static const struct io_op_def io_op_defs[] = { }; static bool io_disarm_next(struct io_kiocb *req); -static void io_uring_del_task_file(unsigned long index); +static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, - struct files_struct *files); -static void io_uring_cancel_sqpoll(struct io_sq_data *sqd); + bool cancel_all); +static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, @@ -1059,8 +1067,7 @@ static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); -static void io_submit_flush_completions(struct io_comp_state *cs, - struct io_ring_ctx *ctx); +static void io_submit_flush_completions(struct io_ring_ctx *ctx); static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); @@ -1106,15 +1113,14 @@ static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) percpu_ref_put(ref); } -static bool io_match_task(struct io_kiocb *head, - struct task_struct *task, - struct files_struct *files) +static bool io_match_task(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) { struct io_kiocb *req; if (task && head->task != task) return false; - if (!files) + if (cancel_all) return true; io_for_each_link(req, head) { @@ -1124,10 +1130,9 @@ static bool io_match_task(struct io_kiocb *head, return false; } -static inline void req_set_fail_links(struct io_kiocb *req) +static inline void req_set_fail(struct io_kiocb *req) { - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req->flags |= REQ_F_FAIL; } static void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1179,13 +1184,13 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->flags = p->flags; init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); - init_waitqueue_head(&ctx->cq_wait); + init_waitqueue_head(&ctx->poll_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); - init_waitqueue_head(&ctx->wait); + init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); @@ -1196,7 +1201,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_llist_head(&ctx->rsrc_put_llist); INIT_LIST_HEAD(&ctx->tctx_list); INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); - INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list); + INIT_LIST_HEAD(&ctx->locked_free_list); return ctx; err: kfree(ctx->dummy_ubuf); @@ -1205,13 +1210,20 @@ err: return NULL; } +static void io_account_cq_overflow(struct io_ring_ctx *ctx) +{ + struct io_rings *r = ctx->rings; + + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); + ctx->cq_extra--; +} + static bool req_need_defer(struct io_kiocb *req, u32 seq) { if (unlikely(req->flags & REQ_F_IO_DRAIN)) { struct io_ring_ctx *ctx = req->ctx; - return seq + ctx->cq_extra != ctx->cached_cq_tail - + READ_ONCE(ctx->cached_cq_overflow); + return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; } return false; @@ -1230,8 +1242,10 @@ static void io_prep_async_work(struct io_kiocb *req) const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; - if (!req->work.creds) - req->work.creds = get_current_cred(); + if (!(req->flags & REQ_F_CREDS)) { + req->flags |= REQ_F_CREDS; + req->creds = get_current_cred(); + } req->work.list.next = NULL; req->work.flags = 0; @@ -1295,9 +1309,9 @@ static void io_kill_timeout(struct io_kiocb *req, int status) } } -static void __io_queue_deferred(struct io_ring_ctx *ctx) +static void io_queue_deferred(struct io_ring_ctx *ctx) { - do { + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); @@ -1306,19 +1320,14 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) list_del_init(&de->list); io_req_task_queue(de->req); kfree(de); - } while (!list_empty(&ctx->defer_list)); + } } static void io_flush_timeouts(struct io_ring_ctx *ctx) { - u32 seq; + u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - if (list_empty(&ctx->timeout_list)) - return; - - seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - - do { + while (!list_empty(&ctx->timeout_list)) { u32 events_needed, events_got; struct io_kiocb *req = list_first_entry(&ctx->timeout_list, struct io_kiocb, timeout.list); @@ -1340,27 +1349,31 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) list_del_init(&req->timeout.list); io_kill_timeout(req, 0); - } while (!list_empty(&ctx->timeout_list)); - + } ctx->cq_last_tm_flush = seq; } -static void io_commit_cqring(struct io_ring_ctx *ctx) +static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { - io_flush_timeouts(ctx); + if (ctx->off_timeout_used) + io_flush_timeouts(ctx); + if (ctx->drain_active) + io_queue_deferred(ctx); +} +static inline void io_commit_cqring(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active)) + __io_commit_cqring_flush(ctx); /* order cqe stores with ring update */ smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); - - if (unlikely(!list_empty(&ctx->defer_list))) - __io_queue_deferred(ctx); } static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == r->sq_ring_entries; + return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; } static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) @@ -1368,21 +1381,21 @@ static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); } -static inline struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; - unsigned tail; + unsigned tail, mask = ctx->cq_entries - 1; /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ - if (__io_cqring_events(ctx) == rings->cq_ring_entries) + if (__io_cqring_events(ctx) == ctx->cq_entries) return NULL; tail = ctx->cached_cq_tail++; - return &rings->cqes[tail & ctx->cq_mask]; + return &rings->cqes[tail & mask]; } static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) @@ -1399,14 +1412,14 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) /* see waitqueue_active() comment */ smp_mb(); - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + if (waitqueue_active(&ctx->cq_wait)) + wake_up(&ctx->cq_wait); if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); + if (waitqueue_active(&ctx->poll_wait)) { + wake_up_interruptible(&ctx->poll_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } } @@ -1417,13 +1430,13 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) smp_mb(); if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + if (waitqueue_active(&ctx->cq_wait)) + wake_up(&ctx->cq_wait); } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->cq_wait)) { - wake_up_interruptible(&ctx->cq_wait); + if (waitqueue_active(&ctx->poll_wait)) { + wake_up_interruptible(&ctx->poll_wait); kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); } } @@ -1431,17 +1444,16 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { - struct io_rings *rings = ctx->rings; unsigned long flags; bool all_flushed, posted; - if (!force && __io_cqring_events(ctx) == rings->cq_ring_entries) + if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; posted = false; spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&ctx->cq_overflow_list)) { - struct io_uring_cqe *cqe = io_get_cqring(ctx); + struct io_uring_cqe *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; if (!cqe && !force) @@ -1451,8 +1463,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (cqe) memcpy(cqe, &ocqe->cqe, sizeof(*cqe)); else - WRITE_ONCE(ctx->rings->cq_overflow, - ++ctx->cached_cq_overflow); + io_account_cq_overflow(ctx); + posted = true; list_del(&ocqe->list); kfree(ocqe); @@ -1460,8 +1472,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { - clear_bit(0, &ctx->sq_check_overflow); - clear_bit(0, &ctx->cq_check_overflow); + clear_bit(0, &ctx->check_cq_overflow); ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } @@ -1477,7 +1488,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool ret = true; - if (test_bit(0, &ctx->cq_check_overflow)) { + if (test_bit(0, &ctx->check_cq_overflow)) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); @@ -1536,12 +1547,11 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, * or cannot allocate an overflow entry, then we need to drop it * on the floor. */ - WRITE_ONCE(ctx->rings->cq_overflow, ++ctx->cached_cq_overflow); + io_account_cq_overflow(ctx); return false; } if (list_empty(&ctx->cq_overflow_list)) { - set_bit(0, &ctx->sq_check_overflow); - set_bit(0, &ctx->cq_check_overflow); + set_bit(0, &ctx->check_cq_overflow); ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; } ocqe->cqe.user_data = user_data; @@ -1563,7 +1573,7 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data * submission (by quite a lot). Increment the overflow count in * the ring. */ - cqe = io_get_cqring(ctx); + cqe = io_get_cqe(ctx); if (likely(cqe)) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); @@ -1593,10 +1603,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, * free_list cache. */ if (req_ref_put_and_test(req)) { - struct io_comp_state *cs = &ctx->submit_state.comp; - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) io_disarm_next(req); if (req->link) { io_req_task_queue(req->link); @@ -1605,8 +1613,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, } io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->compl.list, &cs->locked_free_list); - cs->locked_free_nr++; + list_add(&req->compl.list, &ctx->locked_free_list); + ctx->locked_free_nr++; } else { if (!percpu_ref_tryget(&ctx->refs)) req = NULL; @@ -1622,8 +1630,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, static inline bool io_req_needs_clean(struct io_kiocb *req) { - return req->flags & (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | - REQ_F_POLLED | REQ_F_INFLIGHT); + return req->flags & IO_REQ_CLEAN_FLAGS; } static void io_req_complete_state(struct io_kiocb *req, long res, @@ -1652,7 +1659,7 @@ static inline void io_req_complete(struct io_kiocb *req, long res) static void io_req_complete_failed(struct io_kiocb *req, long res) { - req_set_fail_links(req); + req_set_fail(req); io_put_req(req); io_req_complete_post(req, res, 0); } @@ -1661,8 +1668,8 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_comp_state *cs) { spin_lock_irq(&ctx->completion_lock); - list_splice_init(&cs->locked_free_list, &cs->free_list); - cs->locked_free_nr = 0; + list_splice_init(&ctx->locked_free_list, &cs->free_list); + ctx->locked_free_nr = 0; spin_unlock_irq(&ctx->completion_lock); } @@ -1678,7 +1685,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * locked cache, grab the lock and move them over to our submission * side cache. */ - if (READ_ONCE(cs->locked_free_nr) > IO_COMPL_BATCH) + if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) io_flush_cached_locked_reqs(ctx, cs); nr = state->free_reqs; @@ -1700,11 +1707,11 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - BUILD_BUG_ON(IO_REQ_ALLOC_BATCH > ARRAY_SIZE(state->reqs)); + BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); if (!state->free_reqs) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - int ret; + int ret, i; if (io_flush_cached_reqs(ctx)) goto got_req; @@ -1722,6 +1729,20 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) return NULL; ret = 1; } + + /* + * Don't initialise the fields below on every allocation, but + * do that in advance and keep valid on free. + */ + for (i = 0; i < ret; i++) { + struct io_kiocb *req = state->reqs[i]; + + req->ctx = ctx; + req->link = NULL; + req->async_data = NULL; + /* not necessary, but safer to zero */ + req->result = 0; + } state->free_reqs = ret; } got_req: @@ -1745,11 +1766,9 @@ static void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); if (req->fixed_rsrc_refs) percpu_ref_put(req->fixed_rsrc_refs); - if (req->async_data) + if (req->async_data) { kfree(req->async_data); - if (req->work.creds) { - put_cred(req->work.creds); - req->work.creds = NULL; + req->async_data = NULL; } } @@ -1831,7 +1850,7 @@ static bool io_disarm_next(struct io_kiocb *req) if (likely(req->flags & REQ_F_LINK_TIMEOUT)) posted = io_kill_linked_timeout(req); - if (unlikely((req->flags & REQ_F_FAIL_LINK) && + if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { posted |= (req->link != NULL); io_fail_links(req); @@ -1849,7 +1868,7 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL_LINK)) { + if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) { struct io_ring_ctx *ctx = req->ctx; unsigned long flags; bool posted; @@ -1880,54 +1899,51 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx) return; if (ctx->submit_state.comp.nr) { mutex_lock(&ctx->uring_lock); - io_submit_flush_completions(&ctx->submit_state.comp, ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); } percpu_ref_put(&ctx->refs); } -static bool __tctx_task_work(struct io_uring_task *tctx) +static void tctx_task_work(struct callback_head *cb) { struct io_ring_ctx *ctx = NULL; - struct io_wq_work_list list; - struct io_wq_work_node *node; - - if (wq_list_empty(&tctx->task_list)) - return false; - - spin_lock_irq(&tctx->task_lock); - list = tctx->task_list; - INIT_WQ_LIST(&tctx->task_list); - spin_unlock_irq(&tctx->task_lock); - - node = list.first; - while (node) { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req; + struct io_uring_task *tctx = container_of(cb, struct io_uring_task, + task_work); - req = container_of(node, struct io_kiocb, io_task_work.node); - if (req->ctx != ctx) { - ctx_flush_and_put(ctx); - ctx = req->ctx; - percpu_ref_get(&ctx->refs); + while (1) { + struct io_wq_work_node *node; + + spin_lock_irq(&tctx->task_lock); + node = tctx->task_list.first; + INIT_WQ_LIST(&tctx->task_list); + spin_unlock_irq(&tctx->task_lock); + + while (node) { + struct io_wq_work_node *next = node->next; + struct io_kiocb *req = container_of(node, struct io_kiocb, + io_task_work.node); + + if (req->ctx != ctx) { + ctx_flush_and_put(ctx); + ctx = req->ctx; + percpu_ref_get(&ctx->refs); + } + req->task_work.func(&req->task_work); + node = next; } - - req->task_work.func(&req->task_work); - node = next; + if (wq_list_empty(&tctx->task_list)) { + clear_bit(0, &tctx->task_state); + if (wq_list_empty(&tctx->task_list)) + break; + /* another tctx_task_work() is enqueued, yield */ + if (test_and_set_bit(0, &tctx->task_state)) + break; + } + cond_resched(); } ctx_flush_and_put(ctx); - return list.first != NULL; -} - -static void tctx_task_work(struct callback_head *cb) -{ - struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); - - clear_bit(0, &tctx->task_state); - - while (__tctx_task_work(tctx)) - cond_resched(); } static int io_req_task_work_add(struct io_kiocb *req) @@ -2128,26 +2144,26 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, list_add(&req->compl.list, &state->comp.free_list); } -static void io_submit_flush_completions(struct io_comp_state *cs, - struct io_ring_ctx *ctx) +static void io_submit_flush_completions(struct io_ring_ctx *ctx) { + struct io_comp_state *cs = &ctx->submit_state.comp; int i, nr = cs->nr; - struct io_kiocb *req; struct req_batch rb; - io_init_req_batch(&rb); spin_lock_irq(&ctx->completion_lock); for (i = 0; i < nr; i++) { - req = cs->reqs[i]; + struct io_kiocb *req = cs->reqs[i]; + __io_cqring_fill_event(ctx, req->user_data, req->result, req->compl.cflags); } io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + + io_init_req_batch(&rb); for (i = 0; i < nr; i++) { - req = cs->reqs[i]; + struct io_kiocb *req = cs->reqs[i]; /* submission and completion refs */ if (req_ref_sub_and_test(req, 2)) @@ -2235,12 +2251,6 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) static inline bool io_run_task_work(void) { - /* - * Not safe to run on exiting task, and the task_work handling will - * not add work to such a task. - */ - if (unlikely(current->flags & PF_EXITING)) - return false; if (current->task_works) { __set_current_state(TASK_RUNNING); task_work_run(); @@ -2304,7 +2314,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * Only spin for completions if we don't have multiple devices hanging * off our complete list, and we're under the requested amount. */ - spin = !ctx->poll_multi_file && *nr_events < min; + spin = !ctx->poll_multi_queue && *nr_events < min; ret = 0; list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { @@ -2389,7 +2399,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - if (test_bit(0, &ctx->cq_check_overflow)) + if (test_bit(0, &ctx->check_cq_overflow)) __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) goto out; @@ -2488,7 +2498,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, req->flags |= REQ_F_REISSUE; return; } - req_set_fail_links(req); + req_set_fail(req); } if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); @@ -2511,7 +2521,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (unlikely(res != req->result)) { if (!(res == -EAGAIN && io_rw_should_reissue(req) && io_resubmit_prep(req))) { - req_set_fail_links(req); + req_set_fail(req); req->flags |= REQ_F_DONT_REISSUE; } } @@ -2528,9 +2538,14 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ -static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) +static void io_iopoll_req_issued(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + const bool in_async = io_wq_current_is_worker(); + + /* workqueue context doesn't hold uring_lock, grab it now */ + if (unlikely(in_async)) + mutex_lock(&ctx->uring_lock); /* * Track whether we have multiple files in our lists. This will impact @@ -2538,14 +2553,22 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) * different devices. */ if (list_empty(&ctx->iopoll_list)) { - ctx->poll_multi_file = false; - } else if (!ctx->poll_multi_file) { + ctx->poll_multi_queue = false; + } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; + unsigned int queue_num0, queue_num1; list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, inflight_entry); - if (list_req->file != req->file) - ctx->poll_multi_file = true; + + if (list_req->file != req->file) { + ctx->poll_multi_queue = true; + } else { + queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie); + queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie); + if (queue_num0 != queue_num1) + ctx->poll_multi_queue = true; + } } /* @@ -2557,14 +2580,19 @@ static void io_iopoll_req_issued(struct io_kiocb *req, bool in_async) else list_add_tail(&req->inflight_entry, &ctx->iopoll_list); - /* - * If IORING_SETUP_SQPOLL is enabled, sqes are either handled in sq thread - * task context or in io worker task context. If current task context is - * sq thread, we don't need to check whether should wake up sq thread. - */ - if (in_async && (ctx->flags & IORING_SETUP_SQPOLL) && - wq_has_sleeper(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); + if (unlikely(in_async)) { + /* + * If IORING_SETUP_SQPOLL is enabled, sqes are either handle + * in sq thread task context or in io worker task context. If + * current task context is sq thread, we don't need to check + * whether should wake up sq thread. + */ + if ((ctx->flags & IORING_SETUP_SQPOLL) && + wq_has_sleeper(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); + + mutex_unlock(&ctx->uring_lock); + } } static inline void io_state_file_put(struct io_submit_state *state) @@ -2621,7 +2649,7 @@ static bool __io_file_supports_async(struct file *file, int rw) return true; return false; } - if (S_ISCHR(mode) || S_ISSOCK(mode)) + if (S_ISSOCK(mode)) return true; if (S_ISREG(mode)) { if (IS_ENABLED(CONFIG_BLOCK) && @@ -2754,12 +2782,12 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; - if (ret >= 0 && kiocb->ki_complete == io_complete_rw) + if (ret >= 0 && check_reissue) __io_complete_rw(req, ret, 0, issue_flags); else io_rw_done(kiocb, ret); - if (check_reissue && req->flags & REQ_F_REISSUE) { + if (check_reissue && (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) { req_ref_get(req); @@ -2767,7 +2795,7 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, } else { int cflags = 0; - req_set_fail_links(req); + req_set_fail(req); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); __io_req_complete(req, issue_flags, ret, cflags); @@ -3238,7 +3266,7 @@ static bool io_rw_should_retry(struct io_kiocb *req) return true; } -static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) { if (req->file->f_op->read_iter) return call_read_iter(req->file, &req->rw.kiocb, iter); @@ -3453,6 +3481,10 @@ static int io_renameat_prep(struct io_kiocb *req, struct io_rename *ren = &req->rename; const char __user *oldf, *newf; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->buf_index) + return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3489,7 +3521,7 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -3500,6 +3532,10 @@ static int io_unlinkat_prep(struct io_kiocb *req, struct io_unlink *un = &req->unlink; const char __user *fname; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3533,7 +3569,7 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -3570,7 +3606,7 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_shutdown_sock(sock, req->shutdown.how); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -3581,7 +3617,7 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice* sp = &req->splice; + struct io_splice *sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -3628,14 +3664,14 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret != sp->len) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice* sp = &req->splice; + struct io_splice *sp = &req->splice; sp->off_in = READ_ONCE(sqe->splice_off_in); sp->off_out = READ_ONCE(sqe->off); @@ -3665,7 +3701,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret != sp->len) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -3718,7 +3754,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) end > 0 ? end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -3747,7 +3783,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, req->sync.len); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -3841,32 +3877,31 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) goto err; file = do_filp_open(req->open.dfd, req->open.filename, &op); - /* only retry if RESOLVE_CACHED wasn't already set by application */ - if ((!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)) && - file == ERR_PTR(-EAGAIN)) { + if (IS_ERR(file)) { /* - * We could hang on to this 'fd', but seems like marginal - * gain for something that is now known to be a slower path. - * So just put it, and we'll get a new one when we retry. + * We could hang on to this 'fd' on retrying, but seems like + * marginal gain for something that is now known to be a slower + * path. So just put it, and we'll get a new one when we retry. */ put_unused_fd(ret); - return -EAGAIN; - } - if (IS_ERR(file)) { - put_unused_fd(ret); ret = PTR_ERR(file); - } else { - if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) - file->f_flags &= ~O_NONBLOCK; - fsnotify_open(file); - fd_install(ret, file); + /* only retry if RESOLVE_CACHED wasn't already set by application */ + if (ret == -EAGAIN && + (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) + return -EAGAIN; + goto err; } + + if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) + file->f_flags &= ~O_NONBLOCK; + fsnotify_open(file); + fd_install(ret, file); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -3938,7 +3973,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (head) ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); @@ -4029,7 +4064,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) __io_remove_buffers(ctx, head, p->bgid, -1U); } if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); io_ring_submit_unlock(ctx, !force_nonblock); @@ -4075,7 +4110,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; #else @@ -4111,7 +4146,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; #else @@ -4150,7 +4185,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4185,7 +4220,7 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) ctx->buffer); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4243,7 +4278,7 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) ret = filp_close(file, current->files); err: if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); if (file) fput(file); __io_req_complete(req, issue_flags, ret, 0); @@ -4276,7 +4311,7 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -4380,7 +4415,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < min_ret) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4422,7 +4457,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) ret = -EINTR; if (ret < min_ret) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -4617,7 +4652,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, cflags); return 0; } @@ -4672,7 +4707,7 @@ out_free: if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_recv_kbuf(req); if (ret < min_ret || ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, cflags); return 0; } @@ -4711,7 +4746,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) { if (ret == -ERESTARTSYS) ret = -EINTR; - req_set_fail_links(req); + req_set_fail(req); } __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -4775,7 +4810,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = -EINTR; out: if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5064,7 +5099,7 @@ static void io_async_task_func(struct callback_head *cb) struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; - trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); + trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); if (io_poll_rewait(req, &apoll->poll)) { spin_unlock_irq(&ctx->completion_lock); @@ -5143,50 +5178,51 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, return mask; } -static bool io_arm_poll_handler(struct io_kiocb *req) +enum { + IO_APOLL_OK, + IO_APOLL_ABORTED, + IO_APOLL_READY +}; + +static int io_arm_poll_handler(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask, ret; + __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; int rw; if (!req->file || !file_can_poll(req->file)) - return false; + return IO_APOLL_ABORTED; if (req->flags & REQ_F_POLLED) - return false; - if (def->pollin) + return IO_APOLL_ABORTED; + if (!def->pollin && !def->pollout) + return IO_APOLL_ABORTED; + + if (def->pollin) { rw = READ; - else if (def->pollout) + mask |= POLLIN | POLLRDNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if ((req->opcode == IORING_OP_RECVMSG) && + (req->sr_msg.msg_flags & MSG_ERRQUEUE)) + mask &= ~POLLIN; + } else { rw = WRITE; - else - return false; + mask |= POLLOUT | POLLWRNORM; + } + /* if we can't nonblock try, then no point in arming a poll handler */ if (!io_file_supports_async(req, rw)) - return false; + return IO_APOLL_ABORTED; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) - return false; + return IO_APOLL_ABORTED; apoll->double_poll = NULL; - - req->flags |= REQ_F_POLLED; req->apoll = apoll; - - mask = EPOLLONESHOT; - if (def->pollin) - mask |= POLLIN | POLLRDNORM; - if (def->pollout) - mask |= POLLOUT | POLLWRNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if ((req->opcode == IORING_OP_RECVMSG) && - (req->sr_msg.msg_flags & MSG_ERRQUEUE)) - mask &= ~POLLIN; - - mask |= POLLERR | POLLPRI; - + req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, @@ -5194,12 +5230,14 @@ static bool io_arm_poll_handler(struct io_kiocb *req) if (ret || ipt.error) { io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); - return false; + if (ret) + return IO_APOLL_READY; + return IO_APOLL_ABORTED; } spin_unlock_irq(&ctx->completion_lock); - trace_io_uring_poll_arm(ctx, req->opcode, req->user_data, mask, - apoll->poll.events); - return true; + trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, + mask, apoll->poll.events); + return IO_APOLL_OK; } static bool __io_poll_remove_one(struct io_kiocb *req, @@ -5246,7 +5284,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) if (do_complete) { io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); - req_set_fail_links(req); + req_set_fail(req); io_put_req_deferred(req, 1); } @@ -5257,7 +5295,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) * Returns true if we found and killed one or more poll requests */ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - struct files_struct *files) + bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5269,7 +5307,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task(req, tsk, files)) + if (io_match_task(req, tsk, cancel_all)) posted += io_poll_remove_one(req); } } @@ -5456,7 +5494,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) err: if (ret < 0) { spin_unlock_irq(&ctx->completion_lock); - req_set_fail_links(req); + req_set_fail(req); io_req_complete(req, ret); return 0; } @@ -5476,7 +5514,7 @@ err: if (!completing) { ret = io_poll_add(preq, issue_flags); if (ret < 0) { - req_set_fail_links(preq); + req_set_fail(preq); io_req_complete(preq, ret); } } @@ -5501,7 +5539,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - req_set_fail_links(req); + req_set_fail(req); io_put_req(req); return HRTIMER_NORESTART; } @@ -5537,7 +5575,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (IS_ERR(req)) return PTR_ERR(req); - req_set_fail_links(req); + req_set_fail(req); io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); io_put_req_deferred(req, 1); return 0; @@ -5616,7 +5654,7 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_put_req(req); return 0; } @@ -5639,6 +5677,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; req->timeout.off = off; + if (unlikely(off && !req->ctx->off_timeout_used)) + req->ctx->off_timeout_used = true; if (!req->async_data && io_alloc_async_data(req)) return -ENOMEM; @@ -5769,7 +5809,7 @@ done: io_cqring_ev_posted(ctx); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5826,7 +5866,7 @@ done: io_cqring_ev_posted(ctx); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); io_put_req(req); return 0; } @@ -5868,7 +5908,7 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) mutex_unlock(&ctx->uring_lock); if (ret < 0) - req_set_fail_links(req); + req_set_fail(req); __io_req_complete(req, issue_flags, ret, 0); return 0; } @@ -5979,48 +6019,69 @@ static int io_req_prep_async(struct io_kiocb *req) static u32 io_get_sequence(struct io_kiocb *req) { - struct io_kiocb *pos; - struct io_ring_ctx *ctx = req->ctx; - u32 total_submitted, nr_reqs = 0; - - io_for_each_link(pos, req) - nr_reqs++; + u32 seq = req->ctx->cached_sq_head; - total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped; - return total_submitted - nr_reqs; + /* need original cached_sq_head, but it was increased for each req */ + io_for_each_link(req, req) + seq--; + return seq; } -static int io_req_defer(struct io_kiocb *req) +static bool io_drain_req(struct io_kiocb *req) { + struct io_kiocb *pos; struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; u32 seq; + /* + * If we need to drain a request in the middle of a link, drain the + * head request and the next request/link after the current link. + * Considering sequential execution of links, IOSQE_IO_DRAIN will be + * maintained for every request of our link. + */ + if (ctx->drain_next) { + req->flags |= REQ_F_IO_DRAIN; + ctx->drain_next = false; + } + /* not interested in head, start from the first linked */ + io_for_each_link(pos, req->link) { + if (pos->flags & REQ_F_IO_DRAIN) { + ctx->drain_next = true; + req->flags |= REQ_F_IO_DRAIN; + break; + } + } + /* Still need defer if there is pending req in defer list. */ if (likely(list_empty_careful(&ctx->defer_list) && - !(req->flags & REQ_F_IO_DRAIN))) - return 0; + !(req->flags & REQ_F_IO_DRAIN))) { + ctx->drain_active = false; + return false; + } seq = io_get_sequence(req); /* Still a chance to pass the sequence check */ if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) - return 0; + return false; ret = io_req_prep_async(req); if (ret) return ret; io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); - if (!de) - return -ENOMEM; + if (!de) { + io_req_complete_failed(req, ret); + return true; + } spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); kfree(de); io_queue_async_work(req); - return -EIOCBQUEUED; + return true; } trace_io_uring_defer(ctx, req, req->user_data); @@ -6028,7 +6089,7 @@ static int io_req_defer(struct io_kiocb *req) de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); - return -EIOCBQUEUED; + return true; } static void io_clean_op(struct io_kiocb *req) @@ -6045,7 +6106,6 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->sr_msg.kbuf); break; } - req->flags &= ~REQ_F_BUFFER_SELECTED; } if (req->flags & REQ_F_NEED_CLEANUP) { @@ -6057,8 +6117,8 @@ static void io_clean_op(struct io_kiocb *req) case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: { struct io_async_rw *io = req->async_data; - if (io->free_iovec) - kfree(io->free_iovec); + + kfree(io->free_iovec); break; } case IORING_OP_RECVMSG: @@ -6086,7 +6146,6 @@ static void io_clean_op(struct io_kiocb *req) putname(req->unlink.filename); break; } - req->flags &= ~REQ_F_NEED_CLEANUP; } if ((req->flags & REQ_F_POLLED) && req->apoll) { kfree(req->apoll->double_poll); @@ -6097,8 +6156,11 @@ static void io_clean_op(struct io_kiocb *req) struct io_uring_task *tctx = req->task->io_uring; atomic_dec(&tctx->inflight_tracked); - req->flags &= ~REQ_F_INFLIGHT; } + if (req->flags & REQ_F_CREDS) + put_cred(req->creds); + + req->flags &= ~IO_REQ_CLEAN_FLAGS; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) @@ -6107,8 +6169,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) const struct cred *creds = NULL; int ret; - if (req->work.creds && req->work.creds != current_cred()) - creds = override_creds(req->work.creds); + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + creds = override_creds(req->creds); switch (req->opcode) { case IORING_OP_NOP: @@ -6218,23 +6280,11 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); - if (ret) return ret; - /* If the op doesn't have a file, we're not polling for it */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) { - const bool in_async = io_wq_current_is_worker(); - - /* workqueue context doesn't hold uring_lock, grab it now */ - if (in_async) - mutex_lock(&ctx->uring_lock); - - io_iopoll_req_issued(req, in_async); - - if (in_async) - mutex_unlock(&ctx->uring_lock); - } + if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) + io_iopoll_req_issued(req); return 0; } @@ -6416,6 +6466,7 @@ static void __io_queue_sqe(struct io_kiocb *req) struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); int ret; +issue_sqe: ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); /* @@ -6430,17 +6481,21 @@ static void __io_queue_sqe(struct io_kiocb *req) cs->reqs[cs->nr++] = req; if (cs->nr == ARRAY_SIZE(cs->reqs)) - io_submit_flush_completions(cs, ctx); + io_submit_flush_completions(ctx); } else { io_put_req(req); } } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - if (!io_arm_poll_handler(req)) { + switch (io_arm_poll_handler(req)) { + case IO_APOLL_READY: + goto issue_sqe; + case IO_APOLL_ABORTED: /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ io_queue_async_work(req); + break; } } else { io_req_complete_failed(req, ret); @@ -6449,23 +6504,20 @@ static void __io_queue_sqe(struct io_kiocb *req) io_queue_linked_timeout(linked_timeout); } -static void io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req) { - int ret; + if (unlikely(req->ctx->drain_active) && io_drain_req(req)) + return; - ret = io_req_defer(req); - if (ret) { - if (ret != -EIOCBQUEUED) { -fail_req: - io_req_complete_failed(req, ret); - } - } else if (req->flags & REQ_F_FORCE_ASYNC) { - ret = io_req_prep_async(req); - if (unlikely(ret)) - goto fail_req; - io_queue_async_work(req); - } else { + if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) { __io_queue_sqe(req); + } else { + int ret = io_req_prep_async(req); + + if (unlikely(ret)) + io_req_complete_failed(req, ret); + else + io_queue_async_work(req); } } @@ -6478,7 +6530,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (!ctx->restricted) + if (likely(!ctx->restricted)) return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) @@ -6506,35 +6558,33 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); req->user_data = READ_ONCE(sqe->user_data); - req->async_data = NULL; req->file = NULL; - req->ctx = ctx; - req->link = NULL; req->fixed_rsrc_refs = NULL; /* one is dropped after submission, the other at completion */ atomic_set(&req->refs, 2); req->task = current; - req->result = 0; - req->work.creds = NULL; /* enforce forwards compatibility on users */ if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) return -EINVAL; if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) + if (!io_check_restriction(ctx, req, sqe_flags)) return -EACCES; if ((sqe_flags & IOSQE_BUFFER_SELECT) && !io_op_defs[req->opcode].buffer_select) return -EOPNOTSUPP; + if (unlikely(sqe_flags & IOSQE_IO_DRAIN)) + ctx->drain_active = true; personality = READ_ONCE(sqe->personality); if (personality) { - req->work.creds = xa_load(&ctx->personalities, personality); - if (!req->work.creds) + req->creds = xa_load(&ctx->personalities, personality); + if (!req->creds) return -EINVAL; - get_cred(req->work.creds); + get_cred(req->creds); + req->flags |= REQ_F_CREDS; } state = &ctx->submit_state; @@ -6571,20 +6621,22 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, fail_req: if (link->head) { /* fail even hard links since we don't submit */ - link->head->flags |= REQ_F_FAIL_LINK; + req_set_fail(link->head); io_req_complete_failed(link->head, -ECANCELED); link->head = NULL; } io_req_complete_failed(req, ret); return ret; } + ret = io_req_prep(req, sqe); if (unlikely(ret)) goto fail_req; /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, - true, ctx->flags & IORING_SETUP_SQPOLL); + trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, + req->flags, true, + ctx->flags & IORING_SETUP_SQPOLL); /* * If we already have a head request, queue this one for async @@ -6596,17 +6648,6 @@ fail_req: if (link->head) { struct io_kiocb *head = link->head; - /* - * Taking sequential execution of a link, draining both sides - * of the link also fullfils IOSQE_IO_DRAIN semantics for all - * requests in the link. So, it drains the head and the - * next after the link request. The last one is done via - * drain_next flag to persist the effect across calls. - */ - if (req->flags & REQ_F_IO_DRAIN) { - head->flags |= REQ_F_IO_DRAIN; - ctx->drain_next = 1; - } ret = io_req_prep_async(req); if (unlikely(ret)) goto fail_req; @@ -6616,14 +6657,10 @@ fail_req: /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_sqe(head); link->head = NULL; + io_queue_sqe(head); } } else { - if (unlikely(ctx->drain_next)) { - req->flags |= REQ_F_IO_DRAIN; - ctx->drain_next = 0; - } if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { link->head = req; link->last = req; @@ -6644,7 +6681,7 @@ static void io_submit_state_end(struct io_submit_state *state, if (state->link.head) io_queue_sqe(state->link.head); if (state->comp.nr) - io_submit_flush_completions(&state->comp, ctx); + io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); io_state_file_put(state); @@ -6675,7 +6712,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) } /* - * Fetch an sqe, if one is available. Note that sqe_ptr will point to memory + * Fetch an sqe, if one is available. Note this returns a pointer to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later @@ -6684,8 +6721,8 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) */ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) { - u32 *sq_array = ctx->sq_array; - unsigned head; + unsigned head, mask = ctx->sq_entries - 1; + unsigned sq_idx = ctx->cached_sq_head++ & mask; /* * The cached sq head (or cq tail) serves two purposes: @@ -6695,28 +6732,36 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. */ - head = READ_ONCE(sq_array[ctx->cached_sq_head++ & ctx->sq_mask]); + head = READ_ONCE(ctx->sq_array[sq_idx]); if (likely(head < ctx->sq_entries)) return &ctx->sq_sqes[head]; /* drop invalid entries */ - ctx->cached_sq_dropped++; - WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); + ctx->cq_extra--; + WRITE_ONCE(ctx->rings->sq_dropped, + READ_ONCE(ctx->rings->sq_dropped) + 1); return NULL; } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { + struct io_uring_task *tctx; int submitted = 0; /* make sure SQ entry isn't read before tail */ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); - if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; - percpu_counter_add(¤t->io_uring->inflight, nr); - refcount_add(nr, ¤t->usage); + tctx = current->io_uring; + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) { + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; + + percpu_counter_add(&tctx->inflight, refill); + refcount_add(refill, ¤t->usage); + tctx->cached_refs += refill; + } io_submit_state_start(&ctx->submit_state, nr); while (submitted < nr) { @@ -6742,12 +6787,10 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (unlikely(submitted != nr)) { int ref_used = (submitted == -EAGAIN) ? 0 : submitted; - struct io_uring_task *tctx = current->io_uring; int unused = nr - ref_used; + current->io_uring->cached_refs += unused; percpu_ref_put_many(&ctx->refs, unused); - percpu_counter_sub(&tctx->inflight, unused); - put_task_struct_many(current, unused); } io_submit_state_end(&ctx->submit_state, ctx); @@ -6757,6 +6800,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return submitted; } +static inline bool io_sqd_events_pending(struct io_sq_data *sqd) +{ + return READ_ONCE(sqd->state); +} + static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) { /* Tell userspace we may need a wakeup call */ @@ -6779,11 +6827,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) to_submit = io_sqring_entries(ctx); /* if we're handling multiple rings, cap submit size for fairness */ - if (cap_entries && to_submit > 8) - to_submit = 8; + if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) + to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; if (!list_empty(&ctx->iopoll_list) || to_submit) { unsigned nr_events = 0; + const struct cred *creds = NULL; + + if (ctx->sq_creds != current_cred()) + creds = override_creds(ctx->sq_creds); mutex_lock(&ctx->uring_lock); if (!list_empty(&ctx->iopoll_list)) @@ -6797,10 +6849,12 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) !(ctx->flags & IORING_SETUP_R_DISABLED)) ret = io_submit_sqes(ctx, to_submit); mutex_unlock(&ctx->uring_lock); - } - if (!io_sqring_full(ctx) && wq_has_sleeper(&ctx->sqo_sq_wait)) - wake_up(&ctx->sqo_sq_wait); + if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); + if (creds) + revert_creds(creds); + } return ret; } @@ -6815,6 +6869,22 @@ static void io_sqd_update_thread_idle(struct io_sq_data *sqd) sqd->sq_thread_idle = sq_thread_idle; } +static bool io_sqd_handle_event(struct io_sq_data *sqd) +{ + bool did_sig = false; + struct ksignal ksig; + + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || + signal_pending(current)) { + mutex_unlock(&sqd->lock); + if (signal_pending(current)) + did_sig = get_signal(&ksig); + cond_resched(); + mutex_lock(&sqd->lock); + } + return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); +} + static int io_sq_thread(void *data) { struct io_sq_data *sqd = data; @@ -6833,48 +6903,26 @@ static int io_sq_thread(void *data) current->flags |= PF_NO_SETAFFINITY; mutex_lock(&sqd->lock); - /* a user may had exited before the thread started */ - io_run_task_work_head(&sqd->park_task_work); - - while (!test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) { - int ret; - bool cap_entries, sqt_spin, needs_sched; - - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || - signal_pending(current)) { - bool did_sig = false; - - mutex_unlock(&sqd->lock); - if (signal_pending(current)) { - struct ksignal ksig; + while (1) { + bool cap_entries, sqt_spin = false; - did_sig = get_signal(&ksig); - } - cond_resched(); - mutex_lock(&sqd->lock); - io_run_task_work(); - io_run_task_work_head(&sqd->park_task_work); - if (did_sig) + if (io_sqd_events_pending(sqd) || signal_pending(current)) { + if (io_sqd_handle_event(sqd)) break; timeout = jiffies + sqd->sq_thread_idle; - continue; } - sqt_spin = false; + cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - const struct cred *creds = NULL; + int ret = __io_sq_thread(ctx, cap_entries); - if (ctx->sq_creds != current_cred()) - creds = override_creds(ctx->sq_creds); - ret = __io_sq_thread(ctx, cap_entries); - if (creds) - revert_creds(creds); if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; } + if (io_run_task_work()) + sqt_spin = true; if (sqt_spin || !time_after(jiffies, timeout)) { - io_run_task_work(); cond_resched(); if (sqt_spin) timeout = jiffies + sqd->sq_thread_idle; @@ -6882,12 +6930,12 @@ static int io_sq_thread(void *data) } prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) { - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_ring_set_wakeup_flag(ctx); + if (!io_sqd_events_pending(sqd) && !current->task_works) { + bool needs_sched = true; - needs_sched = true; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + io_ring_set_wakeup_flag(ctx); + if ((ctx->flags & IORING_SETUP_IOPOLL) && !list_empty_careful(&ctx->iopoll_list)) { needs_sched = false; @@ -6909,16 +6957,14 @@ static int io_sq_thread(void *data) } finish_wait(&sqd->wait, &wait); - io_run_task_work_head(&sqd->park_task_work); timeout = jiffies + sqd->sq_thread_idle; } - io_uring_cancel_sqpoll(sqd); + io_uring_cancel_generic(true, sqd); sqd->thread = NULL; list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); io_run_task_work(); - io_run_task_work_head(&sqd->park_task_work); mutex_unlock(&sqd->lock); complete(&sqd->exited); @@ -6955,7 +7001,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. */ - if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->cq_check_overflow)) + if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } @@ -6983,7 +7029,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (ret || io_should_wake(iowq)) return ret; /* let the caller flush overflows, retry */ - if (test_bit(0, &ctx->cq_check_overflow)) + if (test_bit(0, &ctx->check_cq_overflow)) return 1; *timeout = schedule_timeout(*timeout); @@ -7048,10 +7094,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = -EBUSY; break; } - prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); - finish_wait(&ctx->wait, &iowq.wq); + finish_wait(&ctx->cq_wait, &iowq.wq); cond_resched(); } while (ret > 0); @@ -7060,14 +7106,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) +static void io_free_page_table(void **table, size_t size) { - unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); for (i = 0; i < nr_tables; i++) - kfree(table->files[i]); - kfree(table->files); - table->files = NULL; + kfree(table[i]); + kfree(table); +} + +static void **io_alloc_page_table(size_t size) +{ + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + size_t init_size = size; + void **table; + + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL); + if (!table) + return NULL; + + for (i = 0; i < nr_tables; i++) { + unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + + table[i] = kzalloc(this_size, GFP_KERNEL); + if (!table[i]) { + io_free_page_table(table, init_size); + return NULL; + } + size -= this_size; + } + return table; } static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) @@ -7156,33 +7224,77 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct return ret; } +static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +{ + unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; + unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; + + return &data->tags[table_idx][off]; +} + static void io_rsrc_data_free(struct io_rsrc_data *data) { - kvfree(data->tags); + size_t size = data->nr * sizeof(data->tags[0][0]); + + if (data->tags) + io_free_page_table((void **)data->tags, size); kfree(data); } -static struct io_rsrc_data *io_rsrc_data_alloc(struct io_ring_ctx *ctx, - rsrc_put_fn *do_put, - unsigned nr) +static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, + u64 __user *utags, unsigned nr, + struct io_rsrc_data **pdata) { struct io_rsrc_data *data; + int ret = -ENOMEM; + unsigned i; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) - return NULL; - - data->tags = kvcalloc(nr, sizeof(*data->tags), GFP_KERNEL); + return -ENOMEM; + data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); if (!data->tags) { kfree(data); - return NULL; + return -ENOMEM; } - atomic_set(&data->refs, 1); + data->nr = nr; data->ctx = ctx; data->do_put = do_put; + if (utags) { + ret = -EFAULT; + for (i = 0; i < nr; i++) { + u64 *tag_slot = io_get_tag_slot(data, i); + + if (copy_from_user(tag_slot, &utags[i], + sizeof(*tag_slot))) + goto fail; + } + } + + atomic_set(&data->refs, 1); init_completion(&data->done); - return data; + *pdata = data; + return 0; +fail: + io_rsrc_data_free(data); + return ret; +} + +static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) +{ + size_t size = nr_files * sizeof(struct io_fixed_file); + + table->files = (struct io_fixed_file **)io_alloc_page_table(size); + return !!table->files; +} + +static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) +{ + size_t size = nr_files * sizeof(struct io_fixed_file); + + io_free_page_table((void **)table->files, size); + table->files = NULL; } static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -7446,31 +7558,6 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx) } #endif -static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) -{ - unsigned i, nr_tables = DIV_ROUND_UP(nr_files, IORING_MAX_FILES_TABLE); - - table->files = kcalloc(nr_tables, sizeof(*table->files), GFP_KERNEL); - if (!table->files) - return false; - - for (i = 0; i < nr_tables; i++) { - unsigned int this_files = min(nr_files, IORING_MAX_FILES_TABLE); - - table->files[i] = kcalloc(this_files, sizeof(*table->files[i]), - GFP_KERNEL); - if (!table->files[i]) - break; - nr_files -= this_files; - } - - if (i == nr_tables) - return true; - - io_free_file_tables(table, nr_tables * IORING_MAX_FILES_TABLE); - return false; -} - static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) { struct file *file = prsrc->file; @@ -7545,14 +7632,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) if (prsrc->tag) { bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; - unsigned long flags; io_ring_submit_lock(ctx, lock_ring); - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irq(&ctx->completion_lock); io_cqring_fill_event(ctx, prsrc->tag, 0, 0); ctx->cq_extra++; io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_ring_submit_unlock(ctx, lock_ring); } @@ -7634,7 +7720,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, struct file *file; int fd, ret; unsigned i; - struct io_rsrc_data *file_data; if (ctx->file_data) return -EBUSY; @@ -7645,27 +7730,24 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; + ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, + &ctx->file_data); + if (ret) + return ret; - file_data = io_rsrc_data_alloc(ctx, io_rsrc_file_put, nr_args); - if (!file_data) - return -ENOMEM; - ctx->file_data = file_data; ret = -ENOMEM; if (!io_alloc_file_tables(&ctx->file_table, nr_args)) goto out_free; for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - u64 tag = 0; - - if ((tags && copy_from_user(&tag, &tags[i], sizeof(tag))) || - copy_from_user(&fd, &fds[i], sizeof(fd))) { + if (copy_from_user(&fd, &fds[i], sizeof(fd))) { ret = -EFAULT; goto out_fput; } /* allow sparse sets */ if (fd == -1) { ret = -EINVAL; - if (unlikely(tag)) + if (unlikely(*io_get_tag_slot(ctx->file_data, i))) goto out_fput; continue; } @@ -7686,7 +7768,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto out_fput; } - ctx->file_data->tags[i] = tag; io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); } @@ -7764,7 +7845,7 @@ static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, if (!prsrc) return -ENOMEM; - prsrc->tag = data->tags[idx]; + prsrc->tag = *io_get_tag_slot(data, idx); prsrc->rsrc = rsrc; list_add(&prsrc->list, &node->rsrc_list); return 0; @@ -7834,7 +7915,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - data->tags[up->offset + done] = tag; + *io_get_tag_slot(data, up->offset + done) = tag; io_fixed_file_set(file_slot, file); err = io_sqe_file_register(ctx, file, i); if (err) { @@ -7892,7 +7973,7 @@ static int io_uring_alloc_task_context(struct task_struct *task, struct io_uring_task *tctx; int ret; - tctx = kmalloc(sizeof(*tctx), GFP_KERNEL); + tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); if (unlikely(!tctx)) return -ENOMEM; @@ -7912,13 +7993,11 @@ static int io_uring_alloc_task_context(struct task_struct *task, xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); - tctx->last = NULL; atomic_set(&tctx->in_idle, 0); atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); - tctx->task_state = 0; init_task_work(&tctx->task_work, tctx_task_work); return 0; } @@ -7929,6 +8008,7 @@ void __io_uring_free(struct task_struct *tsk) WARN_ON_ONCE(!xa_empty(&tctx->xa)); WARN_ON_ONCE(tctx->io_wq); + WARN_ON_ONCE(tctx->cached_refs); percpu_counter_destroy(&tctx->inflight); kfree(tctx); @@ -8305,6 +8385,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, for (i = 0; i < nr_pages; i++) { struct vm_area_struct *vma = vmas[i]; + if (vma_is_shmem(vma)) + continue; if (vma->vm_file && !is_file_hugepages(vma->vm_file)) { ret = -EOPNOTSUPP; @@ -8402,9 +8484,9 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; - data = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, nr_args); - if (!data) - return -ENOMEM; + ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); + if (ret) + return ret; ret = io_buffers_map_alloc(ctx, nr_args); if (ret) { io_rsrc_data_free(data); @@ -8412,19 +8494,13 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - u64 tag = 0; - - if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) { - ret = -EFAULT; - break; - } ret = io_copy_iov(ctx, &iov, arg, i); if (ret) break; ret = io_buffer_validate(&iov); if (ret) break; - if (!iov.iov_base && tag) { + if (!iov.iov_base && *io_get_tag_slot(data, i)) { ret = -EINVAL; break; } @@ -8433,7 +8509,6 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, &last_hpage); if (ret) break; - data->tags[i] = tag; } WARN_ON_ONCE(ctx->buf_data); @@ -8498,7 +8573,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, } ctx->user_bufs[i] = imu; - ctx->buf_data->tags[offset] = tag; + *io_get_tag_slot(ctx->buf_data, offset) = tag; } if (needs_switch) @@ -8520,6 +8595,7 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) ctx->cq_ev_fd = eventfd_ctx_fdget(fd); if (IS_ERR(ctx->cq_ev_fd)) { int ret = PTR_ERR(ctx->cq_ev_fd); + ctx->cq_ev_fd = NULL; return ret; } @@ -8643,7 +8719,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->cq_wait, wait); + poll_wait(file, &ctx->poll_wait, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring @@ -8665,7 +8741,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) * Users may get EPOLLIN meanwhile seeing nothing in cqring, this * pushs them to do the flush. */ - if (io_cqring_events(ctx) || test_bit(0, &ctx->cq_check_overflow)) + if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -8713,7 +8789,7 @@ static void io_tctx_exit_cb(struct callback_head *cb) * node. It'll be removed by the end of cancellation, just ignore it. */ if (!atomic_read(&tctx->in_idle)) - io_uring_del_task_file((unsigned long)work->ctx); + io_uring_del_tctx_node((unsigned long)work->ctx); complete(&work->completion); } @@ -8739,7 +8815,7 @@ static void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { - io_uring_try_cancel_requests(ctx, NULL, NULL); + io_uring_try_cancel_requests(ctx, NULL, true); if (ctx->sq_data) { struct io_sq_data *sqd = ctx->sq_data; struct task_struct *tsk; @@ -8790,14 +8866,14 @@ static void io_ring_exit_work(struct work_struct *work) /* Returns true if we found and killed one or more timeouts */ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, - struct files_struct *files) + bool cancel_all) { struct io_kiocb *req, *tmp; int canceled = 0; spin_lock_irq(&ctx->completion_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_match_task(req, tsk, files)) { + if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; } @@ -8823,8 +8899,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); - io_kill_timeouts(ctx, NULL, NULL); - io_poll_remove_all(ctx, NULL, NULL); + io_kill_timeouts(ctx, NULL, true); + io_poll_remove_all(ctx, NULL, true); /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); @@ -8850,7 +8926,7 @@ static int io_uring_release(struct inode *inode, struct file *file) struct io_task_cancel { struct task_struct *task; - struct files_struct *files; + bool all; }; static bool io_cancel_task_cb(struct io_wq_work *work, void *data) @@ -8859,30 +8935,29 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) struct io_task_cancel *cancel = data; bool ret; - if (cancel->files && (req->flags & REQ_F_LINK_TIMEOUT)) { + if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { unsigned long flags; struct io_ring_ctx *ctx = req->ctx; /* protect against races with linked timeouts */ spin_lock_irqsave(&ctx->completion_lock, flags); - ret = io_match_task(req, cancel->task, cancel->files); + ret = io_match_task(req, cancel->task, cancel->all); spin_unlock_irqrestore(&ctx->completion_lock, flags); } else { - ret = io_match_task(req, cancel->task, cancel->files); + ret = io_match_task(req, cancel->task, cancel->all); } return ret; } static bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, - struct files_struct *files) + struct task_struct *task, bool cancel_all) { struct io_defer_entry *de; LIST_HEAD(list); spin_lock_irq(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task(de->req, task, files)) { + if (io_match_task(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -8926,9 +9001,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, - struct files_struct *files) + bool cancel_all) { - struct io_task_cancel cancel = { .task = task, .files = files, }; + struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; while (1) { @@ -8948,7 +9023,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && !files) || + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || (ctx->sq_data && ctx->sq_data->thread == current)) { while (!list_empty_careful(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); @@ -8956,10 +9031,11 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } - ret |= io_cancel_defer_files(ctx, task, files); - ret |= io_poll_remove_all(ctx, task, files); - ret |= io_kill_timeouts(ctx, task, files); - ret |= io_run_task_work(); + ret |= io_cancel_defer_files(ctx, task, cancel_all); + ret |= io_poll_remove_all(ctx, task, cancel_all); + ret |= io_kill_timeouts(ctx, task, cancel_all); + if (task) + ret |= io_run_task_work(); ret |= io_run_ctx_fallback(ctx); if (!ret) break; @@ -8967,7 +9043,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } -static int __io_uring_add_task_file(struct io_ring_ctx *ctx) +static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -9004,19 +9080,19 @@ static int __io_uring_add_task_file(struct io_ring_ctx *ctx) /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static inline int io_uring_add_task_file(struct io_ring_ctx *ctx) +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; if (likely(tctx && tctx->last == ctx)) return 0; - return __io_uring_add_task_file(ctx); + return __io_uring_add_tctx_node(ctx); } /* * Remove this io_uring_file -> task mapping. */ -static void io_uring_del_task_file(unsigned long index) +static void io_uring_del_tctx_node(unsigned long index) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -9046,7 +9122,7 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) unsigned long index; xa_for_each(&tctx->xa, index, node) - io_uring_del_task_file(index); + io_uring_del_tctx_node(index); if (wq) { /* * Must be after io_uring_del_task_file() (removes nodes under @@ -9064,99 +9140,83 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) return percpu_counter_sum(&tctx->inflight); } -static void io_uring_try_cancel(struct files_struct *files) +static void io_uring_drop_tctx_refs(struct task_struct *task) { - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - struct io_ring_ctx *ctx = node->ctx; + struct io_uring_task *tctx = task->io_uring; + unsigned int refs = tctx->cached_refs; - /* sqpoll task will cancel all its requests */ - if (!ctx->sq_data) - io_uring_try_cancel_requests(ctx, current, files); - } + tctx->cached_refs = 0; + percpu_counter_sub(&tctx->inflight, refs); + put_task_struct_many(task, refs); } -/* should only be called by SQPOLL task */ -static void io_uring_cancel_sqpoll(struct io_sq_data *sqd) +/* + * Find any io_uring ctx that this task has registered or done IO on, and cancel + * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation. + */ +static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; s64 inflight; DEFINE_WAIT(wait); + WARN_ON_ONCE(sqd && sqd->thread != current); + if (!current->io_uring) return; if (tctx->io_wq) io_wq_exit_start(tctx->io_wq); - WARN_ON_ONCE(!sqd || sqd->thread != current); - + io_uring_drop_tctx_refs(current); atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = tctx_inflight(tctx, false); + inflight = tctx_inflight(tctx, !cancel_all); if (!inflight) break; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_try_cancel_requests(ctx, current, NULL); - - prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); - /* - * If we've seen completions, retry without waiting. This - * avoids a race where a completion comes in before we did - * prepare_to_wait(). - */ - if (inflight == tctx_inflight(tctx, false)) - schedule(); - finish_wait(&tctx->wait, &wait); - } while (1); - atomic_dec(&tctx->in_idle); -} -/* - * Find any io_uring fd that this task has registered or done IO on, and cancel - * requests. - */ -void __io_uring_cancel(struct files_struct *files) -{ - struct io_uring_task *tctx = current->io_uring; - DEFINE_WAIT(wait); - s64 inflight; + if (!sqd) { + struct io_tctx_node *node; + unsigned long index; - if (tctx->io_wq) - io_wq_exit_start(tctx->io_wq); + xa_for_each(&tctx->xa, index, node) { + /* sqpoll task will cancel all its requests */ + if (node->ctx->sq_data) + continue; + io_uring_try_cancel_requests(node->ctx, current, + cancel_all); + } + } else { + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_uring_try_cancel_requests(ctx, current, + cancel_all); + } - /* make sure overflow events are dropped */ - atomic_inc(&tctx->in_idle); - do { - /* read completions before cancelations */ - inflight = tctx_inflight(tctx, !!files); - if (!inflight) - break; - io_uring_try_cancel(files); prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); - /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did * prepare_to_wait(). */ - if (inflight == tctx_inflight(tctx, !!files)) + if (inflight == tctx_inflight(tctx, !cancel_all)) schedule(); finish_wait(&tctx->wait, &wait); } while (1); atomic_dec(&tctx->in_idle); io_uring_clean_tctx(tctx); - if (!files) { + if (cancel_all) { /* for exec all current's requests should be gone, kill tctx */ __io_uring_free(current); } } +void __io_uring_cancel(struct files_struct *files) +{ + io_uring_cancel_generic(!files, NULL); +} + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { @@ -9317,9 +9377,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_cqring_overflow_flush(ctx, false); ret = -EOWNERDEAD; - if (unlikely(ctx->sq_data->thread == NULL)) { + if (unlikely(ctx->sq_data->thread == NULL)) goto out; - } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) { @@ -9329,7 +9388,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_task_file(ctx); + ret = io_uring_add_tctx_node(ctx); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -9513,8 +9572,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->cq_ring_mask = p->cq_entries - 1; rings->sq_ring_entries = p->sq_entries; rings->cq_ring_entries = p->cq_entries; - ctx->sq_mask = rings->sq_ring_mask; - ctx->cq_mask = rings->cq_ring_mask; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { @@ -9541,7 +9598,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_task_file(ctx); + ret = io_uring_add_tctx_node(ctx); if (ret) { put_unused_fd(fd); return ret; @@ -9956,6 +10013,43 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } +static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, + unsigned len) +{ + struct io_uring_task *tctx = current->io_uring; + cpumask_var_t new_mask; + int ret; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_clear(new_mask); + if (len > cpumask_size()) + len = cpumask_size(); + + if (copy_from_user(new_mask, arg, len)) { + free_cpumask_var(new_mask); + return -EFAULT; + } + + ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); + free_cpumask_var(new_mask); + return ret; +} + +static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + + return io_wq_cpu_affinity(tctx->io_wq, NULL); +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -9971,6 +10065,8 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_FILES_UPDATE2: case IORING_REGISTER_BUFFERS2: case IORING_REGISTER_BUFFERS_UPDATE: + case IORING_REGISTER_IOWQ_AFF: + case IORING_UNREGISTER_IOWQ_AFF: return false; default: return true; @@ -10110,6 +10206,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ret = io_register_rsrc_update(ctx, arg, nr_args, IORING_RSRC_BUFFER); break; + case IORING_REGISTER_IOWQ_AFF: + ret = -EINVAL; + if (!arg || !nr_args) + break; + ret = io_register_iowq_aff(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_IOWQ_AFF: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_unregister_iowq_aff(ctx); + break; default: ret = -EINVAL; break; @@ -10188,6 +10296,7 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); BUILD_BUG_SQE_ELEM(32, __u64, user_data); BUILD_BUG_SQE_ELEM(40, __u16, buf_index); + BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); @@ -10200,6 +10309,7 @@ static int __init io_uring_init(void) BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index b9e6a7ec78be..eb2f8273e6f1 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -235,8 +235,6 @@ static int do_isofs_readdir(struct inode *inode, struct file *file, break; } ctx->pos += de_len; - - continue; } if (bh) brelse(bh); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 63b526d44886..51d1eb2ffeb9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -80,23 +80,15 @@ static inline void __buffer_relink_io(struct journal_head *jh) } /* - * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. + * Check a checkpoint buffer could be release or not. * * Requires j_list_lock */ -static int __try_to_free_cp_buf(struct journal_head *jh) +static inline bool __cp_buffer_busy(struct journal_head *jh) { - int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_transaction == NULL && !buffer_locked(bh) && - !buffer_dirty(bh) && !buffer_write_io_error(bh)) { - JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __jbd2_journal_remove_checkpoint(jh) + 1; - } - return ret; + return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); } /* @@ -228,7 +220,6 @@ int jbd2_log_do_checkpoint(journal_t *journal) * OK, we need to start writing disk blocks. Take one transaction * and write it. */ - result = 0; spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) goto out; @@ -295,8 +286,6 @@ restart: goto restart; } if (!buffer_dirty(bh)) { - if (unlikely(buffer_write_io_error(bh)) && !result) - result = -EIO; BUFFER_TRACE(bh, "remove from checkpoint"); if (__jbd2_journal_remove_checkpoint(jh)) /* The transaction was released; we're done */ @@ -356,8 +345,6 @@ restart2: spin_lock(&journal->j_list_lock); goto restart2; } - if (unlikely(buffer_write_io_error(bh)) && !result) - result = -EIO; /* * Now in whatever state the buffer currently is, we @@ -369,10 +356,7 @@ restart2: } out: spin_unlock(&journal->j_list_lock); - if (result < 0) - jbd2_journal_abort(journal, result); - else - result = jbd2_cleanup_journal_tail(journal); + result = jbd2_cleanup_journal_tail(journal); return (result < 0) ? result : 0; } @@ -437,7 +421,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) { struct journal_head *last_jh; struct journal_head *next_jh = jh; - int ret; if (!jh) return 0; @@ -446,13 +429,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) do { jh = next_jh; next_jh = jh->b_cpnext; - if (!destroy) - ret = __try_to_free_cp_buf(jh); - else - ret = __jbd2_journal_remove_checkpoint(jh) + 1; - if (!ret) + + if (!destroy && __cp_buffer_busy(jh)) return 0; - if (ret == 2) + + if (__jbd2_journal_remove_checkpoint(jh)) return 1; /* * This function only frees up some memory @@ -468,6 +449,137 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) } /* + * journal_shrink_one_cp_list + * + * Find 'nr_to_scan' written-back checkpoint buffers in the given list + * and try to release them. If the whole transaction is released, set + * the 'released' parameter. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, + unsigned long *nr_to_scan, + bool *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + unsigned long nr_freed = 0; + int ret; + + if (!jh || *nr_to_scan == 0) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + + (*nr_to_scan)--; + if (__cp_buffer_busy(jh)) + continue; + + nr_freed++; + ret = __jbd2_journal_remove_checkpoint(jh); + if (ret) { + *released = true; + break; + } + + if (need_resched()) + break; + } while (jh != last_jh && *nr_to_scan); + + return nr_freed; +} + +/* + * jbd2_journal_shrink_checkpoint_list + * + * Find 'nr_to_scan' written-back checkpoint buffers in the journal + * and try to release them. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, + unsigned long *nr_to_scan) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + bool released; + tid_t first_tid = 0, last_tid = 0, next_tid = 0; + tid_t tid = 0; + unsigned long nr_freed = 0; + unsigned long nr_scanned = *nr_to_scan; + +again: + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) { + spin_unlock(&journal->j_list_lock); + goto out; + } + + /* + * Get next shrink transaction, resume previous scan or start + * over again. If some others do checkpoint and drop transaction + * from the checkpoint list, we ignore saved j_shrink_transaction + * and start over unconditionally. + */ + if (journal->j_shrink_transaction) + transaction = journal->j_shrink_transaction; + else + transaction = journal->j_checkpoint_transactions; + + if (!first_tid) + first_tid = transaction->t_tid; + last_transaction = journal->j_checkpoint_transactions->t_cpprev; + next_transaction = transaction; + last_tid = last_transaction->t_tid; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + tid = transaction->t_tid; + released = false; + + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, + nr_to_scan, &released); + if (*nr_to_scan == 0) + break; + if (need_resched() || spin_needbreak(&journal->j_list_lock)) + break; + if (released) + continue; + + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, + nr_to_scan, &released); + if (*nr_to_scan == 0) + break; + if (need_resched() || spin_needbreak(&journal->j_list_lock)) + break; + } while (transaction != last_transaction); + + if (transaction != last_transaction) { + journal->j_shrink_transaction = next_transaction; + next_tid = next_transaction->t_tid; + } else { + journal->j_shrink_transaction = NULL; + next_tid = 0; + } + + spin_unlock(&journal->j_list_lock); + cond_resched(); + + if (*nr_to_scan && next_tid) + goto again; +out: + nr_scanned -= *nr_to_scan; + trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, + nr_freed, nr_scanned, next_tid); + + return nr_freed; +} + +/* * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. @@ -564,24 +676,37 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; - int ret = 0; + struct buffer_head *bh = jh2bh(jh); JBUFFER_TRACE(jh, "entry"); - if ((transaction = jh->b_cp_transaction) == NULL) { + transaction = jh->b_cp_transaction; + if (!transaction) { JBUFFER_TRACE(jh, "not on transaction"); - goto out; + return 0; } journal = transaction->t_journal; JBUFFER_TRACE(jh, "removing from transaction"); + + /* + * If we have failed to write the buffer out to disk, the filesystem + * may become inconsistent. We cannot abort the journal here since + * we hold j_list_lock and we have to be careful about races with + * jbd2_journal_destroy(). So mark the writeback IO error in the + * journal here and we abort the journal later from a better context. + */ + if (buffer_write_io_error(bh)) + set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags); + __buffer_unlink(jh); jh->b_cp_transaction = NULL; + percpu_counter_dec(&journal->j_jh_shrink_count); jbd2_journal_put_journal_head(jh); - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) - goto out; + /* Is this transaction empty? */ + if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list) + return 0; /* * There is one special case to worry about: if we have just pulled the @@ -593,10 +718,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) * See the comment at the end of jbd2_journal_commit_transaction(). */ if (transaction->t_state != T_FINISHED) - goto out; + return 0; - /* OK, that was the last buffer for the transaction: we can now - safely remove this transaction from the log */ + /* + * OK, that was the last buffer for the transaction, we can now + * safely remove this transaction from the log. + */ stats = &transaction->t_chp_stats; if (stats->cs_chp_time) stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, @@ -606,9 +733,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) __jbd2_journal_drop_transaction(journal, transaction); jbd2_journal_free_transaction(transaction); - ret = 1; -out: - return ret; + return 1; } /* @@ -639,6 +764,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, jh->b_cpnext->b_cpprev = jh; } transaction->t_checkpoint_list = jh; + percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count); } /* @@ -654,6 +780,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) { assert_spin_locked(&journal->j_list_lock); + + journal->j_shrink_transaction = NULL; if (transaction->t_cpnext) { transaction->t_cpnext->t_cpprev = transaction->t_cpprev; transaction->t_cpprev->t_cpnext = transaction->t_cpnext; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2dc944442802..152880c298ca 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -934,10 +934,6 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) } EXPORT_SYMBOL(jbd2_fc_wait_bufs); -/* - * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf - * for completion. - */ int jbd2_fc_release_bufs(journal_t *journal) { struct buffer_head *bh; @@ -945,10 +941,6 @@ int jbd2_fc_release_bufs(journal_t *journal) j_fc_off = journal->j_fc_off; - /* - * Wait in reverse order to minimize chances of us being woken up before - * all IOs have completed - */ for (i = j_fc_off - 1; i >= 0; i--) { bh = journal->j_fc_wbuf[i]; if (!bh) @@ -1618,6 +1610,10 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, if (is_journal_aborted(journal)) return -EIO; + if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) { + jbd2_journal_abort(journal, -EIO); + return -EIO; + } BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", @@ -1686,6 +1682,110 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) write_unlock(&journal->j_state_lock); } +/** + * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock) + * @journal: The journal to erase. + * @flags: A discard/zeroout request is sent for each physically contigous + * region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or + * JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation + * to perform. + * + * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes + * will be explicitly written if no hardware offload is available, see + * blkdev_issue_zeroout for more details. + */ +static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) +{ + int err = 0; + unsigned long block, log_offset; /* logical */ + unsigned long long phys_block, block_start, block_stop; /* physical */ + loff_t byte_start, byte_stop, byte_count; + struct request_queue *q = bdev_get_queue(journal->j_dev); + + /* flags must be set to either discard or zeroout */ + if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) + return -EINVAL; + + if (!q) + return -ENXIO; + + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + return -EOPNOTSUPP; + + /* + * lookup block mapping and issue discard/zeroout for each + * contiguous region + */ + log_offset = be32_to_cpu(journal->j_superblock->s_first); + block_start = ~0ULL; + for (block = log_offset; block < journal->j_total_len; block++) { + err = jbd2_journal_bmap(journal, block, &phys_block); + if (err) { + pr_err("JBD2: bad block at offset %lu", block); + return err; + } + + if (block_start == ~0ULL) { + block_start = phys_block; + block_stop = block_start - 1; + } + + /* + * last block not contiguous with current block, + * process last contiguous region and return to this block on + * next loop + */ + if (phys_block != block_stop + 1) { + block--; + } else { + block_stop++; + /* + * if this isn't the last block of journal, + * no need to process now because next block may also + * be part of this contiguous region + */ + if (block != journal->j_total_len - 1) + continue; + } + + /* + * end of contiguous region or this is last block of journal, + * take care of the region + */ + byte_start = block_start * journal->j_blocksize; + byte_stop = block_stop * journal->j_blocksize; + byte_count = (block_stop - block_start + 1) * + journal->j_blocksize; + + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping, + byte_start, byte_stop); + + if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { + err = blkdev_issue_discard(journal->j_dev, + byte_start >> SECTOR_SHIFT, + byte_count >> SECTOR_SHIFT, + GFP_NOFS, 0); + } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { + err = blkdev_issue_zeroout(journal->j_dev, + byte_start >> SECTOR_SHIFT, + byte_count >> SECTOR_SHIFT, + GFP_NOFS, 0); + } + + if (unlikely(err != 0)) { + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", + err, block_start, block_stop); + return err; + } + + /* reset start and stop after processing a region */ + block_start = ~0ULL; + } + + return blkdev_issue_flush(journal->j_dev); +} /** * jbd2_journal_update_sb_errno() - Update error in the journal. @@ -1951,6 +2051,93 @@ recovery_error: } /** + * jbd2_journal_shrink_scan() + * + * Scan the checkpointed buffer on the checkpoint list and release the + * journal_head. + */ +static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long nr_to_scan = sc->nr_to_scan; + unsigned long nr_shrunk; + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); + + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); + + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); + + return nr_shrunk; +} + +/** + * jbd2_journal_shrink_count() + * + * Count the number of checkpoint buffers on the checkpoint list. + */ +static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); + + return count; +} + +/** + * jbd2_journal_register_shrinker() + * @journal: Journal to act on. + * + * Init a percpu counter to record the checkpointed buffers on the checkpoint + * list and register a shrinker to release their journal_head. + */ +int jbd2_journal_register_shrinker(journal_t *journal) +{ + int err; + + journal->j_shrink_transaction = NULL; + + err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL); + if (err) + return err; + + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker.count_objects = jbd2_journal_shrink_count; + journal->j_shrinker.seeks = DEFAULT_SEEKS; + journal->j_shrinker.batch = journal->j_max_transaction_buffers; + + err = register_shrinker(&journal->j_shrinker); + if (err) { + percpu_counter_destroy(&journal->j_jh_shrink_count); + return err; + } + + return 0; +} +EXPORT_SYMBOL(jbd2_journal_register_shrinker); + +/** + * jbd2_journal_unregister_shrinker() + * @journal: Journal to act on. + * + * Unregister the checkpointed buffer shrinker and destroy the percpu counter. + */ +void jbd2_journal_unregister_shrinker(journal_t *journal) +{ + percpu_counter_destroy(&journal->j_jh_shrink_count); + unregister_shrinker(&journal->j_shrinker); +} +EXPORT_SYMBOL(jbd2_journal_unregister_shrinker); + +/** * jbd2_journal_destroy() - Release a journal_t structure. * @journal: Journal to act on. * @@ -1995,6 +2182,16 @@ int jbd2_journal_destroy(journal_t *journal) J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock); + /* + * OK, all checkpoint transactions have been checked, now check the + * write out io error flag and abort the journal if some buffer failed + * to write back to the original location, otherwise the filesystem + * may become inconsistent. + */ + if (!is_journal_aborted(journal) && + test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) + jbd2_journal_abort(journal, -EIO); + if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock_io(&journal->j_checkpoint_mutex); @@ -2012,6 +2209,8 @@ int jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); } + jbd2_journal_unregister_shrinker(journal); + if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); iput(journal->j_inode); @@ -2246,13 +2445,18 @@ EXPORT_SYMBOL(jbd2_journal_clear_features); /** * jbd2_journal_flush() - Flush journal * @journal: Journal to act on. + * @flags: optional operation on the journal blocks after the flush (see below) * * Flush all data for a given journal to disk and empty the journal. * Filesystems can use this when remounting readonly to ensure that - * recovery does not need to happen on remount. + * recovery does not need to happen on remount. Optionally, a discard or zeroout + * can be issued on the journal blocks after flushing. + * + * flags: + * JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks + * JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks */ - -int jbd2_journal_flush(journal_t *journal) +int jbd2_journal_flush(journal_t *journal, unsigned int flags) { int err = 0; transaction_t *transaction = NULL; @@ -2306,6 +2510,10 @@ int jbd2_journal_flush(journal_t *journal) * commits of data to the journal will restore the current * s_start value. */ jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); + + if (flags) + err = __jbd2_journal_erase(journal, flags); + mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e8fc45fd751f..8804e126805f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2123,7 +2123,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) { struct buffer_head *head; struct buffer_head *bh; - bool has_write_io_error = false; int ret = 0; J_ASSERT(PageLocked(page)); @@ -2148,26 +2147,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) jbd2_journal_put_journal_head(jh); if (buffer_jbd(bh)) goto busy; - - /* - * If we free a metadata buffer which has been failed to - * write out, the jbd2 checkpoint procedure will not detect - * this failure and may lead to filesystem inconsistency - * after cleanup journal tail. - */ - if (buffer_write_io_error(bh)) { - pr_err("JBD2: Error while async write back metadata bh %llu.", - (unsigned long long)bh->b_blocknr); - has_write_io_error = true; - } } while ((bh = bh->b_this_page) != head); ret = try_to_free_buffers(page); - busy: - if (has_write_io_error) - jbd2_journal_abort(journal, -EIO); - return ret; } diff --git a/fs/namespace.c b/fs/namespace.c index c3f1a78ba369..ab4174a3c802 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3464,9 +3464,10 @@ out_type: return ret; } -#define FSMOUNT_VALID_FLAGS \ - (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ - MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME) +#define FSMOUNT_VALID_FLAGS \ + (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ + MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \ + MOUNT_ATTR_NOSYMFOLLOW) #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) @@ -3487,6 +3488,8 @@ static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) mnt_flags |= MNT_NOEXEC; if (attr_flags & MOUNT_ATTR_NODIRATIME) mnt_flags |= MNT_NODIRATIME; + if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) + mnt_flags |= MNT_NOSYMFOLLOW; return mnt_flags; } diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 9c6c0e2e5880..68e8d61e28dd 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -19,19 +19,6 @@ /* /sys/fs/<nilfs>/ */ static struct kset *nilfs_kset; -#define NILFS_SHOW_TIME(time_t_val, buf) ({ \ - struct tm res; \ - int count = 0; \ - time64_to_tm(time_t_val, 0, &res); \ - res.tm_year += 1900; \ - res.tm_mon += 1; \ - count = scnprintf(buf, PAGE_SIZE, \ - "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \ - res.tm_year, res.tm_mon, res.tm_mday, \ - res.tm_hour, res.tm_min, res.tm_sec);\ - count; \ -}) - #define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \ static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \ struct attribute *attr, char *buf) \ @@ -576,7 +563,7 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, ctime = nilfs->ns_ctime; up_read(&nilfs->ns_segctor_sem); - return NILFS_SHOW_TIME(ctime, buf); + return sysfs_emit(buf, "%ptTs\n", &ctime); } static ssize_t @@ -604,7 +591,7 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, nongc_ctime = nilfs->ns_nongc_ctime; up_read(&nilfs->ns_segctor_sem); - return NILFS_SHOW_TIME(nongc_ctime, buf); + return sysfs_emit(buf, "%ptTs\n", &nongc_ctime); } static ssize_t @@ -724,7 +711,7 @@ nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, sbwtime = nilfs->ns_sbwtime; up_read(&nilfs->ns_sem); - return NILFS_SHOW_TIME(sbwtime, buf); + return sysfs_emit(buf, "%ptTs\n", &sbwtime); } static ssize_t diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e032f2e2c2c5..f1cc8258d34a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6018,7 +6018,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) * Then truncate log will be replayed resulting in cluster double free. */ jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4e589ce2fce6..4f15750aac5d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -308,7 +308,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) } jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) { up_write(&journal->j_trans_barrier); @@ -1000,7 +1000,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (ocfs2_mount_local(osb)) { jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) mlog_errno(status); @@ -1070,7 +1070,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) if (replayed) { jbd2_journal_lock_updates(journal->j_journal); - status = jbd2_journal_flush(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal, 0); jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) mlog_errno(status); @@ -1666,7 +1666,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, /* wipe the journal */ jbd2_journal_lock_updates(journal); - status = jbd2_journal_flush(journal); + status = jbd2_journal_flush(journal, 0); jbd2_journal_unlock_updates(journal); if (status < 0) mlog_errno(status); diff --git a/fs/open.c b/fs/open.c index f76e960d10ea..1a325b3194df 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1011,12 +1011,20 @@ inline struct open_how build_open_how(int flags, umode_t mode) inline int build_open_flags(const struct open_how *how, struct open_flags *op) { - int flags = how->flags; + u64 flags = how->flags; + u64 strip = FMODE_NONOTIFY | O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); - /* Must never be set by userspace */ - flags &= ~(FMODE_NONOTIFY | O_CLOEXEC); + BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS), + "struct open_flags doesn't yet handle flags > 32 bits"); + + /* + * Strip flags that either shouldn't be set by userspace like + * FMODE_NONOTIFY or that aren't relevant in determining struct + * open_flags like O_CLOEXEC. + */ + flags &= ~strip; /* * Older syscalls implicitly clear all of the invalid flags or argument diff --git a/fs/proc/array.c b/fs/proc/array.c index 7ec59171f197..ee0ce8cecc4a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) collect_sigign_sigcatch(p, &ignored, &caught); num_threads = get_nr_threads(p); rcu_read_lock(); /* FIXME: is this correct? */ - qsize = atomic_read(&__task_cred(p)->user->sigpending); + qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 8468baee951d..f32878d9a39f 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -16,7 +16,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %u/%d %d\n", LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), diff --git a/fs/proc/stat.c b/fs/proc/stat.c index f25e8531fd27..6561a06ef905 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -200,8 +200,8 @@ static int show_stat(struct seq_file *p, void *v) "\nctxt %llu\n" "btime %llu\n" "processes %lu\n" - "procs_running %lu\n" - "procs_blocked %lu\n", + "procs_running %u\n" + "procs_blocked %u\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 8adabde685f1..328da35da390 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -173,6 +173,7 @@ config PSTORE_BLK tristate "Log panic/oops to a block device" depends on PSTORE depends on BLOCK + depends on BROKEN select PSTORE_ZONE default n help diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 4bb8a344957a..04ce58c939a0 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -8,15 +8,16 @@ #include <linux/kernel.h> #include <linux/module.h> -#include "../../block/blk.h" #include <linux/blkdev.h> #include <linux/string.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/platform_device.h> #include <linux/pstore_blk.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/init_syscalls.h> #include <linux/mount.h> -#include <linux/uio.h> static long kmsg_size = CONFIG_PSTORE_BLK_KMSG_SIZE; module_param(kmsg_size, long, 0400); @@ -57,27 +58,7 @@ MODULE_PARM_DESC(best_effort, "use best effort to write (i.e. do not require sto /* * blkdev - the block device to use for pstore storage - * - * Usually, this will be a partition of a block device. - * - * blkdev accepts the following variants: - * 1) <hex_major><hex_minor> device number in hexadecimal representation, - * with no leading 0x, for example b302. - * 2) /dev/<disk_name> represents the device number of disk - * 3) /dev/<disk_name><decimal> represents the device number - * of partition - device number of disk plus the partition number - * 4) /dev/<disk_name>p<decimal> - same as the above, that form is - * used when disk name of partitioned disk ends on a digit. - * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the - * unique id of a partition if the partition table provides it. - * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS - * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- - * filled hex representation of the 32-bit "NT disk signature", and PP - * is a zero-filled hex representation of the 1-based partition number. - * 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to - * a partition with a known unique id. - * 7) <major>:<minor> major and minor number of the device separated by - * a colon. + * See Documentation/admin-guide/pstore-blk.rst for details. */ static char blkdev[80] = CONFIG_PSTORE_BLK_BLKDEV; module_param_string(blkdev, blkdev, 80, 0400); @@ -88,14 +69,8 @@ MODULE_PARM_DESC(blkdev, "block device for pstore storage"); * during the register/unregister functions. */ static DEFINE_MUTEX(pstore_blk_lock); -static struct block_device *psblk_bdev; -static struct pstore_zone_info *pstore_zone_info; - -struct bdev_info { - dev_t devt; - sector_t nr_sects; - sector_t start_sect; -}; +static struct file *psblk_file; +static struct pstore_device_info *pstore_device_info; #define check_size(name, alignsize) ({ \ long _##name_ = (name); \ @@ -108,57 +83,63 @@ struct bdev_info { _##name_; \ }) +#define verify_size(name, alignsize, enabled) { \ + long _##name_; \ + if (enabled) \ + _##name_ = check_size(name, alignsize); \ + else \ + _##name_ = 0; \ + /* Synchronize module parameters with resuls. */ \ + name = _##name_ / 1024; \ + dev->zone.name = _##name_; \ +} + static int __register_pstore_device(struct pstore_device_info *dev) { int ret; lockdep_assert_held(&pstore_blk_lock); - if (!dev || !dev->total_size || !dev->read || !dev->write) + if (!dev) { + pr_err("NULL device info\n"); + return -EINVAL; + } + if (!dev->zone.total_size) { + pr_err("zero sized device\n"); return -EINVAL; + } + if (!dev->zone.read) { + pr_err("no read handler for device\n"); + return -EINVAL; + } + if (!dev->zone.write) { + pr_err("no write handler for device\n"); + return -EINVAL; + } /* someone already registered before */ - if (pstore_zone_info) + if (pstore_device_info) return -EBUSY; - pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL); - if (!pstore_zone_info) - return -ENOMEM; - /* zero means not limit on which backends to attempt to store. */ if (!dev->flags) dev->flags = UINT_MAX; -#define verify_size(name, alignsize, enabled) { \ - long _##name_; \ - if (enabled) \ - _##name_ = check_size(name, alignsize); \ - else \ - _##name_ = 0; \ - name = _##name_ / 1024; \ - pstore_zone_info->name = _##name_; \ - } - + /* Copy in module parameters. */ verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE); -#undef verify_size - - pstore_zone_info->total_size = dev->total_size; - pstore_zone_info->max_reason = max_reason; - pstore_zone_info->read = dev->read; - pstore_zone_info->write = dev->write; - pstore_zone_info->erase = dev->erase; - pstore_zone_info->panic_write = dev->panic_write; - pstore_zone_info->name = KBUILD_MODNAME; - pstore_zone_info->owner = THIS_MODULE; - - ret = register_pstore_zone(pstore_zone_info); - if (ret) { - kfree(pstore_zone_info); - pstore_zone_info = NULL; - } + dev->zone.max_reason = max_reason; + + /* Initialize required zone ownership details. */ + dev->zone.name = KBUILD_MODNAME; + dev->zone.owner = THIS_MODULE; + + ret = register_pstore_zone(&dev->zone); + if (ret == 0) + pstore_device_info = dev; + return ret; } /** @@ -185,10 +166,9 @@ EXPORT_SYMBOL_GPL(register_pstore_device); static void __unregister_pstore_device(struct pstore_device_info *dev) { lockdep_assert_held(&pstore_blk_lock); - if (pstore_zone_info && pstore_zone_info->read == dev->read) { - unregister_pstore_zone(pstore_zone_info); - kfree(pstore_zone_info); - pstore_zone_info = NULL; + if (pstore_device_info && pstore_device_info == dev) { + unregister_pstore_zone(&dev->zone); + pstore_device_info = NULL; } } @@ -205,204 +185,59 @@ void unregister_pstore_device(struct pstore_device_info *dev) } EXPORT_SYMBOL_GPL(unregister_pstore_device); -/** - * psblk_get_bdev() - open block device - * - * @holder: Exclusive holder identifier - * @info: Information about bdev to fill in - * - * Return: pointer to block device on success and others on error. - * - * On success, the returned block_device has reference count of one. - */ -static struct block_device *psblk_get_bdev(void *holder, - struct bdev_info *info) -{ - struct block_device *bdev = ERR_PTR(-ENODEV); - fmode_t mode = FMODE_READ | FMODE_WRITE; - sector_t nr_sects; - - lockdep_assert_held(&pstore_blk_lock); - - if (pstore_zone_info) - return ERR_PTR(-EBUSY); - - if (!blkdev[0]) - return ERR_PTR(-ENODEV); - - if (holder) - mode |= FMODE_EXCL; - bdev = blkdev_get_by_path(blkdev, mode, holder); - if (IS_ERR(bdev)) { - dev_t devt; - - devt = name_to_dev_t(blkdev); - if (devt == 0) - return ERR_PTR(-ENODEV); - bdev = blkdev_get_by_dev(devt, mode, holder); - if (IS_ERR(bdev)) - return bdev; - } - - nr_sects = bdev_nr_sectors(bdev); - if (!nr_sects) { - pr_err("not enough space for '%s'\n", blkdev); - blkdev_put(bdev, mode); - return ERR_PTR(-ENOSPC); - } - - if (info) { - info->devt = bdev->bd_dev; - info->nr_sects = nr_sects; - info->start_sect = get_start_sect(bdev); - } - - return bdev; -} - -static void psblk_put_bdev(struct block_device *bdev, void *holder) -{ - fmode_t mode = FMODE_READ | FMODE_WRITE; - - lockdep_assert_held(&pstore_blk_lock); - - if (!bdev) - return; - - if (holder) - mode |= FMODE_EXCL; - blkdev_put(bdev, mode); -} - static ssize_t psblk_generic_blk_read(char *buf, size_t bytes, loff_t pos) { - struct block_device *bdev = psblk_bdev; - struct file file; - struct kiocb kiocb; - struct iov_iter iter; - struct kvec iov = {.iov_base = buf, .iov_len = bytes}; - - if (!bdev) - return -ENODEV; - - memset(&file, 0, sizeof(struct file)); - file.f_mapping = bdev->bd_inode->i_mapping; - file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; - file.f_inode = bdev->bd_inode; - file_ra_state_init(&file.f_ra, file.f_mapping); - - init_sync_kiocb(&kiocb, &file); - kiocb.ki_pos = pos; - iov_iter_kvec(&iter, READ, &iov, 1, bytes); - - return generic_file_read_iter(&kiocb, &iter); + return kernel_read(psblk_file, buf, bytes, &pos); } static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, loff_t pos) { - struct block_device *bdev = psblk_bdev; - struct iov_iter iter; - struct kiocb kiocb; - struct file file; - ssize_t ret; - struct kvec iov = {.iov_base = (void *)buf, .iov_len = bytes}; - - if (!bdev) - return -ENODEV; - /* Console/Ftrace backend may handle buffer until flush dirty zones */ if (in_interrupt() || irqs_disabled()) return -EBUSY; - - memset(&file, 0, sizeof(struct file)); - file.f_mapping = bdev->bd_inode->i_mapping; - file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; - file.f_inode = bdev->bd_inode; - - init_sync_kiocb(&kiocb, &file); - kiocb.ki_pos = pos; - iov_iter_kvec(&iter, WRITE, &iov, 1, bytes); - - inode_lock(bdev->bd_inode); - ret = generic_write_checks(&kiocb, &iter); - if (ret > 0) - ret = generic_perform_write(&file, &iter, pos); - inode_unlock(bdev->bd_inode); - - if (likely(ret > 0)) { - const struct file_operations f_op = {.fsync = blkdev_fsync}; - - file.f_op = &f_op; - kiocb.ki_pos += ret; - ret = generic_write_sync(&kiocb, ret); - } - return ret; + return kernel_write(psblk_file, buf, bytes, &pos); } /* * This takes its configuration only from the module parameters now. - * See psblk_get_bdev() and blkdev. */ -static int __register_pstore_blk(void) +static int __register_pstore_blk(struct pstore_device_info *dev, + const char *devpath) { - char bdev_name[BDEVNAME_SIZE]; - struct block_device *bdev; - struct pstore_device_info dev; - struct bdev_info binfo; - void *holder = blkdev; + struct inode *inode; int ret = -ENODEV; lockdep_assert_held(&pstore_blk_lock); - /* hold bdev exclusively */ - memset(&binfo, 0, sizeof(binfo)); - bdev = psblk_get_bdev(holder, &binfo); - if (IS_ERR(bdev)) { - pr_err("failed to open '%s'!\n", blkdev); - return PTR_ERR(bdev); + psblk_file = filp_open(devpath, O_RDWR | O_DSYNC | O_NOATIME | O_EXCL, 0); + if (IS_ERR(psblk_file)) { + ret = PTR_ERR(psblk_file); + pr_err("failed to open '%s': %d!\n", devpath, ret); + goto err; } - /* only allow driver matching the @blkdev */ - if (!binfo.devt) { - pr_debug("no major\n"); - ret = -ENODEV; - goto err_put_bdev; + inode = file_inode(psblk_file); + if (!S_ISBLK(inode->i_mode)) { + pr_err("'%s' is not block device!\n", devpath); + goto err_fput; } - /* psblk_bdev must be assigned before register to pstore/blk */ - psblk_bdev = bdev; - - memset(&dev, 0, sizeof(dev)); - dev.total_size = binfo.nr_sects << SECTOR_SHIFT; - dev.read = psblk_generic_blk_read; - dev.write = psblk_generic_blk_write; + inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode; + dev->zone.total_size = i_size_read(inode); - ret = __register_pstore_device(&dev); + ret = __register_pstore_device(dev); if (ret) - goto err_put_bdev; + goto err_fput; - bdevname(bdev, bdev_name); - pr_info("attached %s (no dedicated panic_write!)\n", bdev_name); return 0; -err_put_bdev: - psblk_bdev = NULL; - psblk_put_bdev(bdev, holder); - return ret; -} - -static void __unregister_pstore_blk(unsigned int major) -{ - struct pstore_device_info dev = { .read = psblk_generic_blk_read }; - void *holder = blkdev; +err_fput: + fput(psblk_file); +err: + psblk_file = NULL; - lockdep_assert_held(&pstore_blk_lock); - if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { - __unregister_pstore_device(&dev); - psblk_put_bdev(psblk_bdev, holder); - psblk_bdev = NULL; - } + return ret; } /* get information of pstore/blk */ @@ -419,13 +254,93 @@ int pstore_blk_get_config(struct pstore_blk_config *info) } EXPORT_SYMBOL_GPL(pstore_blk_get_config); + +#ifndef MODULE +static const char devname[] = "/dev/pstore-blk"; +static __init const char *early_boot_devpath(const char *initial_devname) +{ + /* + * During early boot the real root file system hasn't been + * mounted yet, and no device nodes are present yet. Use the + * same scheme to find the device that we use for mounting + * the root file system. + */ + dev_t dev = name_to_dev_t(initial_devname); + + if (!dev) { + pr_err("failed to resolve '%s'!\n", initial_devname); + return initial_devname; + } + + init_unlink(devname); + init_mknod(devname, S_IFBLK | 0600, new_encode_dev(dev)); + + return devname; +} +#else +static inline const char *early_boot_devpath(const char *initial_devname) +{ + return initial_devname; +} +#endif + +static int __init __best_effort_init(void) +{ + struct pstore_device_info *best_effort_dev; + int ret; + + /* No best-effort mode requested. */ + if (!best_effort) + return 0; + + /* Reject an empty blkdev. */ + if (!blkdev[0]) { + pr_err("blkdev empty with best_effort=Y\n"); + return -EINVAL; + } + + best_effort_dev = kzalloc(sizeof(*best_effort_dev), GFP_KERNEL); + if (!best_effort_dev) + return -ENOMEM; + + best_effort_dev->zone.read = psblk_generic_blk_read; + best_effort_dev->zone.write = psblk_generic_blk_write; + + ret = __register_pstore_blk(best_effort_dev, + early_boot_devpath(blkdev)); + if (ret) + kfree(best_effort_dev); + else + pr_info("attached %s (%zu) (no dedicated panic_write!)\n", + blkdev, best_effort_dev->zone.total_size); + + return ret; +} + +static void __exit __best_effort_exit(void) +{ + /* + * Currently, the only user of psblk_file is best_effort, so + * we can assume that pstore_device_info is associated with it. + * Once there are "real" blk devices, there will need to be a + * dedicated pstore_blk_info, etc. + */ + if (psblk_file) { + struct pstore_device_info *dev = pstore_device_info; + + __unregister_pstore_device(dev); + kfree(dev); + fput(psblk_file); + psblk_file = NULL; + } +} + static int __init pstore_blk_init(void) { - int ret = 0; + int ret; mutex_lock(&pstore_blk_lock); - if (!pstore_zone_info && best_effort && blkdev[0]) - ret = __register_pstore_blk(); + ret = __best_effort_init(); mutex_unlock(&pstore_blk_lock); return ret; @@ -435,15 +350,9 @@ late_initcall(pstore_blk_init); static void __exit pstore_blk_exit(void) { mutex_lock(&pstore_blk_lock); - if (psblk_bdev) - __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev)); - else { - struct pstore_device_info dev = { }; - - if (pstore_zone_info) - dev.read = pstore_zone_info->read; - __unregister_pstore_device(&dev); - } + __best_effort_exit(); + /* If we've been asked to unload, unregister any remaining device. */ + __unregister_pstore_device(pstore_device_info); mutex_unlock(&pstore_blk_lock); } module_exit(pstore_blk_exit); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 05e4bd9ab6d6..2bcc9a6f1bfc 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -968,31 +968,30 @@ out: return ret; } -SYSCALL_DEFINE4(quotactl_path, unsigned int, cmd, const char __user *, - mountpoint, qid_t, id, void __user *, addr) +SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd, + qid_t, id, void __user *, addr) { struct super_block *sb; - struct path mountpath; unsigned int cmds = cmd >> SUBCMDSHIFT; unsigned int type = cmd & SUBCMDMASK; + struct fd f; int ret; - if (type >= MAXQUOTAS) - return -EINVAL; + f = fdget_raw(fd); + if (!f.file) + return -EBADF; - ret = user_path_at(AT_FDCWD, mountpoint, - LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT, &mountpath); - if (ret) - return ret; - - sb = mountpath.mnt->mnt_sb; + ret = -EINVAL; + if (type >= MAXQUOTAS) + goto out; if (quotactl_cmd_write(cmds)) { - ret = mnt_want_write(mountpath.mnt); + ret = mnt_want_write(f.file->f_path.mnt); if (ret) goto out; } + sb = f.file->f_path.mnt->mnt_sb; if (quotactl_cmd_onoff(cmds)) down_write(&sb->s_umount); else @@ -1006,9 +1005,8 @@ SYSCALL_DEFINE4(quotactl_path, unsigned int, cmd, const char __user *, up_read(&sb->s_umount); if (quotactl_cmd_write(cmds)) - mnt_drop_write(mountpath.mnt); + mnt_drop_write(f.file->f_path.mnt); out: - path_put(&mountpath); - + fdput(f); return ret; } diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index c5562c871c8b..d3e995e1046f 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -47,15 +47,6 @@ static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) / info->dqi_entry_size; } -static char *getdqbuf(size_t size) -{ - char *buf = kmalloc(size, GFP_NOFS); - if (!buf) - printk(KERN_WARNING - "VFS: Not enough memory for quota buffers.\n"); - return buf; -} - static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) { struct super_block *sb = info->dqi_sb; @@ -83,7 +74,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) /* Remove empty block from list and return it */ static int get_free_dqblk(struct qtree_mem_dqinfo *info) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int ret, blk; @@ -132,7 +123,7 @@ static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk) static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { - char *tmpbuf = getdqbuf(info->dqi_usable_bs); + char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; uint nextblk = le32_to_cpu(dh->dqdh_next_free); uint prevblk = le32_to_cpu(dh->dqdh_prev_free); @@ -179,7 +170,7 @@ out_buf: static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, uint blk) { - char *tmpbuf = getdqbuf(info->dqi_usable_bs); + char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS); struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; int err; @@ -227,7 +218,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, { uint blk, i; struct qt_disk_dqdbheader *dh; - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); char *ddquot; *err = 0; @@ -298,7 +289,7 @@ out_buf: static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *treeblk, int depth) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0, newson = 0, newact = 0; __le32 *ref; uint newblk; @@ -375,7 +366,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) int type = dquot->dq_id.type; struct super_block *sb = dquot->dq_sb; ssize_t ret; - char *ddquot = getdqbuf(info->dqi_entry_size); + char *ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS); if (!ddquot) return -ENOMEM; @@ -414,7 +405,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { struct qt_disk_dqdbheader *dh; - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0; if (!buf) @@ -474,7 +465,7 @@ out_buf: static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint *blk, int depth) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); int ret = 0; uint newblk; __le32 *ref = (__le32 *)buf; @@ -533,7 +524,7 @@ EXPORT_SYMBOL(qtree_delete_dquot); static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); loff_t ret = 0; int i; char *ddquot; @@ -571,7 +562,7 @@ out_buf: static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, uint blk, int depth) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); loff_t ret = 0; __le32 *ref = (__le32 *)buf; @@ -635,7 +626,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) } dquot->dq_off = offset; } - ddquot = getdqbuf(info->dqi_entry_size); + ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS); if (!ddquot) return -ENOMEM; ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size, @@ -679,7 +670,7 @@ EXPORT_SYMBOL(qtree_release_dquot); static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id, unsigned int blk, int depth) { - char *buf = getdqbuf(info->dqi_usable_bs); + char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS); __le32 *ref = (__le32 *)buf; ssize_t ret; unsigned int epb = info->dqi_usable_bs >> 2; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 780bb90c1804..f49b72ccac4c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2584,9 +2584,7 @@ static int reiserfs_write_full_page(struct page *page, clear_buffer_dirty(bh); set_buffer_uptodate(bh); } else if ((checked || buffer_dirty(bh)) && - (!buffer_mapped(bh) || (buffer_mapped(bh) - && bh->b_blocknr == - 0))) { + (!buffer_mapped(bh) || bh->b_blocknr == 0)) { /* * not mapped yet, or it points to a direct item, search * the btree for the mapping info, and log any direct diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9edc8e2b154e..0834b101c316 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2758,6 +2758,20 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } + /* + * Sanity check to see if journal first block is correct. + * If journal first block is invalid it can cause + * zeroing important superblock members. + */ + if (!SB_ONDISK_JOURNAL_DEVICE(sb) && + SB_ONDISK_JOURNAL_1st_BLOCK(sb) < SB_JOURNAL_1st_RESERVED_BLOCK(sb)) { + reiserfs_warning(sb, "journal-1393", + "journal 1st super block is invalid: 1st reserved block %d, but actual 1st block is %d", + SB_JOURNAL_1st_RESERVED_BLOCK(sb), + SB_ONDISK_JOURNAL_1st_BLOCK(sb)); + goto free_and_return; + } + if (journal_init_dev(sb, journal, j_dev_name) != 0) { reiserfs_warning(sb, "sh-462", "unable to initialize journal device"); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 017db70d0f48..3d7a35d6a18b 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -132,6 +132,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, return IO_ERROR; } PATH_LAST_POSITION(path)--; + break; case ITEM_FOUND: break; diff --git a/fs/super.c b/fs/super.c index 11b7e7213fd1..91b7f156735b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1277,9 +1277,9 @@ int get_tree_bdev(struct fs_context *fc, } /* - * s_umount nests inside bd_mutex during + * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop + * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. */ @@ -1352,9 +1352,9 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } /* - * s_umount nests inside bd_mutex during + * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop + * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. */ diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 3ae9f1e91984..7c7c9bbbfa57 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -934,6 +934,10 @@ static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, iinfo->i_location.partitionReferenceNum, 0); epos.bh = udf_tgetblk(sb, block); + if (unlikely(!epos.bh)) { + err = -ENOMEM; + goto out_no_entry; + } lock_buffer(epos.bh); memset(epos.bh->b_data, 0x00, bsize); set_buffer_uptodate(epos.bh); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 19ebae443ade..f6e0f0c0d0e5 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -337,7 +337,7 @@ out: return ret; } -static inline long userfaultfd_get_blocking_state(unsigned int flags) +static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags) { if (flags & FAULT_FLAG_INTERRUPTIBLE) return TASK_INTERRUPTIBLE; @@ -370,7 +370,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; bool must_wait; - long blocking_state; + unsigned int blocking_state; /* * We don't do userfault handling for the final child pid update. diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index bbfea8022a3b..e0d5cdc57cc2 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -366,7 +366,7 @@ xfs_ag_resv_alloc_extent( break; default: ASSERT(0); - /* fall through */ + fallthrough; case XFS_AG_RESV_NONE: field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : XFS_TRANS_SB_FDBLOCKS; @@ -408,7 +408,7 @@ xfs_ag_resv_free_extent( break; default: ASSERT(0); - /* fall through */ + fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); return; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 82b7cbb1f24f..af3d5f9271f6 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3174,7 +3174,7 @@ xfs_alloc_vextent( } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); args->type = XFS_ALLOCTYPE_NEAR_BNO; - /* FALLTHROUGH */ + fallthrough; case XFS_ALLOCTYPE_FIRST_AG: /* * Rotate through the allocation groups looking for a winner. diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 83ac9771bfb5..747ec77912c3 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -282,7 +282,7 @@ xfs_da3_node_read_verify( __this_address); break; } - /* fall through */ + fallthrough; case XFS_DA_NODE_MAGIC: fa = xfs_da3_node_verify(bp); if (fa) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 7a2f9b5f2db5..f96e84793cc9 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -86,6 +86,7 @@ xchk_superblock( case -ENOSYS: case -EFBIG: error = -EFSCORRUPTED; + fallthrough; default: break; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index b5ebf1d1b4db..77d5c4a0f09f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -271,7 +271,7 @@ xchk_bmap_iextent_xref( case XFS_DATA_FORK: if (xfs_is_reflink_inode(info->sc->ip)) break; - /* fall through */ + fallthrough; case XFS_ATTR_FORK: xchk_xref_is_not_shared(info->sc, agbno, irec->br_blockcount); diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index a94bd8122c60..bd1172358964 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -44,7 +44,7 @@ __xchk_btree_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= errflag; *error = 0; - /* fall through */ + fallthrough; default: if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) trace_xchk_ifork_btree_op_error(sc, cur, level, diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index be38c960da85..6cc92291c46f 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -83,7 +83,7 @@ __xchk_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= errflag; *error = 0; - /* fall through */ + fallthrough; default: trace_xchk_op_error(sc, agno, bno, *error, ret_ip); @@ -136,7 +136,7 @@ __xchk_fblock_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= errflag; *error = 0; - /* fall through */ + fallthrough; default: trace_xchk_file_op_error(sc, whichfork, offset, *error, ret_ip); @@ -696,7 +696,7 @@ xchk_get_inode( if (error) return -ENOENT; error = -EFSCORRUPTED; - /* fall through */ + fallthrough; default: trace_xchk_op_error(sc, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 653f3280e1c1..9f0dbb47c82c 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -47,7 +47,7 @@ xchk_da_process_error( /* Note the badness but don't abort. */ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; *error = 0; - /* fall through */ + fallthrough; default: trace_xchk_file_op_error(sc, ds->dargs.whichfork, xfs_dir2_da_to_db(ds->dargs.geo, diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index c2857d854c83..b8202dd08939 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -947,7 +947,7 @@ xrep_ino_dqattach( xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); - /* fall through */ + fallthrough; case -ESRCH: error = 0; break; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0936f3a96fe6..2988efa97e14 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -286,7 +286,7 @@ xfs_bmap_count_blocks( */ *count += btblocks - 1; - /* fall through */ + fallthrough; case XFS_DINODE_FMT_EXTENTS: *nextents = xfs_bmap_count_leaves(ifp, count); break; diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 465fd9e048d4..1da59bdff245 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -84,7 +84,7 @@ xfs_fs_encode_fh( case FILEID_INO32_GEN_PARENT: fid->i32.parent_ino = XFS_I(parent)->i_ino; fid->i32.parent_gen = parent->i_generation; - /*FALLTHRU*/ + fallthrough; case FILEID_INO32_GEN: fid->i32.ino = XFS_I(inode)->i_ino; fid->i32.gen = inode->i_generation; @@ -92,7 +92,7 @@ xfs_fs_encode_fh( case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: fid64->parent_ino = XFS_I(parent)->i_ino; fid64->parent_gen = parent->i_generation; - /*FALLTHRU*/ + fallthrough; case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: fid64->ino = XFS_I(inode)->i_ino; fid64->gen = inode->i_generation; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 396ef36dcd0a..3c0749ab9e40 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -863,7 +863,7 @@ xfs_break_layouts( error = xfs_break_dax_layouts(inode, &retry); if (error || retry) break; - /* fall through */ + fallthrough; case BREAK_WRITE: error = xfs_break_leased_layouts(inode, iolock, &retry); break; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e4c2da4566f1..1db99a909b23 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -877,7 +877,7 @@ xfs_init_new_inode( xfs_inode_inherit_flags(ip, pip); if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY)) xfs_inode_inherit_flags2(ip, pip); - /* FALLTHROUGH */ + fallthrough; case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_bytes = 0; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1fe4c1fc0aea..7b1796cede10 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -558,7 +558,7 @@ xfs_ioc_attrmulti_one( case ATTR_OP_REMOVE: value = NULL; *len = 0; - /* fall through */ + fallthrough; case ATTR_OP_SET: error = mnt_want_write_file(parfilp); if (error) @@ -1511,7 +1511,7 @@ xfs_ioc_getbmap( switch (cmd) { case XFS_IOC_GETBMAPA: bmx.bmv_iflags = BMV_IF_ATTRFORK; - /*FALLTHRU*/ + fallthrough; case XFS_IOC_GETBMAP: /* struct getbmap is a strict subset of struct getbmapx. */ recsize = sizeof(struct getbmap); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d154f42e2dc6..d8cd2583dedb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1036,7 +1036,7 @@ retry: prealloc_blocks = 0; goto retry; } - /*FALLTHRU*/ + fallthrough; default: goto out_unlock; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c19a82adea1e..a002425377b5 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2626,6 +2626,7 @@ xlog_covered_state( case XLOG_STATE_COVER_IDLE: if (iclogs_changed == 1) return XLOG_STATE_COVER_IDLE; + fallthrough; case XLOG_STATE_COVER_NEED: case XLOG_STATE_COVER_NEED2: break; diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 7ec1a9207517..bb9860ec9a93 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -2,6 +2,8 @@ #ifndef __XFS_MESSAGE_H #define __XFS_MESSAGE_H 1 +#include <linux/once_lite.h> + struct xfs_mount; extern __printf(2, 3) @@ -41,16 +43,7 @@ do { \ } while (0) #define xfs_printk_once(func, dev, fmt, ...) \ -({ \ - static bool __section(".data.once") __print_once; \ - bool __ret_print_once = !__print_once; \ - \ - if (!__print_once) { \ - __print_once = true; \ - func(dev, fmt, ##__VA_ARGS__); \ - } \ - unlikely(__ret_print_once); \ -}) + DO_ONCE_LITE(func, dev, fmt, ##__VA_ARGS__) #define xfs_emerg_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 9aced0a00003..d11d032da0b4 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -294,7 +294,7 @@ xfs_trans_read_buf_map( default: if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); - /* fall through */ + fallthrough; case -ENOMEM: case -EAGAIN: return error; |