diff options
Diffstat (limited to 'fs')
141 files changed, 3166 insertions, 2162 deletions
diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 4dd97afa536c..5219182e52e1 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1358,6 +1358,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir, op->dentry = dentry; op->create.mode = S_IFDIR | mode; op->create.reason = afs_edit_dir_for_mkdir; + op->mtime = current_time(dir); op->ops = &afs_mkdir_operation; return afs_do_sync_operation(op); } @@ -1661,6 +1662,7 @@ static int afs_create(struct mnt_idmap *idmap, struct inode *dir, op->dentry = dentry; op->create.mode = S_IFREG | mode; op->create.reason = afs_edit_dir_for_create; + op->mtime = current_time(dir); op->ops = &afs_create_operation; return afs_do_sync_operation(op); @@ -1796,6 +1798,7 @@ static int afs_symlink(struct mnt_idmap *idmap, struct inode *dir, op->ops = &afs_symlink_operation; op->create.reason = afs_edit_dir_for_symlink; op->create.symlink = content; + op->mtime = current_time(dir); return afs_do_sync_operation(op); error: diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c index d1c7068b4346..58452b86e672 100644 --- a/fs/afs/vl_probe.c +++ b/fs/afs/vl_probe.c @@ -115,8 +115,8 @@ responded: } } - if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && - rtt_us < server->probe.rtt) { + rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); + if (rtt_us < server->probe.rtt) { server->probe.rtt = rtt_us; server->rtt = rtt_us; alist->preferred = index; diff --git a/fs/afs/write.c b/fs/afs/write.c index c822d6006033..8750b99c3f56 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -731,6 +731,7 @@ static int afs_writepages_region(struct address_space *mapping, * (changing page->mapping to NULL), or even swizzled * back from swapper_space to tmpfs file mapping */ +try_again: if (wbc->sync_mode != WB_SYNC_NONE) { ret = folio_lock_killable(folio); if (ret < 0) { @@ -757,12 +758,14 @@ static int afs_writepages_region(struct address_space *mapping, #ifdef CONFIG_AFS_FSCACHE folio_wait_fscache(folio); #endif - } else { - start += folio_size(folio); + goto try_again; } + + start += folio_size(folio); if (wbc->sync_mode == WB_SYNC_NONE) { if (skips >= 5 || need_resched()) { *_next = start; + folio_batch_release(&fbatch); _leave(" = 0 [%llx]", *_next); return 0; } @@ -530,7 +530,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) for (i = 0; i < nr_pages; i++) { struct page *page; page = find_or_create_page(file->f_mapping, - i, GFP_HIGHUSER | __GFP_ZERO); + i, GFP_USER | __GFP_ZERO); if (!page) break; pr_debug("pid(%d) page[%d]->count=%d\n", @@ -571,7 +571,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ ring->id = ~0U; ring->head = ring->tail = 0; @@ -579,7 +579,6 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; ring->header_length = sizeof(struct aio_ring); - kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); return 0; @@ -682,9 +681,8 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) * we are protected from page migration * changes ring_pages by ->ring_lock. */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); ring->id = ctx->id; - kunmap_atomic(ring); return 0; } @@ -1025,9 +1023,8 @@ static void user_refill_reqs_available(struct kioctx *ctx) * against ctx->completed_events below will make sure we do the * safe/right thing. */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); head = ring->head; - kunmap_atomic(ring); refill_reqs_available(ctx, head, ctx->tail); } @@ -1133,12 +1130,11 @@ static void aio_complete(struct aio_kiocb *iocb) if (++tail >= ctx->nr_events) tail = 0; - ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); + ev_page = page_address(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; *event = iocb->ki_res; - kunmap_atomic(ev_page); flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %Lx %Lx\n", ctx, tail, iocb, @@ -1152,10 +1148,9 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->tail = tail; - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); head = ring->head; ring->tail = tail; - kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); ctx->completed_events++; @@ -1215,10 +1210,9 @@ static long aio_read_events_ring(struct kioctx *ctx, mutex_lock(&ctx->ring_lock); /* Access to ->ring_pages here is protected by ctx->ring_lock. */ - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); head = ring->head; tail = ring->tail; - kunmap_atomic(ring); /* * Ensure that once we've read the current tail pointer, that @@ -1250,10 +1244,9 @@ static long aio_read_events_ring(struct kioctx *ctx, avail = min(avail, nr - ret); avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); - ev = kmap(page); + ev = page_address(page); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); - kunmap(page); if (unlikely(copy_ret)) { ret = -EFAULT; @@ -1265,9 +1258,8 @@ static long aio_read_events_ring(struct kioctx *ctx, head %= ctx->nr_events; } - ring = kmap_atomic(ctx->ring_pages[0]); + ring = page_address(ctx->ring_pages[0]); ring->head = head; - kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); pr_debug("%li h%u t%u\n", ret, head, tail); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 6baf90b08e0e..93046c9dc461 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; - dir->i_mtime = current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); return 0; } @@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); - dir->i_mtime = current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); @@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); - dir->i_mtime = current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); return 0; } diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 590b03560265..e97af2e510c3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1973,7 +1973,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, /* For RAID5/6 adjust to a full IO stripe length */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); if (!buf) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2b1b227505f3..dabc79c1af1b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -242,7 +242,6 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - u64 start = eb->start; int i, num_pages = num_extent_pages(eb); int ret = 0; @@ -251,12 +250,14 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; + u64 start = max_t(u64, eb->start, page_offset(p)); + u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE); + u32 len = end - start; - ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE, - start, p, start - page_offset(p), mirror_num); + ret = btrfs_repair_io_failure(fs_info, 0, start, len, + start, p, offset_in_page(start), mirror_num); if (ret) break; - start += PAGE_SIZE; } return ret; @@ -995,13 +996,18 @@ int btrfs_global_root_insert(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *tmp; + int ret = 0; write_lock(&fs_info->global_root_lock); tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp); write_unlock(&fs_info->global_root_lock); - ASSERT(!tmp); - return tmp ? -EEXIST : 0; + if (tmp) { + ret = -EEXIST; + btrfs_warn(fs_info, "global root %llu %llu already exists", + root->root_key.objectid, root->root_key.offset); + } + return ret; } void btrfs_global_root_delete(struct btrfs_root *root) @@ -2841,6 +2847,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* We can't trust the free space cache either */ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_warn(fs_info, "try to load backup roots slot %d", i); ret = read_backup_root(fs_info, i); backup_index = ret; if (ret < 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 19c707bc8801..7fcafcc5292c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1864,7 +1864,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), key->offset - args->extent_offset, - args->disk_bytenr, false, path); + args->disk_bytenr, args->strict, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -7264,7 +7264,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, static int btrfs_get_blocks_direct_write(struct extent_map **map, struct inode *inode, struct btrfs_dio_data *dio_data, - u64 start, u64 len, + u64 start, u64 *lenp, unsigned int iomap_flags) { const bool nowait = (iomap_flags & IOMAP_NOWAIT); @@ -7275,6 +7275,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, struct btrfs_block_group *bg; bool can_nocow = false; bool space_reserved = false; + u64 len = *lenp; u64 prev_len; int ret = 0; @@ -7345,15 +7346,19 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, free_extent_map(em); *map = NULL; - if (nowait) - return -EAGAIN; + if (nowait) { + ret = -EAGAIN; + goto out; + } /* * If we could not allocate data space before locking the file * range and we can't do a NOCOW write, then we have to fail. */ - if (!dio_data->data_space_reserved) - return -ENOSPC; + if (!dio_data->data_space_reserved) { + ret = -ENOSPC; + goto out; + } /* * We have to COW and we have already reserved data space before, @@ -7394,6 +7399,7 @@ out: btrfs_delalloc_release_extents(BTRFS_I(inode), len); btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); } + *lenp = len; return ret; } @@ -7570,7 +7576,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, len, flags); + start, &len, flags); if (ret < 0) goto unlock_err; unlock_extents = true; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7c666517d3d3..16c228344cbb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -134,8 +134,14 @@ struct scrub_stripe { * The errors hit during the initial read of the stripe. * * Would be utilized for error reporting and repair. + * + * The remaining init_nr_* records the number of errors hit, only used + * by error reporting. */ unsigned long init_error_bitmap; + unsigned int init_nr_io_errors; + unsigned int init_nr_csum_errors; + unsigned int init_nr_meta_errors; /* * The following error bitmaps are all for the current status. @@ -1003,12 +1009,9 @@ skip: sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; sctx->stat.no_csum += nr_nodatacsum_sectors; - sctx->stat.read_errors += - bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors); - sctx->stat.csum_errors += - bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors); - sctx->stat.verify_errors += - bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); + sctx->stat.read_errors += stripe->init_nr_io_errors; + sctx->stat.csum_errors += stripe->init_nr_csum_errors; + sctx->stat.verify_errors += stripe->init_nr_meta_errors; sctx->stat.uncorrectable_errors += bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); sctx->stat.corrected_errors += nr_repaired_sectors; @@ -1041,6 +1044,12 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); /* Save the initial failed bitmap for later repair and report usage. */ stripe->init_error_bitmap = stripe->error_bitmap; + stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap, + stripe->nr_sectors); + stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap, + stripe->nr_sectors); + stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, + stripe->nr_sectors); if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) goto out; @@ -1295,7 +1304,7 @@ static int get_raid56_logic_offset(u64 physical, int num, u32 stripe_index; u32 rot; - *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT); + *offset = last_offset + btrfs_stripe_nr_to_offset(i); stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; @@ -1310,7 +1319,7 @@ static int get_raid56_logic_offset(u64 physical, int num, if (stripe_index < num) j++; } - *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT); + *offset = last_offset + btrfs_stripe_nr_to_offset(j); return 1; } @@ -1490,6 +1499,9 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) { stripe->extent_sector_bitmap = 0; stripe->init_error_bitmap = 0; + stripe->init_nr_io_errors = 0; + stripe->init_nr_csum_errors = 0; + stripe->init_nr_meta_errors = 0; stripe->error_bitmap = 0; stripe->io_error_bitmap = 0; stripe->csum_error_bitmap = 0; @@ -1703,7 +1715,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, - nr_stripes << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(nr_stripes)); for (int i = 0; i < nr_stripes; i++) { stripe = &sctx->stripes[i]; scrub_submit_initial_read(sctx, stripe); @@ -1730,7 +1742,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) break; } } - } else { + } else if (!sctx->readonly) { for (int i = 0; i < nr_stripes; i++) { unsigned long repaired; @@ -1826,7 +1838,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bool all_empty = true; const int data_stripes = nr_data_stripes(map); unsigned long extent_bitmap = 0; - u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT; + u64 length = btrfs_stripe_nr_to_offset(data_stripes); int ret; ASSERT(sctx->raid56_data_stripes); @@ -1841,13 +1853,13 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; stripe_index = (i + rot) % map->num_stripes; physical = map->stripes[stripe_index].physical + - (rot << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(rot); scrub_reset_stripe(stripe); set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); ret = scrub_find_fill_first_stripe(bg, map->stripes[stripe_index].dev, physical, 1, - full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT), + full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); if (ret < 0) goto out; @@ -1857,7 +1869,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, */ if (ret > 0) { stripe->logical = full_stripe_start + - (i << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(i); stripe->dev = map->stripes[stripe_index].dev; stripe->mirror_num = 1; set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); @@ -2050,7 +2062,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); - return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + return btrfs_stripe_nr_to_offset(map->num_stripes / map->sub_stripes); } /* Get the logical bytenr for the stripe */ @@ -2066,7 +2078,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, * (stripe_index / sub_stripes) gives how many data stripes we need to * skip. */ - return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) + + return btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes) + bg->start; } @@ -2192,7 +2204,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); - offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + offset = btrfs_stripe_nr_to_offset(stripe_index / map->sub_stripes); goto out; } @@ -2207,7 +2219,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, /* Initialize @offset in case we need to go to out: label */ get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); - increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + increment = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); /* * Due to the rotation, for RAID56 it's better to iterate each stripe @@ -2254,7 +2266,7 @@ next: } out: ret2 = flush_scrub_stripes(sctx); - if (!ret2) + if (!ret) ret = ret2; if (sctx->raid56_data_stripes) { for (int i = 0; i < nr_data_stripes(map); i++) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ec18e2210602..efeb1a9d040a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1841,6 +1841,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_clear_sb_rdonly(sb); set_bit(BTRFS_FS_OPEN, &fs_info->flags); + + /* + * If we've gone from readonly -> read/write, we need to get + * our sync/async discard lists in the right state. + */ + btrfs_discard_resume(fs_info); } out: /* diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index e2b54793bf0c..2138e9fc0564 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -857,10 +857,10 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, * * Thus it should be a good way to catch obvious bitflips. */ - if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) { + if (unlikely(length >= btrfs_stripe_nr_to_offset(U32_MAX))) { chunk_err(leaf, chunk, logical, "chunk length too large: have %llu limit %llu", - length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT); + length, btrfs_stripe_nr_to_offset(U32_MAX)); return -EUCLEAN; } if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 841e799dece5..72a838c97534 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5125,7 +5125,7 @@ static void init_alloc_chunk_ctl_policy_regular( /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), ctl->max_chunk_size); - ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT; + ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes); } static void init_alloc_chunk_ctl_policy_zoned( @@ -5801,7 +5801,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, if (!WARN_ON(IS_ERR(em))) { map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); free_extent_map(em); } return len; @@ -5975,12 +5975,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr); stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> BTRFS_STRIPE_LEN_SHIFT; stripe_cnt = stripe_nr_end - stripe_nr; - stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - + stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) - (offset + length); /* * after this, stripe_nr is the number of stripes on this @@ -6023,12 +6023,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, for (i = 0; i < *num_stripes; i++) { stripes[i].physical = map->stripes[stripe_index].physical + - stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT; + stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev); if (i / sub_stripes < remaining_stripes) stripes[i].length += BTRFS_STRIPE_LEN; @@ -6183,8 +6183,8 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, ASSERT(*stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = nr_data_stripes(map) << - BTRFS_STRIPE_LEN_SHIFT; + unsigned long full_stripe_len = + btrfs_stripe_nr_to_offset(nr_data_stripes(map)); /* * For full stripe start, we use previously calculated @@ -6196,9 +6196,11 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * not ensured to be power of 2. */ *full_stripe_start = - rounddown(*stripe_nr, nr_data_stripes(map)) << - BTRFS_STRIPE_LEN_SHIFT; + btrfs_stripe_nr_to_offset( + rounddown(*stripe_nr, nr_data_stripes(map))); + ASSERT(*full_stripe_start + full_stripe_len > offset); + ASSERT(*full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. @@ -6221,7 +6223,7 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * { dst->dev = map->stripes[stripe_index].dev; dst->physical = map->stripes[stripe_index].physical + - stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); } int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -6343,7 +6345,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* Return the length to the full stripe end */ *length = min(logical + *length, raid56_full_stripe_start + em->start + - (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical; + btrfs_stripe_nr_to_offset(data_stripes)) - + logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6433,7 +6436,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * modulo, to reduce one modulo call. */ bioc->full_stripe_logical = em->start + - ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); for (i = 0; i < num_stripes; i++) set_io_stripe(&bioc->stripes[i], map, (i + stripe_nr) % num_stripes, @@ -8030,7 +8033,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc, for (i = 0; i < data_stripes; i++) { u64 stripe_start = bioc->full_stripe_logical + - (i << BTRFS_STRIPE_LEN_SHIFT); + btrfs_stripe_nr_to_offset(i); if (logical >= stripe_start && logical < stripe_start + BTRFS_STRIPE_LEN) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bf47a1a70813..64066d48dce1 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -574,6 +574,17 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } +/* + * Do the type safe converstion from stripe_nr to offset inside the chunk. + * + * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger + * than 4G. This does the proper type cast to avoid overflow. + */ +static inline u64 btrfs_stripe_nr_to_offset(u32 stripe_nr) +{ + return (u64)stripe_nr << BTRFS_STRIPE_LEN_SHIFT; +} + void btrfs_get_bioc(struct btrfs_io_context *bioc); void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, diff --git a/fs/buffer.c b/fs/buffer.c index a7fc561758b1..fe64356e89b8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -111,7 +111,6 @@ void buffer_check_dirty_writeback(struct folio *folio, bh = bh->b_this_page; } while (bh != head); } -EXPORT_SYMBOL(buffer_check_dirty_writeback); /* * Block until a buffer comes unlocked. This doesn't stop it diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 82219a8f6084..d9d22d0ec38a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -451,9 +451,10 @@ struct file *cachefiles_create_tmpfile(struct cachefiles_object *object) ret = cachefiles_inject_write_error(); if (ret == 0) { - file = vfs_tmpfile_open(&nop_mnt_idmap, &parentpath, S_IFREG, - O_RDWR | O_LARGEFILE | O_DIRECT, - cache->cache_cred); + file = kernel_tmpfile_open(&nop_mnt_idmap, &parentpath, + S_IFREG | 0600, + O_RDWR | O_LARGEFILE | O_DIRECT, + cache->cache_cred); ret = PTR_ERR_OR_ZERO(file); } if (ret) { @@ -560,8 +561,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, */ path.mnt = cache->mnt; path.dentry = dentry; - file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, - d_backing_inode(dentry), cache->cache_cred); + file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_backing_inode(dentry), cache->cache_cred); if (IS_ERR(file)) { trace_cachefiles_vfs_error(object, d_backing_inode(dentry), PTR_ERR(file), diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 789be30d6ee2..2321e5ddb664 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1627,6 +1627,7 @@ void ceph_flush_snaps(struct ceph_inode_info *ci, struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_session *session = NULL; + bool need_put = false; int mds; dout("ceph_flush_snaps %p\n", inode); @@ -1671,8 +1672,13 @@ out: ceph_put_mds_session(session); /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); + if (!list_empty(&ci->i_snap_flush_item)) + need_put = true; list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); + + if (need_put) + iput(inode); } /* diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0b236ebd989f..2e73ba62bd7a 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -693,8 +693,10 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->size); spin_lock(&mdsc->snap_flush_lock); - if (list_empty(&ci->i_snap_flush_item)) + if (list_empty(&ci->i_snap_flush_item)) { + ihold(inode); list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + } spin_unlock(&mdsc->snap_flush_lock); return 1; /* caller may want to ceph_flush_snaps */ } diff --git a/fs/char_dev.c b/fs/char_dev.c index 13deb45f1ec6..950b6919fb87 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -150,7 +150,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; - strlcpy(cd->name, name, sizeof(cd->name)); + strscpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; diff --git a/fs/coredump.c b/fs/coredump.c index 88740c51b942..9d235fa14ab9 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -648,7 +648,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } else { struct mnt_idmap *idmap; struct inode *inode; - int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | + int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL; if (cprm.limit < binfmt->min_coredump) diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 7ab5a7b7eef8..2d63da48635a 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -171,7 +171,7 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) */ struct fscrypt_symlink_data { __le16 len; - char encrypted_path[1]; + char encrypted_path[]; } __packed; /** diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 9e786ae66a13..6238dbcadcad 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -255,10 +255,10 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target, * for now since filesystems will assume it is there and subtract it. */ if (!__fscrypt_fname_encrypted_size(policy, len, - max_len - sizeof(struct fscrypt_symlink_data), + max_len - sizeof(struct fscrypt_symlink_data) - 1, &disk_link->len)) return -ENAMETOOLONG; - disk_link->len += sizeof(struct fscrypt_symlink_data); + disk_link->len += sizeof(struct fscrypt_symlink_data) + 1; disk_link->name = NULL; return 0; @@ -289,7 +289,7 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, if (!sd) return -ENOMEM; } - ciphertext_len = disk_link->len - sizeof(*sd); + ciphertext_len = disk_link->len - sizeof(*sd) - 1; sd->len = cpu_to_le16(ciphertext_len); err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, @@ -367,7 +367,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, * the ciphertext length, even though this is redundant with i_size. */ - if (max_size < sizeof(*sd)) + if (max_size < sizeof(*sd) + 1) return ERR_PTR(-EUCLEAN); sd = caddr; cstr.name = (unsigned char *)sd->encrypted_path; @@ -376,7 +376,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr, if (cstr.len == 0) return ERR_PTR(-EUCLEAN); - if (cstr.len + sizeof(*sd) - 1 > max_size) + if (cstr.len + sizeof(*sd) > max_size) return ERR_PTR(-EUCLEAN); err = fscrypt_fname_alloc_buffer(cstr.len, &pstr); diff --git a/fs/d_path.c b/fs/d_path.c index 56a6ee4c6331..5f4da5c8d5db 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/prefetch.h> #include "mount.h" +#include "internal.h" struct prepend_buffer { char *buf; diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 26fa170090b8..b1b846504027 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -89,8 +89,7 @@ static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool); +extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 7021e2cf6146..2a29943fa5cc 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -363,7 +363,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, return 0; } -static struct z_erofs_decompressor decompressors[] = { +const struct z_erofs_decompressor erofs_decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { .decompress = z_erofs_transform_plain, .name = "shifted" @@ -383,9 +383,3 @@ static struct z_erofs_decompressor decompressors[] = { }, #endif }; - -int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct page **pagepool) -{ - return decompressors[rq->alg].decompress(rq, pagepool); -} diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1e39c03357d1..36e32fa542f0 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -208,46 +208,12 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) - /* basic unit of the workstation of a super_block */ struct erofs_workgroup { - /* the workgroup index in the workstation */ pgoff_t index; - - /* overall workgroup reference count */ - atomic_t refcount; + struct lockref lockref; }; -static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, - int val) -{ - preempt_disable(); - if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) { - preempt_enable(); - return false; - } - return true; -} - -static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, - int orig_val) -{ - /* - * other observers should notice all modifications - * in the freezing period. - */ - smp_mb(); - atomic_set(&grp->refcount, orig_val); - preempt_enable(); -} - -static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) -{ - return atomic_cond_read_relaxed(&grp->refcount, - VAL != EROFS_LOCKED_MAGIC); -} - enum erofs_kmap_type { EROFS_NO_KMAP, /* don't map the buffer */ EROFS_KMAP, /* use kmap_local_page() to map the buffer */ @@ -486,7 +452,7 @@ static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP -int erofs_workgroup_put(struct erofs_workgroup *grp); +void erofs_workgroup_put(struct erofs_workgroup *grp); struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, pgoff_t index); struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, @@ -500,7 +466,6 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); @@ -511,6 +476,7 @@ void erofs_put_pcpubuf(void *ptr); int erofs_pcpubuf_growsize(unsigned int nrpages); void __init erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); +int erofs_init_managed_cache(struct super_block *sb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -530,6 +496,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } static inline void erofs_pcpubuf_init(void) {} static inline void erofs_pcpubuf_exit(void) {} +static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 811ab66d805e..ff18f6b14de5 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -599,68 +599,6 @@ static int erofs_fc_parse_param(struct fs_context *fc, return 0; } -#ifdef CONFIG_EROFS_FS_ZIP -static const struct address_space_operations managed_cache_aops; - -static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp) -{ - bool ret = true; - struct address_space *const mapping = folio->mapping; - - DBG_BUGON(!folio_test_locked(folio)); - DBG_BUGON(mapping->a_ops != &managed_cache_aops); - - if (folio_test_private(folio)) - ret = erofs_try_to_free_cached_page(&folio->page); - - return ret; -} - -/* - * It will be called only on inode eviction. In case that there are still some - * decompression requests in progress, wait with rescheduling for a bit here. - * We could introduce an extra locking instead but it seems unnecessary. - */ -static void erofs_managed_cache_invalidate_folio(struct folio *folio, - size_t offset, size_t length) -{ - const size_t stop = length + offset; - - DBG_BUGON(!folio_test_locked(folio)); - - /* Check for potential overflow in debug mode */ - DBG_BUGON(stop > folio_size(folio) || stop < length); - - if (offset == 0 && stop == folio_size(folio)) - while (!erofs_managed_cache_release_folio(folio, GFP_NOFS)) - cond_resched(); -} - -static const struct address_space_operations managed_cache_aops = { - .release_folio = erofs_managed_cache_release_folio, - .invalidate_folio = erofs_managed_cache_invalidate_folio, -}; - -static int erofs_init_managed_cache(struct super_block *sb) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - struct inode *const inode = new_inode(sb); - - if (!inode) - return -ENOMEM; - - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - - inode->i_mapping->a_ops = &managed_cache_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - sbi->managed_cache = inode; - return 0; -} -#else -static int erofs_init_managed_cache(struct super_block *sb) { return 0; } -#endif - static struct inode *erofs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -1016,10 +954,8 @@ static int __init erofs_module_init(void) sizeof(struct erofs_inode), 0, SLAB_RECLAIM_ACCOUNT, erofs_inode_init_once); - if (!erofs_inode_cachep) { - err = -ENOMEM; - goto icache_err; - } + if (!erofs_inode_cachep) + return -ENOMEM; err = erofs_init_shrinker(); if (err) @@ -1054,7 +990,6 @@ lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); -icache_err: return err; } diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 46627cb69abe..cc6fb9e98899 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -4,7 +4,6 @@ * https://www.huawei.com/ */ #include "internal.h" -#include <linux/pagevec.h> struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) { @@ -33,22 +32,21 @@ void erofs_release_pages(struct page **pagepool) /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; -static int erofs_workgroup_get(struct erofs_workgroup *grp) +static bool erofs_workgroup_get(struct erofs_workgroup *grp) { - int o; + if (lockref_get_not_zero(&grp->lockref)) + return true; -repeat: - o = erofs_wait_on_workgroup_freezed(grp); - if (o <= 0) - return -1; - - if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o) - goto repeat; + spin_lock(&grp->lockref.lock); + if (__lockref_is_dead(&grp->lockref)) { + spin_unlock(&grp->lockref.lock); + return false; + } - /* decrease refcount paired by erofs_workgroup_put */ - if (o == 1) + if (!grp->lockref.count++) atomic_long_dec(&erofs_global_shrink_cnt); - return 0; + spin_unlock(&grp->lockref.lock); + return true; } struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, @@ -61,7 +59,7 @@ repeat: rcu_read_lock(); grp = xa_load(&sbi->managed_pslots, index); if (grp) { - if (erofs_workgroup_get(grp)) { + if (!erofs_workgroup_get(grp)) { /* prefer to relax rcu read side */ rcu_read_unlock(); goto repeat; @@ -80,11 +78,10 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb, struct erofs_workgroup *pre; /* - * Bump up a reference count before making this visible - * to others for the XArray in order to avoid potential - * UAF without serialized by xa_lock. + * Bump up before making this visible to others for the XArray in order + * to avoid potential UAF without serialized by xa_lock. */ - atomic_inc(&grp->refcount); + lockref_get(&grp->lockref); repeat: xa_lock(&sbi->managed_pslots); @@ -93,13 +90,13 @@ repeat: if (pre) { if (xa_is_err(pre)) { pre = ERR_PTR(xa_err(pre)); - } else if (erofs_workgroup_get(pre)) { + } else if (!erofs_workgroup_get(pre)) { /* try to legitimize the current in-tree one */ xa_unlock(&sbi->managed_pslots); cond_resched(); goto repeat; } - atomic_dec(&grp->refcount); + lockref_put_return(&grp->lockref); grp = pre; } xa_unlock(&sbi->managed_pslots); @@ -112,38 +109,34 @@ static void __erofs_workgroup_free(struct erofs_workgroup *grp) erofs_workgroup_free_rcu(grp); } -int erofs_workgroup_put(struct erofs_workgroup *grp) +void erofs_workgroup_put(struct erofs_workgroup *grp) { - int count = atomic_dec_return(&grp->refcount); + if (lockref_put_or_lock(&grp->lockref)) + return; - if (count == 1) + DBG_BUGON(__lockref_is_dead(&grp->lockref)); + if (grp->lockref.count == 1) atomic_long_inc(&erofs_global_shrink_cnt); - else if (!count) - __erofs_workgroup_free(grp); - return count; + --grp->lockref.count; + spin_unlock(&grp->lockref.lock); } static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, struct erofs_workgroup *grp) { - /* - * If managed cache is on, refcount of workgroups - * themselves could be < 0 (freezed). In other words, - * there is no guarantee that all refcounts > 0. - */ - if (!erofs_workgroup_try_to_freeze(grp, 1)) - return false; + int free = false; + + spin_lock(&grp->lockref.lock); + if (grp->lockref.count) + goto out; /* - * Note that all cached pages should be unattached - * before deleted from the XArray. Otherwise some - * cached pages could be still attached to the orphan - * old workgroup when the new one is available in the tree. + * Note that all cached pages should be detached before deleted from + * the XArray. Otherwise some cached pages could be still attached to + * the orphan old workgroup when the new one is available in the tree. */ - if (erofs_try_to_free_all_cached_pages(sbi, grp)) { - erofs_workgroup_unfreeze(grp, 1); - return false; - } + if (erofs_try_to_free_all_cached_pages(sbi, grp)) + goto out; /* * It's impossible to fail after the workgroup is freezed, @@ -152,10 +145,13 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, */ DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); - /* last refcount should be connected with its managed pslot. */ - erofs_workgroup_unfreeze(grp, 0); - __erofs_workgroup_free(grp); - return true; + lockref_mark_dead(&grp->lockref); + free = true; +out: + spin_unlock(&grp->lockref.lock); + if (free) + __erofs_workgroup_free(grp); + return free; } static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index bbfe7ce170d2..40178b6e0688 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -7,32 +7,27 @@ #include <linux/security.h> #include "xattr.h" -static inline erofs_blk_t erofs_xattr_blkaddr(struct super_block *sb, - unsigned int xattr_id) -{ - return EROFS_SB(sb)->xattr_blkaddr + - erofs_blknr(sb, xattr_id * sizeof(__u32)); -} - -static inline unsigned int erofs_xattr_blkoff(struct super_block *sb, - unsigned int xattr_id) -{ - return erofs_blkoff(sb, xattr_id * sizeof(__u32)); -} - -struct xattr_iter { +struct erofs_xattr_iter { struct super_block *sb; struct erofs_buf buf; + erofs_off_t pos; void *kaddr; - erofs_blk_t blkaddr; - unsigned int ofs; + char *buffer; + int buffer_size, buffer_ofs; + + /* getxattr */ + int index, infix_len; + struct qstr name; + + /* listxattr */ + struct dentry *dentry; }; static int erofs_init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); - struct xattr_iter it; + struct erofs_xattr_iter it; unsigned int i; struct erofs_xattr_ibody_header *ih; struct super_block *sb = inode->i_sb; @@ -81,17 +76,17 @@ static int erofs_init_inode_xattrs(struct inode *inode) } it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(sb, erofs_iloc(inode) + vi->inode_isize); - it.ofs = erofs_blkoff(sb, erofs_iloc(inode) + vi->inode_isize); + erofs_init_metabuf(&it.buf, sb); + it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), EROFS_KMAP); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; } - ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); + ih = it.kaddr + erofs_blkoff(sb, it.pos); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); @@ -102,26 +97,20 @@ static int erofs_init_inode_xattrs(struct inode *inode) } /* let's skip ibody header */ - it.ofs += sizeof(struct erofs_xattr_ibody_header); + it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - if (it.ofs >= sb->s_blocksize) { - /* cannot be unaligned */ - DBG_BUGON(it.ofs != sb->s_blocksize); - - it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr, - EROFS_KMAP); - if (IS_ERR(it.kaddr)) { - kfree(vi->xattr_shared_xattrs); - vi->xattr_shared_xattrs = NULL; - ret = PTR_ERR(it.kaddr); - goto out_unlock; - } - it.ofs = 0; + it.kaddr = erofs_bread(&it.buf, erofs_blknr(sb, it.pos), + EROFS_KMAP); + if (IS_ERR(it.kaddr)) { + kfree(vi->xattr_shared_xattrs); + vi->xattr_shared_xattrs = NULL; + ret = PTR_ERR(it.kaddr); + goto out_unlock; } - vi->xattr_shared_xattrs[i] = - le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); - it.ofs += sizeof(__le32); + vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *) + (it.kaddr + erofs_blkoff(sb, it.pos))); + it.pos += sizeof(__le32); } erofs_put_metabuf(&it.buf); @@ -134,287 +123,6 @@ out_unlock: return ret; } -/* - * the general idea for these return values is - * if 0 is returned, go on processing the current xattr; - * 1 (> 0) is returned, skip this round to process the next xattr; - * -err (< 0) is returned, an error (maybe ENOXATTR) occurred - * and need to be handled - */ -struct xattr_iter_handlers { - int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); - int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, - unsigned int len); - int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); - void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, - unsigned int len); -}; - -static inline int xattr_iter_fixup(struct xattr_iter *it) -{ - if (it->ofs < it->sb->s_blocksize) - return 0; - - it->blkaddr += erofs_blknr(it->sb, it->ofs); - it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr, - EROFS_KMAP); - if (IS_ERR(it->kaddr)) - return PTR_ERR(it->kaddr); - it->ofs = erofs_blkoff(it->sb, it->ofs); - return 0; -} - -static int inline_xattr_iter_begin(struct xattr_iter *it, - struct inode *inode) -{ - struct erofs_inode *const vi = EROFS_I(inode); - unsigned int xattr_header_sz, inline_xattr_ofs; - - xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + - sizeof(u32) * vi->xattr_shared_count; - if (xattr_header_sz >= vi->xattr_isize) { - DBG_BUGON(xattr_header_sz > vi->xattr_isize); - return -ENOATTR; - } - - inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - - it->blkaddr = erofs_blknr(it->sb, erofs_iloc(inode) + inline_xattr_ofs); - it->ofs = erofs_blkoff(it->sb, erofs_iloc(inode) + inline_xattr_ofs); - it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, - EROFS_KMAP); - if (IS_ERR(it->kaddr)) - return PTR_ERR(it->kaddr); - return vi->xattr_isize - xattr_header_sz; -} - -/* - * Regardless of success or failure, `xattr_foreach' will end up with - * `ofs' pointing to the next xattr item rather than an arbitrary position. - */ -static int xattr_foreach(struct xattr_iter *it, - const struct xattr_iter_handlers *op, - unsigned int *tlimit) -{ - struct erofs_xattr_entry entry; - unsigned int value_sz, processed, slice; - int err; - - /* 0. fixup blkaddr, ofs, ipage */ - err = xattr_iter_fixup(it); - if (err) - return err; - - /* - * 1. read xattr entry to the memory, - * since we do EROFS_XATTR_ALIGN - * therefore entry should be in the page - */ - entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); - if (tlimit) { - unsigned int entry_sz = erofs_xattr_entry_size(&entry); - - /* xattr on-disk corruption: xattr entry beyond xattr_isize */ - if (*tlimit < entry_sz) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - *tlimit -= entry_sz; - } - - it->ofs += sizeof(struct erofs_xattr_entry); - value_sz = le16_to_cpu(entry.e_value_size); - - /* handle entry */ - err = op->entry(it, &entry); - if (err) { - it->ofs += entry.e_name_len + value_sz; - goto out; - } - - /* 2. handle xattr name (ofs will finally be at the end of name) */ - processed = 0; - - while (processed < entry.e_name_len) { - if (it->ofs >= it->sb->s_blocksize) { - DBG_BUGON(it->ofs > it->sb->s_blocksize); - - err = xattr_iter_fixup(it); - if (err) - goto out; - it->ofs = 0; - } - - slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, - entry.e_name_len - processed); - - /* handle name */ - err = op->name(it, processed, it->kaddr + it->ofs, slice); - if (err) { - it->ofs += entry.e_name_len - processed + value_sz; - goto out; - } - - it->ofs += slice; - processed += slice; - } - - /* 3. handle xattr value */ - processed = 0; - - if (op->alloc_buffer) { - err = op->alloc_buffer(it, value_sz); - if (err) { - it->ofs += value_sz; - goto out; - } - } - - while (processed < value_sz) { - if (it->ofs >= it->sb->s_blocksize) { - DBG_BUGON(it->ofs > it->sb->s_blocksize); - - err = xattr_iter_fixup(it); - if (err) - goto out; - it->ofs = 0; - } - - slice = min_t(unsigned int, it->sb->s_blocksize - it->ofs, - value_sz - processed); - op->value(it, processed, it->kaddr + it->ofs, slice); - it->ofs += slice; - processed += slice; - } - -out: - /* xattrs should be 4-byte aligned (on-disk constraint) */ - it->ofs = EROFS_XATTR_ALIGN(it->ofs); - return err < 0 ? err : 0; -} - -struct getxattr_iter { - struct xattr_iter it; - - char *buffer; - int buffer_size, index, infix_len; - struct qstr name; -}; - -static int erofs_xattr_long_entrymatch(struct getxattr_iter *it, - struct erofs_xattr_entry *entry) -{ - struct erofs_sb_info *sbi = EROFS_SB(it->it.sb); - struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + - (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); - - if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) - return -ENOATTR; - - if (it->index != pf->prefix->base_index || - it->name.len != entry->e_name_len + pf->infix_len) - return -ENOATTR; - - if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) - return -ENOATTR; - - it->infix_len = pf->infix_len; - return 0; -} - -static int xattr_entrymatch(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - /* should also match the infix for long name prefixes */ - if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) - return erofs_xattr_long_entrymatch(it, entry); - - if (it->index != entry->e_name_index || - it->name.len != entry->e_name_len) - return -ENOATTR; - it->infix_len = 0; - return 0; -} - -static int xattr_namematch(struct xattr_iter *_it, - unsigned int processed, char *buf, unsigned int len) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - if (memcmp(buf, it->name.name + it->infix_len + processed, len)) - return -ENOATTR; - return 0; -} - -static int xattr_checkbuffer(struct xattr_iter *_it, - unsigned int value_sz) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - int err = it->buffer_size < value_sz ? -ERANGE : 0; - - it->buffer_size = value_sz; - return !it->buffer ? 1 : err; -} - -static void xattr_copyvalue(struct xattr_iter *_it, - unsigned int processed, - char *buf, unsigned int len) -{ - struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); - - memcpy(it->buffer + processed, buf, len); -} - -static const struct xattr_iter_handlers find_xattr_handlers = { - .entry = xattr_entrymatch, - .name = xattr_namematch, - .alloc_buffer = xattr_checkbuffer, - .value = xattr_copyvalue -}; - -static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) -{ - int ret; - unsigned int remaining; - - ret = inline_xattr_iter_begin(&it->it, inode); - if (ret < 0) - return ret; - - remaining = ret; - while (remaining) { - ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); - if (ret != -ENOATTR) - break; - } - return ret ? ret : it->buffer_size; -} - -static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) -{ - struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = it->it.sb; - unsigned int i, xsid; - int ret = -ENOATTR; - - for (i = 0; i < vi->xattr_shared_count; ++i) { - xsid = vi->xattr_shared_xattrs[i]; - it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); - it->it.ofs = erofs_xattr_blkoff(sb, xsid); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, - it->it.blkaddr, EROFS_KMAP); - if (IS_ERR(it->it.kaddr)) - return PTR_ERR(it->it.kaddr); - - ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); - if (ret != -ENOATTR) - break; - } - return ret ? ret : it->buffer_size; -} - static bool erofs_xattr_user_list(struct dentry *dentry) { return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); @@ -425,39 +133,6 @@ static bool erofs_xattr_trusted_list(struct dentry *dentry) return capable(CAP_SYS_ADMIN); } -int erofs_getxattr(struct inode *inode, int index, - const char *name, - void *buffer, size_t buffer_size) -{ - int ret; - struct getxattr_iter it; - - if (!name) - return -EINVAL; - - ret = erofs_init_inode_xattrs(inode); - if (ret) - return ret; - - it.index = index; - it.name.len = strlen(name); - if (it.name.len > EROFS_NAME_LEN) - return -ERANGE; - - it.it.buf = __EROFS_BUF_INITIALIZER; - it.name.name = name; - - it.buffer = buffer; - it.buffer_size = buffer_size; - - it.it.sb = inode->i_sb; - ret = inline_getxattr(inode, &it); - if (ret == -ENOATTR) - ret = shared_getxattr(inode, &it); - erofs_put_metabuf(&it.it.buf); - return ret; -} - static int erofs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) @@ -500,30 +175,49 @@ const struct xattr_handler *erofs_xattr_handlers[] = { NULL, }; -struct listxattr_iter { - struct xattr_iter it; - - struct dentry *dentry; - char *buffer; - int buffer_size, buffer_ofs; -}; +static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, + unsigned int len) +{ + unsigned int slice, processed; + struct super_block *sb = it->sb; + void *src; + + for (processed = 0; processed < len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + src = it->kaddr + erofs_blkoff(sb, it->pos); + slice = min_t(unsigned int, sb->s_blocksize - + erofs_blkoff(sb, it->pos), len - processed); + memcpy(it->buffer + it->buffer_ofs, src, slice); + it->buffer_ofs += slice; + it->pos += slice; + } + return 0; +} -static int xattr_entrylist(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) +static int erofs_listxattr_foreach(struct erofs_xattr_iter *it) { - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); - unsigned int base_index = entry->e_name_index; - unsigned int prefix_len, infix_len = 0; + struct erofs_xattr_entry entry; + unsigned int base_index, name_total, prefix_len, infix_len = 0; const char *prefix, *infix = NULL; + int err; + + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *) + (it->kaddr + erofs_blkoff(it->sb, it->pos)); + it->pos += sizeof(struct erofs_xattr_entry); - if (entry->e_name_index & EROFS_XATTR_LONG_PREFIX) { - struct erofs_sb_info *sbi = EROFS_SB(_it->sb); + base_index = entry.e_name_index; + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(it->sb); struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + - (entry->e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) - return 1; + return 0; infix = pf->prefix->infix; infix_len = pf->infix_len; base_index = pf->prefix->base_index; @@ -531,120 +225,228 @@ static int xattr_entrylist(struct xattr_iter *_it, prefix = erofs_xattr_prefix(base_index, it->dentry); if (!prefix) - return 1; + return 0; prefix_len = strlen(prefix); + name_total = prefix_len + infix_len + entry.e_name_len + 1; if (!it->buffer) { - it->buffer_ofs += prefix_len + infix_len + - entry->e_name_len + 1; - return 1; + it->buffer_ofs += name_total; + return 0; } - if (it->buffer_ofs + prefix_len + infix_len + - + entry->e_name_len + 1 > it->buffer_size) + if (it->buffer_ofs + name_total > it->buffer_size) return -ERANGE; memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); memcpy(it->buffer + it->buffer_ofs + prefix_len, infix, infix_len); it->buffer_ofs += prefix_len + infix_len; - return 0; -} -static int xattr_namelist(struct xattr_iter *_it, - unsigned int processed, char *buf, unsigned int len) -{ - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); + /* 2. handle xattr name */ + err = erofs_xattr_copy_to_buffer(it, entry.e_name_len); + if (err) + return err; - memcpy(it->buffer + it->buffer_ofs, buf, len); - it->buffer_ofs += len; + it->buffer[it->buffer_ofs++] = '\0'; return 0; } -static int xattr_skipvalue(struct xattr_iter *_it, - unsigned int value_sz) +static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) { - struct listxattr_iter *it = - container_of(_it, struct listxattr_iter, it); + struct super_block *sb = it->sb; + struct erofs_xattr_entry entry; + unsigned int slice, processed, value_sz; - it->buffer[it->buffer_ofs++] = '\0'; - return 1; -} + /* 1. handle xattr entry */ + entry = *(struct erofs_xattr_entry *) + (it->kaddr + erofs_blkoff(sb, it->pos)); + it->pos += sizeof(struct erofs_xattr_entry); + value_sz = le16_to_cpu(entry.e_value_size); -static const struct xattr_iter_handlers list_xattr_handlers = { - .entry = xattr_entrylist, - .name = xattr_namelist, - .alloc_buffer = xattr_skipvalue, - .value = NULL -}; + /* should also match the infix for long name prefixes */ + if (entry.e_name_index & EROFS_XATTR_LONG_PREFIX) { + struct erofs_sb_info *sbi = EROFS_SB(sb); + struct erofs_xattr_prefix_item *pf = sbi->xattr_prefixes + + (entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK); + + if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count) + return -ENOATTR; + + if (it->index != pf->prefix->base_index || + it->name.len != entry.e_name_len + pf->infix_len) + return -ENOATTR; + + if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len)) + return -ENOATTR; + + it->infix_len = pf->infix_len; + } else { + if (it->index != entry.e_name_index || + it->name.len != entry.e_name_len) + return -ENOATTR; -static int inline_listxattr(struct listxattr_iter *it) + it->infix_len = 0; + } + + /* 2. handle xattr name */ + for (processed = 0; processed < entry.e_name_len; processed += slice) { + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + slice = min_t(unsigned int, + sb->s_blocksize - erofs_blkoff(sb, it->pos), + entry.e_name_len - processed); + if (memcmp(it->name.name + it->infix_len + processed, + it->kaddr + erofs_blkoff(sb, it->pos), slice)) + return -ENOATTR; + it->pos += slice; + } + + /* 3. handle xattr value */ + if (!it->buffer) { + it->buffer_ofs = value_sz; + return 0; + } + + if (it->buffer_size < value_sz) + return -ERANGE; + + return erofs_xattr_copy_to_buffer(it, value_sz); +} + +static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) { + struct erofs_inode *const vi = EROFS_I(inode); + unsigned int xattr_header_sz, remaining, entry_sz; + erofs_off_t next_pos; int ret; - unsigned int remaining; - ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); - if (ret < 0) - return ret; + xattr_header_sz = sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * vi->xattr_shared_count; + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); + return -ENOATTR; + } + + remaining = vi->xattr_isize - xattr_header_sz; + it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; - remaining = ret; while (remaining) { - ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); - if (ret) + it->kaddr = erofs_bread(&it->buf, erofs_blknr(it->sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + entry_sz = erofs_xattr_entry_size(it->kaddr + + erofs_blkoff(it->sb, it->pos)); + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (remaining < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + remaining -= entry_sz; + next_pos = it->pos + entry_sz; + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) break; + + it->pos = next_pos; } - return ret ? ret : it->buffer_ofs; + return ret; } -static int shared_listxattr(struct listxattr_iter *it) +static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, + struct inode *inode, bool getxattr) { - struct inode *const inode = d_inode(it->dentry); struct erofs_inode *const vi = EROFS_I(inode); - struct super_block *const sb = it->it.sb; - unsigned int i, xsid; - int ret = 0; + struct super_block *const sb = it->sb; + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int i; + int ret = -ENOATTR; for (i = 0; i < vi->xattr_shared_count; ++i) { - xsid = vi->xattr_shared_xattrs[i]; - it->it.blkaddr = erofs_xattr_blkaddr(sb, xsid); - it->it.ofs = erofs_xattr_blkoff(sb, xsid); - it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, - it->it.blkaddr, EROFS_KMAP); - if (IS_ERR(it->it.kaddr)) - return PTR_ERR(it->it.kaddr); - - ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); - if (ret) + it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + + vi->xattr_shared_xattrs[i] * sizeof(__le32); + it->kaddr = erofs_bread(&it->buf, erofs_blknr(sb, it->pos), + EROFS_KMAP); + if (IS_ERR(it->kaddr)) + return PTR_ERR(it->kaddr); + + if (getxattr) + ret = erofs_getxattr_foreach(it); + else + ret = erofs_listxattr_foreach(it); + if ((getxattr && ret != -ENOATTR) || (!getxattr && ret)) break; } - return ret ? ret : it->buffer_ofs; + return ret; +} + +int erofs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size) +{ + int ret; + struct erofs_xattr_iter it; + + if (!name) + return -EINVAL; + + ret = erofs_init_inode_xattrs(inode); + if (ret) + return ret; + + it.index = index; + it.name = (struct qstr)QSTR_INIT(name, strlen(name)); + if (it.name.len > EROFS_NAME_LEN) + return -ERANGE; + + it.sb = inode->i_sb; + it.buf = __EROFS_BUF_INITIALIZER; + erofs_init_metabuf(&it.buf, it.sb); + it.buffer = buffer; + it.buffer_size = buffer_size; + it.buffer_ofs = 0; + + ret = erofs_xattr_iter_inline(&it, inode, true); + if (ret == -ENOATTR) + ret = erofs_xattr_iter_shared(&it, inode, true); + erofs_put_metabuf(&it.buf); + return ret ? ret : it.buffer_ofs; } -ssize_t erofs_listxattr(struct dentry *dentry, - char *buffer, size_t buffer_size) +ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret; - struct listxattr_iter it; + struct erofs_xattr_iter it; + struct inode *inode = d_inode(dentry); - ret = erofs_init_inode_xattrs(d_inode(dentry)); + ret = erofs_init_inode_xattrs(inode); if (ret == -ENOATTR) return 0; if (ret) return ret; - it.it.buf = __EROFS_BUF_INITIALIZER; + it.sb = dentry->d_sb; + it.buf = __EROFS_BUF_INITIALIZER; + erofs_init_metabuf(&it.buf, it.sb); it.dentry = dentry; it.buffer = buffer; it.buffer_size = buffer_size; it.buffer_ofs = 0; - it.it.sb = dentry->d_sb; - - ret = inline_listxattr(&it); - if (ret >= 0 || ret == -ENOATTR) - ret = shared_listxattr(&it); - erofs_put_metabuf(&it.it.buf); - return ret; + ret = erofs_xattr_iter_inline(&it, inode, false); + if (!ret || ret == -ENOATTR) + ret = erofs_xattr_iter_shared(&it, inode, false); + if (ret == -ENOATTR) + ret = 0; + erofs_put_metabuf(&it.buf); + return ret ? ret : it.buffer_ofs; } void erofs_xattr_prefixes_cleanup(struct super_block *sb) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 160b3da43aec..5f1890e309c6 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -5,7 +5,6 @@ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" -#include <linux/prefetch.h> #include <linux/psi.h> #include <linux/cpuhotplug.h> #include <trace/events/erofs.h> @@ -92,13 +91,8 @@ struct z_erofs_pcluster { struct z_erofs_bvec compressed_bvecs[]; }; -/* let's avoid the valid 32-bit kernel addresses */ - -/* the chained workgroup has't submitted io (still open) */ -#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) -/* the chained workgroup has already submitted io */ -#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) - +/* the end of a chain of pclusters */ +#define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) #define Z_EROFS_PCLUSTER_NIL (NULL) struct z_erofs_decompressqueue { @@ -241,14 +235,20 @@ static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, - struct page **candidate_bvpage) + struct page **candidate_bvpage, + struct page **pagepool) { - if (iter->cur == iter->nr) { - if (!*candidate_bvpage) - return -EAGAIN; - + if (iter->cur >= iter->nr) { + struct page *nextpage = *candidate_bvpage; + + if (!nextpage) { + nextpage = erofs_allocpage(pagepool, GFP_NOFS); + if (!nextpage) + return -ENOMEM; + set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); + } DBG_BUGON(iter->bvset->nextpage); - iter->bvset->nextpage = *candidate_bvpage; + iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; @@ -500,20 +500,6 @@ out_error_pcluster_pool: enum z_erofs_pclustermode { Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current pclusters was the tail of an exist chain, in addition - * that the previous processed chained pclusters are all decided to - * be hooked up to it. - * A new chain will be created for the remaining pclusters which are - * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, - * the next pcluster cannot reuse the whole page safely for inplace I/O - * in the following scenario: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (belongs to the next pcl) | (belongs to the current pcl) | - * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| - */ - Z_EROFS_PCLUSTER_HOOKED, - /* * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or @@ -530,8 +516,8 @@ enum z_erofs_pclustermode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PCLUSTER_FOLLOWED or | | - * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| + * | | | + * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. ] */ @@ -543,12 +529,12 @@ struct z_erofs_decompress_frontend { struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; + struct page *pagepool; struct page *candidate_bvpage; - struct z_erofs_pcluster *pcl, *tailpcl; + struct z_erofs_pcluster *pcl; z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -578,8 +564,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) return false; } -static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, - struct page **pagepool) +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; @@ -620,7 +605,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, * succeeds or fallback to in-place I/O instead * to avoid any direct reclaim. */ - newpage = erofs_allocpage(pagepool, gfp); + newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); @@ -633,7 +618,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (page) put_page(page); else if (newpage) - erofs_pagepool_add(pagepool, newpage); + erofs_pagepool_add(&fe->pagepool, newpage); } /* @@ -654,7 +639,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); /* - * refcount of workgroup is now freezed as 1, + * refcount of workgroup is now freezed as 0, * therefore no need to worry about available decompression users. */ for (i = 0; i < pcl->pclusterpages; ++i) { @@ -678,29 +663,73 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct page *page) +static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { - struct z_erofs_pcluster *const pcl = (void *)page_private(page); - int ret, i; + struct z_erofs_pcluster *pcl = folio_get_private(folio); + bool ret; + int i; - if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) - return 0; + if (!folio_test_private(folio)) + return true; + + ret = false; + spin_lock(&pcl->obj.lockref.lock); + if (pcl->obj.lockref.count > 0) + goto out; - ret = 0; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (i = 0; i < pcl->pclusterpages; ++i) { - if (pcl->compressed_bvecs[i].page == page) { + if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); - ret = 1; + ret = true; break; } } - erofs_workgroup_unfreeze(&pcl->obj, 1); if (ret) - detach_page_private(page); + folio_detach_private(folio); +out: + spin_unlock(&pcl->obj.lockref.lock); return ret; } +/* + * It will be called only on inode eviction. In case that there are still some + * decompression requests in progress, wait with rescheduling for a bit here. + * An extra lock could be introduced instead but it seems unnecessary. + */ +static void z_erofs_cache_invalidate_folio(struct folio *folio, + size_t offset, size_t length) +{ + const size_t stop = length + offset; + + /* Check for potential overflow in debug mode */ + DBG_BUGON(stop > folio_size(folio) || stop < length); + + if (offset == 0 && stop == folio_size(folio)) + while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) + cond_resched(); +} + +static const struct address_space_operations z_erofs_cache_aops = { + .release_folio = z_erofs_cache_release_folio, + .invalidate_folio = z_erofs_cache_invalidate_folio, +}; + +int erofs_init_managed_cache(struct super_block *sb) +{ + struct inode *const inode = new_inode(sb); + + if (!inode) + return -ENOMEM; + + set_nlink(inode, 1); + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &z_erofs_cache_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + EROFS_SB(sb)->managed_cache = inode; + return 0; +} + static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { @@ -731,7 +760,8 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } - ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, + &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } @@ -750,19 +780,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) return; } - /* - * type 2, link to the end of an existing open chain, be careful - * that its submission is controlled by the original attached chain. - */ - if (*owned_head != &pcl->next && pcl != f->tailpcl && - cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - *owned_head) == Z_EROFS_PCLUSTER_TAIL) { - *owned_head = Z_EROFS_PCLUSTER_TAIL; - f->mode = Z_EROFS_PCLUSTER_HOOKED; - f->tailpcl = NULL; - return; - } - /* type 3, it belongs to a chain, but it isn't the end of the chain */ + /* type 2, it belongs to an ongoing chain */ f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } @@ -786,7 +804,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) if (IS_ERR(pcl)) return PTR_ERR(pcl); - atomic_set(&pcl->obj.refcount, 1); + spin_lock_init(&pcl->obj.lockref.lock); pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; @@ -823,9 +841,6 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) goto err_out; } } - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = pcl; fe->owned_head = &pcl->next; fe->pcl = pcl; return 0; @@ -846,7 +861,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); - DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); if (!(map->m_flags & EROFS_MAP_META)) { grp = erofs_find_workgroup(fe->inode->i_sb, @@ -865,10 +879,6 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); - /* used to check tail merging loop due to corrupted images */ - if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) - fe->tailpcl = fe->pcl; - z_erofs_try_to_claim_pcluster(fe); } else if (ret) { return ret; @@ -908,10 +918,8 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); - if (fe->candidate_bvpage) { - DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; - } /* * if all pending pages are added, don't hold its reference @@ -958,7 +966,7 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct page **pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; @@ -1016,7 +1024,7 @@ repeat: fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } else { /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe, pagepool); + z_erofs_bind_cache(fe); } hitted: /* @@ -1025,8 +1033,7 @@ hitted: * those chains are handled asynchronously thus the page cannot be used * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && - fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { @@ -1056,24 +1063,13 @@ hitted: if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); -retry: err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { .page = page, .offset = offset - map->m_la, .end = end, }), exclusive); - /* should allocate an additional short-lived page for bvset */ - if (err == -EAGAIN && !fe->candidate_bvpage) { - fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - set_page_private(fe->candidate_bvpage, - Z_EROFS_SHORTLIVED_PAGE); - goto retry; - } - - if (err) { - DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + if (err) goto out; - } z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ @@ -1104,7 +1100,7 @@ out: return err; } -static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, +static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable for read_folio, disable for readahead */ @@ -1283,6 +1279,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + const struct z_erofs_decompressor *decompressor = + &erofs_decompressors[pcl->algorithmformat]; unsigned int i, inputsize; int err2; struct page *page; @@ -1326,7 +1324,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, else inputsize = pclusterpages * PAGE_SIZE; - err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, @@ -1404,10 +1402,7 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, }; z_erofs_next_pcluster_t owned = io->head; - while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ - DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ + while (owned != Z_EROFS_PCLUSTER_TAIL) { DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); be.pcl = container_of(owned, struct z_erofs_pcluster, next); @@ -1424,7 +1419,7 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) container_of(work, struct z_erofs_decompressqueue, u.work); struct page *pagepool = NULL; - DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL); z_erofs_decompress_queue(bgq, &pagepool); erofs_release_pages(&pagepool); kvfree(bgq); @@ -1452,7 +1447,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; /* Use (kthread_)work and sync decompression for atomic contexts only */ - if (in_atomic() || irqs_disabled()) { + if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) { #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD struct kthread_worker *worker; @@ -1612,7 +1607,7 @@ fg_out: q->sync = true; } q->sb = sb; - q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + q->head = Z_EROFS_PCLUSTER_TAIL; return q; } @@ -1630,11 +1625,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (owned_head == Z_EROFS_PCLUSTER_TAIL) - owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; - - WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); WRITE_ONCE(*submit_qtail, owned_head); WRITE_ONCE(*bypass_qtail, &pcl->next); @@ -1668,9 +1659,8 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, struct z_erofs_decompressqueue *fgq, - bool *force_fg) + bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); @@ -1705,15 +1695,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, unsigned int i = 0; bool bypass = true; - /* no possible 'owned_head' equals the following */ - DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned_head, struct z_erofs_pcluster, next); + owned_head = READ_ONCE(pcl->next); - /* close the main owned chain at first */ - owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - Z_EROFS_PCLUSTER_TAIL_CLOSED); if (z_erofs_is_inline_pcluster(pcl)) { move_to_bypass_jobqueue(pcl, qtail, owned_head); continue; @@ -1731,8 +1716,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, do { struct page *page; - page = pickup_page_for_submission(pcl, i++, pagepool, - mc); + page = pickup_page_for_submission(pcl, i++, + &f->pagepool, mc); if (!page) continue; @@ -1761,7 +1746,7 @@ submit_bio_retry: bio->bi_iter.bi_sector = (sector_t)cur << (sb->s_blocksize_bits - 9); bio->bi_private = q[JQ_SUBMIT]; - if (f->readahead) + if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } @@ -1797,16 +1782,16 @@ submit_bio_retry: } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, - struct page **pagepool, bool force_fg) + bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(f, pagepool, io, &force_fg); + z_erofs_submit_queue(f, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ - z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); + z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) return; @@ -1815,7 +1800,7 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ - z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); + z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); } /* @@ -1823,29 +1808,28 @@ static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, * approximate readmore strategies as a start. */ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, - struct readahead_control *rac, - erofs_off_t end, - struct page **pagepool, - bool backmost) + struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; - erofs_off_t cur; + erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { + if (rac) + end = headoffset + readahead_length(rac) - 1; + else + end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); if (err) return; - /* expend ra for the trailing edge if readahead */ + /* expand ra for the trailing edge if readahead */ if (rac) { - loff_t newstart = readahead_pos(rac); - cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); - readahead_expand(rac, newstart, cur - newstart); + readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); @@ -1866,7 +1850,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, if (PageUptodate(page)) { unlock_page(page); } else { - err = z_erofs_do_read_page(f, page, pagepool); + err = z_erofs_do_read_page(f, page); if (err) erofs_err(inode->i_sb, "readmore error at page %lu @ nid %llu", @@ -1887,28 +1871,24 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) struct inode *const inode = page->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL; int err; trace_erofs_readpage(page, false); f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, - &pagepool, true); - err = z_erofs_do_read_page(&f, page, &pagepool); - z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); - + z_erofs_pcluster_readmore(&f, NULL, true); + err = z_erofs_do_read_page(&f, page); + z_erofs_pcluster_readmore(&f, NULL, false); (void)z_erofs_collector_end(&f); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, 0)); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); return err; } @@ -1917,14 +1897,12 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *pagepool = NULL, *head = NULL, *page; + struct page *head = NULL, *page; unsigned int nr_pages; - f.readahead = true; f.headoffset = readahead_pos(rac); - z_erofs_pcluster_readmore(&f, rac, f.headoffset + - readahead_length(rac) - 1, &pagepool, true); + z_erofs_pcluster_readmore(&f, rac, true); nr_pages = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); @@ -1940,20 +1918,19 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", page->index, EROFS_I(inode)->nid); put_page(page); } - z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); + z_erofs_pcluster_readmore(&f, rac, false); (void)z_erofs_collector_end(&f); - z_erofs_runqueue(&f, &pagepool, - z_erofs_get_sync_decompress_policy(sbi, nr_pages)); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); erofs_put_metabuf(&f.map.buf); - erofs_release_pages(&pagepool); + erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index d37c5c89c728..1909ddafd9c7 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -22,8 +22,8 @@ struct z_erofs_maprecorder { bool partialref; }; -static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn) +static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); @@ -129,7 +129,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, u8 *in, type; bool big_pcluster; - if (1 << amortizedshift == 4) + if (1 << amortizedshift == 4 && lclusterbits <= 14) vcnt = 2; else if (1 << amortizedshift == 2 && lclusterbits == 12) vcnt = 16; @@ -226,12 +226,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, return 0; } -static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn, bool lookahead) +static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, + unsigned long lcn, bool lookahead) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const unsigned int lclusterbits = vi->z_logical_clusterbits; const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); unsigned int totalidx = erofs_iblks(inode); @@ -239,9 +238,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, unsigned int amortizedshift; erofs_off_t pos; - if (lclusterbits != 12) - return -EOPNOTSUPP; - if (lcn >= totalidx) return -EINVAL; @@ -281,23 +277,23 @@ out: return unpack_compacted_index(m, amortizedshift, pos, lookahead); } -static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned int lcn, bool lookahead) +static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn, bool lookahead) { - const unsigned int datamode = EROFS_I(m->inode)->datalayout; - - if (datamode == EROFS_INODE_COMPRESSED_FULL) - return legacy_load_cluster_from_disk(m, lcn); - - if (datamode == EROFS_INODE_COMPRESSED_COMPACT) - return compacted_load_cluster_from_disk(m, lcn, lookahead); - - return -EINVAL; + switch (EROFS_I(m->inode)->datalayout) { + case EROFS_INODE_COMPRESSED_FULL: + return z_erofs_load_full_lcluster(m, lcn); + case EROFS_INODE_COMPRESSED_COMPACT: + return z_erofs_load_compact_lcluster(m, lcn, lookahead); + default: + return -EINVAL; + } } static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned int lookback_distance) { + struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; @@ -305,21 +301,15 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, unsigned long lcn = m->lcn - lookback_distance; int err; - /* load extent head logical cluster if needed */ - err = z_erofs_load_cluster_from_disk(m, lcn, false); + err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; switch (m->type) { case Z_EROFS_LCLUSTER_TYPE_NONHEAD: - if (!m->delta[0]) { - erofs_err(m->inode->i_sb, - "invalid lookback distance 0 @ nid %llu", - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } lookback_distance = m->delta[0]; + if (!lookback_distance) + goto err_bogus; continue; case Z_EROFS_LCLUSTER_TYPE_PLAIN: case Z_EROFS_LCLUSTER_TYPE_HEAD1: @@ -328,16 +318,15 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; default: - erofs_err(m->inode->i_sb, - "unknown type %u @ lcn %lu of nid %llu", + erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", m->type, lcn, vi->nid); DBG_BUGON(1); return -EOPNOTSUPP; } } - - erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu", - vi->nid); +err_bogus: + erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", + lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -369,7 +358,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, if (m->compressedblks) goto out; - err = z_erofs_load_cluster_from_disk(m, lcn, false); + err = z_erofs_load_lcluster_from_disk(m, lcn, false); if (err) return err; @@ -401,9 +390,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, break; fallthrough; default: - erofs_err(m->inode->i_sb, - "cannot found CBLKCNT @ lcn %lu of nid %llu", - lcn, vi->nid); + erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn, + vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -411,9 +399,7 @@ out: map->m_plen = erofs_pos(sb, m->compressedblks); return 0; err_bonus_cblkcnt: - erofs_err(m->inode->i_sb, - "bogus CBLKCNT @ lcn %lu of nid %llu", - lcn, vi->nid); + erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); DBG_BUGON(1); return -EFSCORRUPTED; } @@ -434,7 +420,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return 0; } - err = z_erofs_load_cluster_from_disk(m, lcn, true); + err = z_erofs_load_lcluster_from_disk(m, lcn, true); if (err) return err; @@ -481,7 +467,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false); + err = z_erofs_load_lcluster_from_disk(&m, initial_lcn, false); if (err) goto unmap_out; @@ -539,8 +525,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (flags & EROFS_GET_BLOCKS_FINDTAIL) { vi->z_tailextent_headlcn = m.lcn; /* for non-compact indexes, fragmentoff is 64 bits */ - if (fragment && - vi->datalayout == EROFS_INODE_COMPRESSED_FULL) + if (fragment && vi->datalayout == EROFS_INODE_COMPRESSED_FULL) vi->z_fragmentoff |= (u64)m.pblk << 32; } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { diff --git a/fs/eventfd.c b/fs/eventfd.c index 95850a13ce8d..8aa36cd37351 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -33,17 +33,17 @@ struct eventfd_ctx { /* * Every time that a write(2) is performed on an eventfd, the * value of the __u64 being written is added to "count" and a - * wakeup is performed on "wqh". A read(2) will return the "count" - * value to userspace, and will reset "count" to zero. The kernel - * side eventfd_signal() also, adds to the "count" counter and - * issue a wakeup. + * wakeup is performed on "wqh". If EFD_SEMAPHORE flag was not + * specified, a read(2) will return the "count" value to userspace, + * and will reset "count" to zero. The kernel side eventfd_signal() + * also, adds to the "count" counter and issue a wakeup. */ __u64 count; unsigned int flags; int id; }; -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask) +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask) { unsigned long flags; @@ -301,6 +301,8 @@ static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) (unsigned long long)ctx->count); spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, "eventfd-id: %d\n", ctx->id); + seq_printf(m, "eventfd-semaphore: %d\n", + !!(ctx->flags & EFD_SEMAPHORE)); } #endif diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 980483455cc0..4b1b3362f697 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -536,7 +536,7 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, - unsigned pollflags) + __poll_t pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } @@ -1805,7 +1805,11 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, { int ret = default_wake_function(wq_entry, mode, sync, key); - list_del_init(&wq_entry->entry); + /* + * Pairs with list_empty_careful in ep_poll, and ensures future loop + * iterations see the cause of this wakeup. + */ + list_del_init_careful(&wq_entry->entry); return ret; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index c1edde817be8..1f72f977c6db 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -324,17 +324,15 @@ static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info **grp_info; - long indexv, indexh; - - if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) { - ext4_error(sb, "invalid group %u", group); - return NULL; - } - indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); - indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); - grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); - return grp_info[indexh]; + struct ext4_group_info **grp_info; + long indexv, indexh; + + if (unlikely(group >= EXT4_SB(sb)->s_groups_count)) + return NULL; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); + return grp_info[indexh]; } /* @@ -886,7 +884,10 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, if (!ext4_bg_has_super(sb, group)) return 0; - return EXT4_SB(sb)->s_gdb_count; + if (ext4_has_feature_meta_bg(sb)) + return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); + else + return EXT4_SB(sb)->s_gdb_count; } /** diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 45b579805c95..0caf6c730ce3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3834,19 +3834,10 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, return retval; } - /* - * We need to protect against old.inode directory getting converted - * from inline directory format into a normal one. - */ - if (S_ISDIR(old.inode->i_mode)) - inode_lock_nested(old.inode, I_MUTEX_NONDIR2); - old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); - if (IS_ERR(old.bh)) { - retval = PTR_ERR(old.bh); - goto unlock_moved_dir; - } + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. @@ -4043,10 +4034,6 @@ release_bh: brelse(old.bh); brelse(new.bh); -unlock_moved_dir: - if (S_ISDIR(old.inode->i_mode)) - inode_unlock(old.inode); - return retval; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 56a5d1c469fc..05fcecc36244 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6388,7 +6388,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; - int enable_rw = 0; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; @@ -6575,7 +6574,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (err) goto restore_opts; - enable_rw = 1; + sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); @@ -6622,9 +6621,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); - if (enable_rw) - sb->s_flags &= ~SB_RDONLY; - /* * Reinitialize lazy itable initialization thread based on * current settings diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 13d7f17a9c8c..321e3a888c20 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2056,8 +2056,9 @@ inserted: else { u32 ref; +#ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); - +#endif /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, @@ -2120,8 +2121,9 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal, block; +#ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); - +#endif goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); block = ext4_new_meta_blocks(handle, inode, goal, 0, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 77a71276ecb1..ad597b417fea 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -995,20 +995,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out; } - /* - * Copied from ext4_rename: we need to protect against old.inode - * directory getting converted from inline directory format into - * a normal one. - */ - if (S_ISDIR(old_inode->i_mode)) - inode_lock_nested(old_inode, I_MUTEX_NONDIR2); - err = -ENOENT; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) err = PTR_ERR(old_page); - goto out_unlock_old; + goto out; } if (S_ISDIR(old_inode->i_mode)) { @@ -1116,9 +1108,6 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_unlock_op(sbi); - if (S_ISDIR(old_inode->i_mode)) - inode_unlock(old_inode); - if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); @@ -1133,9 +1122,6 @@ out_dir: f2fs_put_page(old_dir_page, 0); out_old: f2fs_put_page(old_page, 0); -out_unlock_old: - if (S_ISDIR(old_inode->i_mode)) - inode_unlock(old_inode); out: iput(whiteout); return err; diff --git a/fs/file_table.c b/fs/file_table.c index 372653b92617..e06c68e2d757 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -44,18 +44,40 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; +/* Container for backing file with optional real path */ +struct backing_file { + struct file file; + struct path real_path; +}; + +static inline struct backing_file *backing_file(struct file *f) +{ + return container_of(f, struct backing_file, file); +} + +struct path *backing_file_real_path(struct file *f) +{ + return &backing_file(f)->real_path; +} +EXPORT_SYMBOL_GPL(backing_file_real_path); + static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_rcuhead); put_cred(f->f_cred); - kmem_cache_free(filp_cachep, f); + if (unlikely(f->f_mode & FMODE_BACKING)) + kfree(backing_file(f)); + else + kmem_cache_free(filp_cachep, f); } static inline void file_free(struct file *f) { security_file_free(f); - if (!(f->f_mode & FMODE_NOACCOUNT)) + if (unlikely(f->f_mode & FMODE_BACKING)) + path_put(backing_file_real_path(f)); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) percpu_counter_dec(&nr_files); call_rcu(&f->f_rcuhead, file_free_rcu); } @@ -131,20 +153,15 @@ static int __init init_fs_stat_sysctls(void) fs_initcall(init_fs_stat_sysctls); #endif -static struct file *__alloc_file(int flags, const struct cred *cred) +static int init_file(struct file *f, int flags, const struct cred *cred) { - struct file *f; int error; - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); - if (unlikely(!f)) - return ERR_PTR(-ENOMEM); - f->f_cred = get_cred(cred); error = security_file_alloc(f); if (unlikely(error)) { file_free_rcu(&f->f_rcuhead); - return ERR_PTR(error); + return error; } atomic_long_set(&f->f_count, 1); @@ -155,7 +172,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred) f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ - return f; + return 0; } /* Find an unused file structure and return a pointer to it. @@ -172,6 +189,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; + int error; /* * Privileged users can go above max_files @@ -185,9 +203,15 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = __alloc_file(flags, cred); - if (!IS_ERR(f)) - percpu_counter_inc(&nr_files); + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + error = init_file(f, flags, cred); + if (unlikely(error)) + return ERR_PTR(error); + + percpu_counter_inc(&nr_files); return f; @@ -203,18 +227,51 @@ over: /* * Variant of alloc_empty_file() that doesn't check and modify nr_files. * - * Should not be used unless there's a very good reason to do so. + * This is only for kernel internal use, and the allocate file must not be + * installed into file tables or such. */ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) { - struct file *f = __alloc_file(flags, cred); + struct file *f; + int error; - if (!IS_ERR(f)) - f->f_mode |= FMODE_NOACCOUNT; + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + error = init_file(f, flags, cred); + if (unlikely(error)) + return ERR_PTR(error); + + f->f_mode |= FMODE_NOACCOUNT; return f; } +/* + * Variant of alloc_empty_file() that allocates a backing_file container + * and doesn't check and modify nr_files. + * + * This is only for kernel internal use, and the allocate file must not be + * installed into file tables or such. + */ +struct file *alloc_empty_backing_file(int flags, const struct cred *cred) +{ + struct backing_file *ff; + int error; + + ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL); + if (unlikely(!ff)) + return ERR_PTR(-ENOMEM); + + error = init_file(&ff->file, flags, cred); + if (unlikely(error)) + return ERR_PTR(error); + + ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; + return &ff->file; +} + /** * alloc_file - allocate and initialize a 'struct file' * diff --git a/fs/fs_context.c b/fs/fs_context.c index 24ce12f0db32..851214d1d013 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -561,7 +561,8 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) return -ENOMEM; } - ctx->legacy_data[size++] = ','; + if (size) + ctx->legacy_data[size++] = ','; len = strlen(param->key); memcpy(ctx->legacy_data + size, param->key, len); size += len; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 300844f50dcd..cb62c8f07d1e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -784,9 +784,13 @@ static inline bool should_fault_in_pages(struct iov_iter *i, if (!user_backed_iter(i)) return false; + /* + * Try to fault in multiple pages initially. When that doesn't result + * in any progress, fall back to a single page. + */ size = PAGE_SIZE; offs = offset_in_page(iocb->ki_pos); - if (*prev_count != count || !*window_size) { + if (*prev_count != count) { size_t nr_dirtied; nr_dirtied = max(current->nr_dirtied_pause - @@ -870,6 +874,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct gfs2_inode *ip = GFS2_I(inode); size_t prev_count = 0, window_size = 0; size_t written = 0; + bool enough_retries; ssize_t ret; /* @@ -913,11 +918,17 @@ retry: if (ret > 0) written = ret; + enough_retries = prev_count == iov_iter_count(from) && + window_size <= PAGE_SIZE; if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { gfs2_glock_dq(gh); window_size -= fault_in_iov_iter_readable(from, window_size); - if (window_size) - goto retry; + if (window_size) { + if (!enough_retries) + goto retry; + /* fall back to buffered I/O */ + ret = 0; + } } out_unlock: if (gfs2_holder_queued(gh)) diff --git a/fs/inode.c b/fs/inode.c index 577799b7855f..53ae3b76d232 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1104,9 +1104,51 @@ void discard_new_inode(struct inode *inode) EXPORT_SYMBOL(discard_new_inode); /** + * lock_two_inodes - lock two inodes (may be regular files but also dirs) + * + * Lock any non-NULL argument. The caller must make sure that if he is passing + * in two directories, one is not ancestor of the other. Zero, one or two + * objects may be locked by this function. + * + * @inode1: first inode to lock + * @inode2: second inode to lock + * @subclass1: inode lock subclass for the first lock obtained + * @subclass2: inode lock subclass for the second lock obtained + */ +void lock_two_inodes(struct inode *inode1, struct inode *inode2, + unsigned subclass1, unsigned subclass2) +{ + if (!inode1 || !inode2) { + /* + * Make sure @subclass1 will be used for the acquired lock. + * This is not strictly necessary (no current caller cares) but + * let's keep things consistent. + */ + if (!inode1) + swap(inode1, inode2); + goto lock; + } + + /* + * If one object is directory and the other is not, we must make sure + * to lock directory first as the other object may be its child. + */ + if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) { + if (inode1 > inode2) + swap(inode1, inode2); + } else if (!S_ISDIR(inode1->i_mode)) + swap(inode1, inode2); +lock: + if (inode1) + inode_lock_nested(inode1, subclass1); + if (inode2 && inode2 != inode1) + inode_lock_nested(inode2, subclass2); +} + +/** * lock_two_nondirectories - take two i_mutexes on non-directory objects * - * Lock any non-NULL argument that is not a directory. + * Lock any non-NULL argument. Passed objects must not be directories. * Zero, one or two objects may be locked by this function. * * @inode1: first inode to lock @@ -1114,13 +1156,9 @@ EXPORT_SYMBOL(discard_new_inode); */ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) { - if (inode1 > inode2) - swap(inode1, inode2); - - if (inode1 && !S_ISDIR(inode1->i_mode)) - inode_lock(inode1); - if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - inode_lock_nested(inode2, I_MUTEX_NONDIR2); + WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); + WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); + lock_two_inodes(inode1, inode2, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -1131,10 +1169,14 @@ EXPORT_SYMBOL(lock_two_nondirectories); */ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { - if (inode1 && !S_ISDIR(inode1->i_mode)) + if (inode1) { + WARN_ON_ONCE(S_ISDIR(inode1->i_mode)); inode_unlock(inode1); - if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + } + if (inode2 && inode2 != inode1) { + WARN_ON_ONCE(S_ISDIR(inode2->i_mode)); inode_unlock(inode2); + } } EXPORT_SYMBOL(unlock_two_nondirectories); diff --git a/fs/internal.h b/fs/internal.h index bd3b2810a36b..f7a3dc111026 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -97,8 +97,9 @@ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ -extern struct file *alloc_empty_file(int, const struct cred *); -extern struct file *alloc_empty_file_noaccount(int, const struct cred *); +struct file *alloc_empty_file(int flags, const struct cred *cred); +struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); +struct file *alloc_empty_backing_file(int flags, const struct cred *cred); static inline void put_file_access(struct file *file) { @@ -121,6 +122,47 @@ extern bool mount_capable(struct fs_context *); int sb_init_dio_done_wq(struct super_block *sb); /* + * Prepare superblock for changing its read-only state (i.e., either remount + * read-write superblock read-only or vice versa). After this function returns + * mnt_is_readonly() will return true for any mount of the superblock if its + * caller is able to observe any changes done by the remount. This holds until + * sb_end_ro_state_change() is called. + */ +static inline void sb_start_ro_state_change(struct super_block *sb) +{ + WRITE_ONCE(sb->s_readonly_remount, 1); + /* + * For RO->RW transition, the barrier pairs with the barrier in + * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY + * cleared, it will see s_readonly_remount set. + * For RW->RO transition, the barrier pairs with the barrier in + * __mnt_want_write() before the mnt_is_readonly() check. The barrier + * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already + * cleared, it will see s_readonly_remount set. + */ + smp_wmb(); +} + +/* + * Ends section changing read-only state of the superblock. After this function + * returns if mnt_is_readonly() returns false, the caller will be able to + * observe all the changes remount did to the superblock. + */ +static inline void sb_end_ro_state_change(struct super_block *sb) +{ + /* + * This barrier provides release semantics that pairs with + * the smp_rmb() acquire semantics in mnt_is_readonly(). + * This barrier pair ensure that when mnt_is_readonly() sees + * 0 for sb->s_readonly_remount, it will also see all the + * preceding flag changes that were made during the RO state + * change. + */ + smp_wmb(); + WRITE_ONCE(sb->s_readonly_remount, 0); +} + +/* * open.c */ struct open_flags { @@ -152,6 +194,8 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); +void lock_two_inodes(struct inode *inode1, struct inode *inode2, + unsigned subclass1, unsigned subclass2); /* * fs-writeback.c diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 837cd55fd4c5..6ae9d6fefb86 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -211,7 +211,10 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) ic->scan_dents = NULL; cond_resched(); } - jffs2_build_xattr_subsystem(c); + ret = jffs2_build_xattr_subsystem(c); + if (ret) + goto exit; + c->flags &= ~JFFS2_SB_FLAG_BUILDING; dbg_fsbuild("FS build complete\n"); diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index aa4048a27f31..3b6bdc9a49e1 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -772,10 +772,10 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) } #define XREF_TMPHASH_SIZE (128) -void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) +int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) { struct jffs2_xattr_ref *ref, *_ref; - struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE]; + struct jffs2_xattr_ref **xref_tmphash; struct jffs2_xattr_datum *xd, *_xd; struct jffs2_inode_cache *ic; struct jffs2_raw_node_ref *raw; @@ -784,9 +784,12 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING)); + xref_tmphash = kcalloc(XREF_TMPHASH_SIZE, + sizeof(struct jffs2_xattr_ref *), GFP_KERNEL); + if (!xref_tmphash) + return -ENOMEM; + /* Phase.1 : Merge same xref */ - for (i=0; i < XREF_TMPHASH_SIZE; i++) - xref_tmphash[i] = NULL; for (ref=c->xref_temp; ref; ref=_ref) { struct jffs2_xattr_ref *tmp; @@ -884,6 +887,8 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) "%u of xref (%u dead, %u orphan) found.\n", xdatum_count, xdatum_unchecked_count, xdatum_orphan_count, xref_count, xref_dead_count, xref_orphan_count); + kfree(xref_tmphash); + return 0; } struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 720007b2fd65..1b5030a3349d 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -71,7 +71,7 @@ static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref) #ifdef CONFIG_JFFS2_FS_XATTR extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c); -extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c); +extern int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c); extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c); extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, @@ -103,7 +103,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); #else #define jffs2_init_xattr_subsystem(c) -#define jffs2_build_xattr_subsystem(c) +#define jffs2_build_xattr_subsystem(c) (0) #define jffs2_clear_xattr_subsystem(c) #define jffs2_xattr_do_crccheck_inode(c, ic) diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b29d68b5eec5..494b9f4043cf 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -876,7 +876,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, tid_t tid; ino_t ino = 0; struct component_name dname; - int ssize; /* source pathname size */ + u32 ssize; /* source pathname size */ struct btstack btstack; struct inode *ip = d_inode(dentry); s64 xlen = 0; @@ -957,7 +957,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, if (ssize > sizeof (JFS_IP(ip)->i_inline)) JFS_IP(ip)->mode2 &= ~INLINEEA; - jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ", + jfs_info("jfs_symlink: fast symlink added ssize:%u name:%s ", ssize, name); } /* @@ -987,7 +987,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, ip->i_size = ssize - 1; while (ssize) { /* This is kind of silly since PATH_MAX == 4K */ - int copy_size = min(ssize, PSIZE); + u32 copy_size = min_t(u32, ssize, PSIZE); mp = get_metapage(ip, xaddr, PSIZE, 1); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 04ba95b83d16..22d3ff3818f5 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -355,7 +355,6 @@ static int lockd_get(void) int error; if (nlmsvc_serv) { - svc_get(nlmsvc_serv); nlmsvc_users++; return 0; } diff --git a/fs/namei.c b/fs/namei.c index e4fe0879ae55..91171da719c5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3028,8 +3028,8 @@ static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) return p; } - inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); - inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); + lock_two_inodes(p1->d_inode, p2->d_inode, + I_MUTEX_PARENT, I_MUTEX_PARENT2); return NULL; } @@ -3703,7 +3703,7 @@ static int vfs_tmpfile(struct mnt_idmap *idmap, } /** - * vfs_tmpfile_open - open a tmpfile for kernel internal use + * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile @@ -3714,24 +3714,26 @@ static int vfs_tmpfile(struct mnt_idmap *idmap, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ -struct file *vfs_tmpfile_open(struct mnt_idmap *idmap, - const struct path *parentpath, - umode_t mode, int open_flag, const struct cred *cred) +struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, + const struct path *parentpath, + umode_t mode, int open_flag, + const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); - if (!IS_ERR(file)) { - error = vfs_tmpfile(idmap, parentpath, file, mode); - if (error) { - fput(file); - file = ERR_PTR(error); - } + if (IS_ERR(file)) + return file; + + error = vfs_tmpfile(idmap, parentpath, file, mode); + if (error) { + fput(file); + file = ERR_PTR(error); } return file; } -EXPORT_SYMBOL(vfs_tmpfile_open); +EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, @@ -4731,7 +4733,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we have to lock _four_ objects - parents and victim (if it exists), - * and source (if it is not a directory). + * and source. * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change @@ -4815,10 +4817,16 @@ int vfs_rename(struct renamedata *rd) take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); - if (!is_dir || (flags & RENAME_EXCHANGE)) - lock_two_nondirectories(source, target); - else if (target) - inode_lock(target); + /* + * Lock all moved children. Moved directories may need to change parent + * pointer so they need the lock to prevent against concurrent + * directory changes moving parent pointer. For regular files we've + * historically always done this. The lockdep locking subclasses are + * somewhat arbitrary but RENAME_EXCHANGE in particular can swap + * regular files and directories so it's difficult to tell which + * subclasses to use. + */ + lock_two_inodes(source, target, I_MUTEX_NORMAL, I_MUTEX_NONDIR2); error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) @@ -4866,9 +4874,9 @@ int vfs_rename(struct renamedata *rd) d_exchange(old_dentry, new_dentry); } out: - if (!is_dir || (flags & RENAME_EXCHANGE)) - unlock_two_nondirectories(source, target); - else if (target) + if (source) + inode_unlock(source); + if (target) inode_unlock(target); dput(new_dentry); if (!error) { diff --git a/fs/namespace.c b/fs/namespace.c index 54847db5b819..e157efc54023 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -309,9 +309,16 @@ static unsigned int mnt_get_writers(struct mount *mnt) static int mnt_is_readonly(struct vfsmount *mnt) { - if (mnt->mnt_sb->s_readonly_remount) + if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; - /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ + /* + * The barrier pairs with the barrier in sb_start_ro_state_change() + * making sure if we don't see s_readonly_remount set yet, we also will + * not see any superblock / mount flag changes done by remount. + * It also pairs with the barrier in sb_end_ro_state_change() + * assuring that if we see s_readonly_remount already cleared, we will + * see the values of superblock / mount flags updated by remount. + */ smp_rmb(); return __mnt_is_readonly(mnt); } @@ -364,9 +371,11 @@ int __mnt_want_write(struct vfsmount *m) } } /* - * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will - * be set to match its requirements. So we must not load that until - * MNT_WRITE_HOLD is cleared. + * The barrier pairs with the barrier sb_start_ro_state_change() making + * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in + * mnt_is_readonly() and bail in case we are racing with remount + * read-only. */ smp_rmb(); if (mnt_is_readonly(m)) { @@ -588,10 +597,8 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err && atomic_long_read(&sb->s_remove_count)) err = -EBUSY; - if (!err) { - sb->s_readonly_remount = 1; - smp_wmb(); - } + if (!err) + sb_start_ro_state_change(sb); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; @@ -658,9 +665,25 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) return false; } -/* - * find the first mount at @dentry on vfsmount @mnt. - * call under rcu_read_lock() +/** + * __lookup_mnt - find first child mount + * @mnt: parent mount + * @dentry: mountpoint + * + * If @mnt has a child mount @c mounted @dentry find and return it. + * + * Note that the child mount @c need not be unique. There are cases + * where shadow mounts are created. For example, during mount + * propagation when a source mount @mnt whose root got overmounted by a + * mount @o after path lookup but before @namespace_sem could be + * acquired gets copied and propagated. So @mnt gets copied including + * @o. When @mnt is propagated to a destination mount @d that already + * has another mount @n mounted at the same mountpoint then the source + * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on + * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt + * on @dentry. + * + * Return: The first child of @mnt mounted @dentry or NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { @@ -910,6 +933,33 @@ void mnt_set_mountpoint(struct mount *mnt, hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } +/** + * mnt_set_mountpoint_beneath - mount a mount beneath another one + * + * @new_parent: the source mount + * @top_mnt: the mount beneath which @new_parent is mounted + * @new_mp: the new mountpoint of @top_mnt on @new_parent + * + * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and + * parent @top_mnt->mnt_parent and mount it on top of @new_parent at + * @new_mp. And mount @new_parent on the old parent and old + * mountpoint of @top_mnt. + * + * Context: This function expects namespace_lock() and lock_mount_hash() + * to have been acquired in that order. + */ +static void mnt_set_mountpoint_beneath(struct mount *new_parent, + struct mount *top_mnt, + struct mountpoint *new_mp) +{ + struct mount *old_top_parent = top_mnt->mnt_parent; + struct mountpoint *old_top_mp = top_mnt->mnt_mp; + + mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent); + mnt_change_mountpoint(new_parent, new_mp, top_mnt); +} + + static void __attach_mnt(struct mount *mnt, struct mount *parent) { hlist_add_head_rcu(&mnt->mnt_hash, @@ -917,15 +967,42 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent) list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } -/* - * vfsmount lock must be held for write +/** + * attach_mnt - mount a mount, attach to @mount_hashtable and parent's + * list of child mounts + * @parent: the parent + * @mnt: the new mount + * @mp: the new mountpoint + * @beneath: whether to mount @mnt beneath or on top of @parent + * + * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt + * to @parent's child mount list and to @mount_hashtable. + * + * If @beneath is true, remove @mnt from its current parent and + * mountpoint and mount it on @mp on @parent, and mount @parent on the + * old parent and old mountpoint of @mnt. Finally, attach @parent to + * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. + * + * Note, when __attach_mnt() is called @mnt->mnt_parent already points + * to the correct parent. + * + * Context: This function expects namespace_lock() and lock_mount_hash() + * to have been acquired in that order. */ -static void attach_mnt(struct mount *mnt, - struct mount *parent, - struct mountpoint *mp) +static void attach_mnt(struct mount *mnt, struct mount *parent, + struct mountpoint *mp, bool beneath) { - mnt_set_mountpoint(parent, mp, mnt); - __attach_mnt(mnt, parent); + if (beneath) + mnt_set_mountpoint_beneath(mnt, parent, mp); + else + mnt_set_mountpoint(parent, mp, mnt); + /* + * Note, @mnt->mnt_parent has to be used. If @mnt was mounted + * beneath @parent then @mnt will need to be attached to + * @parent's old parent, not @parent. IOW, @mnt->mnt_parent + * isn't the same mount as @parent. + */ + __attach_mnt(mnt, mnt->mnt_parent); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) @@ -937,7 +1014,7 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); - attach_mnt(mnt, parent, mp); + attach_mnt(mnt, parent, mp, false); put_mountpoint(old_mp); mnt_add_count(old_parent, -1); @@ -1767,6 +1844,19 @@ bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } +/** + * path_mounted - check whether path is mounted + * @path: path to check + * + * Determine whether @path refers to the root of a mount. + * + * Return: true if @path is the root of a mount, false if not. + */ +static inline bool path_mounted(const struct path *path) +{ + return path->mnt->mnt_root == path->dentry; +} + static void warn_mandlock(void) { pr_warn_once("=======================================================\n" @@ -1782,7 +1872,7 @@ static int can_umount(const struct path *path, int flags) if (!may_mount()) return -EPERM; - if (path->dentry != path->mnt->mnt_root) + if (!path_mounted(path)) return -EINVAL; if (!check_mnt(mnt)) return -EINVAL; @@ -1925,7 +2015,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); - attach_mnt(q, parent, p->mnt_mp); + attach_mnt(q, parent, p->mnt_mp, false); unlock_mount_hash(); } } @@ -2134,12 +2224,17 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) return 0; } -/* - * @source_mnt : mount tree to be attached - * @nd : place the mount tree @source_mnt is attached - * @parent_nd : if non-null, detach the source_mnt from its parent and - * store the parent mount and mountpoint dentry. - * (done when source_mnt is moved) +enum mnt_tree_flags_t { + MNT_TREE_MOVE = BIT(0), + MNT_TREE_BENEATH = BIT(1), +}; + +/** + * attach_recursive_mnt - attach a source mount tree + * @source_mnt: mount tree to be attached + * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath + * @dest_mp: the mountpoint @source_mnt will be mounted at + * @flags: modify how @source_mnt is supposed to be attached * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2196,22 +2291,28 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. + * + * Context: The function expects namespace_lock() to be held. + * Return: If @source_mnt was successfully attached 0 is returned. + * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *dest_mnt, - struct mountpoint *dest_mp, - bool moving) + struct mount *top_mnt, + struct mountpoint *dest_mp, + enum mnt_tree_flags_t flags) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); - struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mnt_namespace *ns = top_mnt->mnt_ns; struct mountpoint *smp; - struct mount *child, *p; + struct mount *child, *dest_mnt, *p; struct hlist_node *n; - int err; + int err = 0; + bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH; - /* Preallocate a mountpoint in case the new mounts need - * to be tucked under other mounts. + /* + * Preallocate a mountpoint in case the new mounts need to be + * mounted beneath mounts on the same mountpoint. */ smp = get_mountpoint(source_mnt->mnt.mnt_root); if (IS_ERR(smp)) @@ -2224,29 +2325,41 @@ static int attach_recursive_mnt(struct mount *source_mnt, goto out; } + if (beneath) + dest_mnt = top_mnt->mnt_parent; + else + dest_mnt = top_mnt; + if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); - lock_mount_hash(); - if (err) - goto out_cleanup_ids; + } + lock_mount_hash(); + if (err) + goto out_cleanup_ids; + + if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); - } else { - lock_mount_hash(); } + if (moving) { + if (beneath) + dest_mp = smp; unhash_mnt(source_mnt); - attach_mnt(source_mnt, dest_mnt, dest_mp); + attach_mnt(source_mnt, top_mnt, dest_mp, beneath); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { /* move from anon - the caller will destroy */ list_del_init(&source_mnt->mnt_ns->list); } - mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + if (beneath) + mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); + else + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } @@ -2286,33 +2399,101 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } -static struct mountpoint *lock_mount(struct path *path) +/** + * do_lock_mount - lock mount and mountpoint + * @path: target path + * @beneath: whether the intention is to mount beneath @path + * + * Follow the mount stack on @path until the top mount @mnt is found. If + * the initial @path->{mnt,dentry} is a mountpoint lookup the first + * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root} + * until nothing is stacked on top of it anymore. + * + * Acquire the inode_lock() on the top mount's ->mnt_root to protect + * against concurrent removal of the new mountpoint from another mount + * namespace. + * + * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint + * @mp on @mnt->mnt_parent must be acquired. This protects against a + * concurrent unlink of @mp->mnt_dentry from another mount namespace + * where @mnt doesn't have a child mount mounted @mp. A concurrent + * removal of @mnt->mnt_root doesn't matter as nothing will be mounted + * on top of it for @beneath. + * + * In addition, @beneath needs to make sure that @mnt hasn't been + * unmounted or moved from its current mountpoint in between dropping + * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt + * being unmounted would be detected later by e.g., calling + * check_mnt(mnt) in the function it's called from. For the @beneath + * case however, it's useful to detect it directly in do_lock_mount(). + * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points + * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will + * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. + * + * Return: Either the target mountpoint on the top mount or the top + * mount's mountpoint. + */ +static struct mountpoint *do_lock_mount(struct path *path, bool beneath) { - struct vfsmount *mnt; - struct dentry *dentry = path->dentry; -retry: - inode_lock(dentry->d_inode); - if (unlikely(cant_mount(dentry))) { - inode_unlock(dentry->d_inode); - return ERR_PTR(-ENOENT); - } - namespace_lock(); - mnt = lookup_mnt(path); - if (likely(!mnt)) { - struct mountpoint *mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { + struct vfsmount *mnt = path->mnt; + struct dentry *dentry; + struct mountpoint *mp = ERR_PTR(-ENOENT); + + for (;;) { + struct mount *m; + + if (beneath) { + m = real_mount(mnt); + read_seqlock_excl(&mount_lock); + dentry = dget(m->mnt_mountpoint); + read_sequnlock_excl(&mount_lock); + } else { + dentry = path->dentry; + } + + inode_lock(dentry->d_inode); + if (unlikely(cant_mount(dentry))) { + inode_unlock(dentry->d_inode); + goto out; + } + + namespace_lock(); + + if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { namespace_unlock(); inode_unlock(dentry->d_inode); - return mp; + goto out; } - return mp; + + mnt = lookup_mnt(path); + if (likely(!mnt)) + break; + + namespace_unlock(); + inode_unlock(dentry->d_inode); + if (beneath) + dput(dentry); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); } - namespace_unlock(); - inode_unlock(path->dentry->d_inode); - path_put(path); - path->mnt = mnt; - dentry = path->dentry = dget(mnt->mnt_root); - goto retry; + + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + } + +out: + if (beneath) + dput(dentry); + + return mp; +} + +static inline struct mountpoint *lock_mount(struct path *path) +{ + return do_lock_mount(path, false); } static void unlock_mount(struct mountpoint *where) @@ -2336,7 +2517,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp, false); + return attach_recursive_mnt(mnt, p, mp, 0); } /* @@ -2367,7 +2548,7 @@ static int do_change_type(struct path *path, int ms_flags) int type; int err = 0; - if (path->dentry != path->mnt->mnt_root) + if (!path_mounted(path)) return -EINVAL; type = flags_to_propagation_type(ms_flags); @@ -2643,7 +2824,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) if (!check_mnt(mnt)) return -EINVAL; - if (path->dentry != mnt->mnt.mnt_root) + if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) @@ -2682,7 +2863,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (!check_mnt(mnt)) return -EINVAL; - if (path->dentry != path->mnt->mnt_root) + if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) @@ -2772,9 +2953,9 @@ static int do_set_group(struct path *from_path, struct path *to_path) err = -EINVAL; /* To and From paths should be mount roots */ - if (from_path->dentry != from_path->mnt->mnt_root) + if (!path_mounted(from_path)) goto out; - if (to_path->dentry != to_path->mnt->mnt_root) + if (!path_mounted(to_path)) goto out; /* Setting sharing groups is only allowed across same superblock */ @@ -2818,7 +2999,110 @@ out: return err; } -static int do_move_mount(struct path *old_path, struct path *new_path) +/** + * path_overmounted - check if path is overmounted + * @path: path to check + * + * Check if path is overmounted, i.e., if there's a mount on top of + * @path->mnt with @path->dentry as mountpoint. + * + * Context: This function expects namespace_lock() to be held. + * Return: If path is overmounted true is returned, false if not. + */ +static inline bool path_overmounted(const struct path *path) +{ + rcu_read_lock(); + if (unlikely(__lookup_mnt(path->mnt, path->dentry))) { + rcu_read_unlock(); + return true; + } + rcu_read_unlock(); + return false; +} + +/** + * can_move_mount_beneath - check that we can mount beneath the top mount + * @from: mount to mount beneath + * @to: mount under which to mount + * + * - Make sure that @to->dentry is actually the root of a mount under + * which we can mount another mount. + * - Make sure that nothing can be mounted beneath the caller's current + * root or the rootfs of the namespace. + * - Make sure that the caller can unmount the topmost mount ensuring + * that the caller could reveal the underlying mountpoint. + * - Ensure that nothing has been mounted on top of @from before we + * grabbed @namespace_sem to avoid creating pointless shadow mounts. + * - Prevent mounting beneath a mount if the propagation relationship + * between the source mount, parent mount, and top mount would lead to + * nonsensical mount trees. + * + * Context: This function expects namespace_lock() to be held. + * Return: On success 0, and on error a negative error code is returned. + */ +static int can_move_mount_beneath(const struct path *from, + const struct path *to, + const struct mountpoint *mp) +{ + struct mount *mnt_from = real_mount(from->mnt), + *mnt_to = real_mount(to->mnt), + *parent_mnt_to = mnt_to->mnt_parent; + + if (!mnt_has_parent(mnt_to)) + return -EINVAL; + + if (!path_mounted(to)) + return -EINVAL; + + if (IS_MNT_LOCKED(mnt_to)) + return -EINVAL; + + /* Avoid creating shadow mounts during mount propagation. */ + if (path_overmounted(from)) + return -EINVAL; + + /* + * Mounting beneath the rootfs only makes sense when the + * semantics of pivot_root(".", ".") are used. + */ + if (&mnt_to->mnt == current->fs->root.mnt) + return -EINVAL; + if (parent_mnt_to == current->nsproxy->mnt_ns->root) + return -EINVAL; + + for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) + if (p == mnt_to) + return -EINVAL; + + /* + * If the parent mount propagates to the child mount this would + * mean mounting @mnt_from on @mnt_to->mnt_parent and then + * propagating a copy @c of @mnt_from on top of @mnt_to. This + * defeats the whole purpose of mounting beneath another mount. + */ + if (propagation_would_overmount(parent_mnt_to, mnt_to, mp)) + return -EINVAL; + + /* + * If @mnt_to->mnt_parent propagates to @mnt_from this would + * mean propagating a copy @c of @mnt_from on top of @mnt_from. + * Afterwards @mnt_from would be mounted on top of + * @mnt_to->mnt_parent and @mnt_to would be unmounted from + * @mnt->mnt_parent and remounted on @mnt_from. But since @c is + * already mounted on @mnt_from, @mnt_to would ultimately be + * remounted on top of @c. Afterwards, @mnt_from would be + * covered by a copy @c of @mnt_from and @c would be covered by + * @mnt_from itself. This defeats the whole purpose of mounting + * @mnt_from beneath @mnt_to. + */ + if (propagation_would_overmount(parent_mnt_to, mnt_from, mp)) + return -EINVAL; + + return 0; +} + +static int do_move_mount(struct path *old_path, struct path *new_path, + bool beneath) { struct mnt_namespace *ns; struct mount *p; @@ -2827,8 +3111,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path) struct mountpoint *mp, *old_mp; int err; bool attached; + enum mnt_tree_flags_t flags = 0; - mp = lock_mount(new_path); + mp = do_lock_mount(new_path, beneath); if (IS_ERR(mp)) return PTR_ERR(mp); @@ -2836,6 +3121,8 @@ static int do_move_mount(struct path *old_path, struct path *new_path) p = real_mount(new_path->mnt); parent = old->mnt_parent; attached = mnt_has_parent(old); + if (attached) + flags |= MNT_TREE_MOVE; old_mp = old->mnt_mp; ns = old->mnt_ns; @@ -2855,7 +3142,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path) if (old->mnt.mnt_flags & MNT_LOCKED) goto out; - if (old_path->dentry != old_path->mnt->mnt_root) + if (!path_mounted(old_path)) goto out; if (d_is_dir(new_path->dentry) != @@ -2866,6 +3153,17 @@ static int do_move_mount(struct path *old_path, struct path *new_path) */ if (attached && IS_MNT_SHARED(parent)) goto out; + + if (beneath) { + err = can_move_mount_beneath(old_path, new_path, mp); + if (err) + goto out; + + err = -EINVAL; + p = p->mnt_parent; + flags |= MNT_TREE_BENEATH; + } + /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. @@ -2879,8 +3177,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path) if (p == old) goto out; - err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, - attached); + err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); if (err) goto out; @@ -2912,7 +3209,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path); + err = do_move_mount(&old_path, path, false); path_put(&old_path); return err; } @@ -2937,8 +3234,7 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, } /* Refuse the same filesystem on the same mount point */ - if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && - path->mnt->mnt_root == path->dentry) + if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path)) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) @@ -3079,13 +3375,10 @@ int finish_automount(struct vfsmount *m, const struct path *path) err = -ENOENT; goto discard_locked; } - rcu_read_lock(); - if (unlikely(__lookup_mnt(path->mnt, dentry))) { - rcu_read_unlock(); + if (path_overmounted(path)) { err = 0; goto discard_locked; } - rcu_read_unlock(); mp = get_mountpoint(dentry); if (IS_ERR(mp)) { err = PTR_ERR(mp); @@ -3777,6 +4070,10 @@ SYSCALL_DEFINE5(move_mount, if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; + if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) == + (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) + return -EINVAL; + /* If someone gives a pathname, they aren't permitted to move * from an fd that requires unmount as we can't get at the flag * to clear it afterwards. @@ -3806,7 +4103,8 @@ SYSCALL_DEFINE5(move_mount, if (flags & MOVE_MOUNT_SET_GROUP) ret = do_set_group(&from_path, &to_path); else - ret = do_move_mount(&from_path, &to_path); + ret = do_move_mount(&from_path, &to_path, + (flags & MOVE_MOUNT_BENEATH)); out_to: path_put(&to_path); @@ -3917,11 +4215,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (new_mnt == root_mnt || old_mnt == root_mnt) goto out4; /* loop, on the same file system */ error = -EINVAL; - if (root.mnt->mnt_root != root.dentry) + if (!path_mounted(&root)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ - if (new.mnt->mnt_root != new.dentry) + if (!path_mounted(&new)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ @@ -3939,9 +4237,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount old root on put_old */ - attach_mnt(root_mnt, old_mnt, old_mp); + attach_mnt(root_mnt, old_mnt, old_mp, false); /* mount new_root on / */ - attach_mnt(new_mnt, root_parent, root_mp); + attach_mnt(new_mnt, root_parent, root_mp, false); mnt_add_count(root_parent, -1); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ @@ -4124,7 +4422,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) struct mount *mnt = real_mount(path->mnt); int err = 0; - if (path->dentry != mnt->mnt.mnt_root) + if (!path_mounted(path)) return -EINVAL; if (kattr->mnt_userns) { diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index f21259ead64b..4c9b87850ab1 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -80,6 +80,8 @@ enum { int nfsd_drc_slab_create(void); void nfsd_drc_slab_free(void); +int nfsd_net_reply_cache_init(struct nfsd_net *nn); +void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index ae85257b4238..11a0eaa2f914 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -97,7 +97,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) goto out; err = -EINVAL; - if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; @@ -107,7 +107,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) dprintk("found domain %s\n", buf); err = -EINVAL; - if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; fsidtype = simple_strtoul(buf, &ep, 10); if (*ep) @@ -593,7 +593,6 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ char *buf; - int len; int err; struct auth_domain *dom = NULL; struct svc_export exp = {}, *expp; @@ -609,8 +608,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) /* client */ err = -EINVAL; - len = qword_get(&mesg, buf, PAGE_SIZE); - if (len <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; @@ -620,7 +618,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) /* path */ err = -EINVAL; - if ((len = qword_get(&mesg, buf, PAGE_SIZE)) <= 0) + if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out1; err = kern_path(buf, 0, &exp.ex_path); @@ -665,7 +663,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out3; exp.ex_fsid = an_int; - while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) { + while (qword_get(&mesg, buf, PAGE_SIZE) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index e6bb8eeb5bc2..fc8d5b7db9f8 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -151,8 +151,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp) { struct nfsd3_readargs *argp = rqstp->rq_argp; struct nfsd3_readres *resp = rqstp->rq_resp; - unsigned int len; - int v; dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n", SVCFH_fmt(&argp->fh), @@ -166,17 +164,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp) if (argp->offset + argp->count > (u64)OFFSET_MAX) argp->count = (u64)OFFSET_MAX - argp->offset; - v = 0; - len = argp->count; resp->pages = rqstp->rq_next_page; - while (len > 0) { - struct page *page = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(page); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } /* Obtain buffer pointer for payload. * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) @@ -187,7 +175,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, v, &resp->count, &resp->eof); + &resp->count, &resp->eof); return rpc_success; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 3308dd671ef0..f32128955ec8 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -828,7 +828,8 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (xdr_stream_encode_u32(xdr, resp->len) < 0) return false; - xdr_write_pages(xdr, resp->pages, 0, resp->len); + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, 0, + resp->len); if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) return false; break; @@ -859,8 +860,9 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (xdr_stream_encode_u32(xdr, resp->count) < 0) return false; - xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, - resp->count); + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, + rqstp->rq_res.page_base, + resp->count); if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) return false; break; @@ -961,7 +963,8 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (!svcxdr_encode_cookieverf3(xdr, resp->verf)) return false; - xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0, + dirlist->len); /* no more entries */ if (xdr_stream_encode_item_absent(xdr) < 0) return false; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 76db2fe29624..26b1343c8035 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2541,6 +2541,20 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, return p; } +static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr, + struct timespec64 *tv) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 3); + if (!p) + return nfserr_resource; + + p = xdr_encode_hyper(p, (s64)tv->tv_sec); + *p = cpu_to_be32(tv->tv_nsec); + return nfs_ok; +} + /* * ctime (in NFSv4, time_metadata) is not writeable, and the client * doesn't really care what resolution could theoretically be stored by @@ -2566,12 +2580,16 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode) return p; } -static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c) +static __be32 +nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c) { - *p++ = cpu_to_be32(c->atomic); - p = xdr_encode_hyper(p, c->before_change); - p = xdr_encode_hyper(p, c->after_change); - return p; + if (xdr_stream_encode_bool(xdr, c->atomic) < 0) + return nfserr_resource; + if (xdr_stream_encode_u64(xdr, c->before_change) < 0) + return nfserr_resource; + if (xdr_stream_encode_u64(xdr, c->after_change) < 0) + return nfserr_resource; + return nfs_ok; } /* Encode as an array of strings the string given with components @@ -3348,11 +3366,9 @@ out_acl: p = xdr_encode_hyper(p, dummy64); } if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec); - *p++ = cpu_to_be32(stat.atime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.atime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_TIME_DELTA) { p = xdr_reserve_space(xdr, 12); @@ -3361,25 +3377,19 @@ out_acl: p = encode_time_delta(p, d_inode(dentry)); } if (bmval1 & FATTR4_WORD1_TIME_METADATA) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec); - *p++ = cpu_to_be32(stat.ctime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.ctime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec); - *p++ = cpu_to_be32(stat.mtime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.mtime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_TIME_CREATE) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec); - *p++ = cpu_to_be32(stat.btime.tv_nsec); + status = nfsd4_encode_nfstime4(xdr, &stat.btime); + if (status) + goto out; } if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { u64 ino = stat.ino; @@ -3689,6 +3699,30 @@ fail: } static __be32 +nfsd4_encode_verifier4(struct xdr_stream *xdr, const nfs4_verifier *verf) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + memcpy(p, verf->data, sizeof(verf->data)); + return nfs_ok; +} + +static __be32 +nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(__be64)); + if (!p) + return nfserr_resource; + memcpy(p, clientid, sizeof(*clientid)); + return nfs_ok; +} + +static __be32 nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) { __be32 *p; @@ -3752,15 +3786,8 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_commit *commit = &u->commit; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, commit->co_verf.data, - NFS4_VERIFIER_SIZE); - return 0; + return nfsd4_encode_verifier4(resp->xdr, &commit->co_verf); } static __be32 @@ -3769,12 +3796,10 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_create *create = &u->create; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - encode_cinfo(p, &create->cr_cinfo); + nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo); + if (nfserr) + return nfserr; return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], create->cr_bmval[1], create->cr_bmval[2]); } @@ -3892,13 +3917,8 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_link *link = &u->link; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &link->li_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &link->li_cinfo); } @@ -3913,11 +3933,11 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); if (nfserr) return nfserr; - p = xdr_reserve_space(xdr, 24); - if (!p) + nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo); + if (nfserr) + return nfserr; + if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0) return nfserr_resource; - p = encode_cinfo(p, &open->op_cinfo); - *p++ = cpu_to_be32(open->op_rflags); nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], open->op_bmval[2]); @@ -3956,7 +3976,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, p = xdr_reserve_space(xdr, 32); if (!p) return nfserr_resource; - *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(open->op_recall); /* * TODO: space_limit's in delegations @@ -4018,6 +4038,11 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, return nfsd4_encode_stateid(xdr, &od->od_stateid); } +/* + * The operation of this function assumes that this is the only + * READ operation in the COMPOUND. If there are multiple READs, + * we use nfsd4_encode_readv(). + */ static __be32 nfsd4_encode_splice_read( struct nfsd4_compoundres *resp, struct nfsd4_read *read, @@ -4028,8 +4053,12 @@ static __be32 nfsd4_encode_splice_read( int status, space_left; __be32 nfserr; - /* Make sure there will be room for padding if needed */ - if (xdr->end - xdr->p < 1) + /* + * Make sure there is room at the end of buf->head for + * svcxdr_encode_opaque_pages() to create a tail buffer + * to XDR-pad the payload. + */ + if (xdr->iov != xdr->buf->head || xdr->end - xdr->p < 1) return nfserr_resource; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, @@ -4038,6 +4067,8 @@ static __be32 nfsd4_encode_splice_read( read->rd_length = maxcount; if (nfserr) goto out_err; + svcxdr_encode_opaque_pages(read->rd_rqstp, xdr, buf->pages, + buf->page_base, maxcount); status = svc_encode_result_payload(read->rd_rqstp, buf->head[0].iov_len, maxcount); if (status) { @@ -4045,31 +4076,19 @@ static __be32 nfsd4_encode_splice_read( goto out_err; } - buf->page_len = maxcount; - buf->len += maxcount; - xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1) - / PAGE_SIZE; - - /* Use rest of head for padding and remaining ops: */ - buf->tail[0].iov_base = xdr->p; - buf->tail[0].iov_len = 0; - xdr->iov = buf->tail; - if (maxcount&3) { - int pad = 4 - (maxcount&3); - - *(xdr->p++) = 0; - - buf->tail[0].iov_base += maxcount&3; - buf->tail[0].iov_len = pad; - buf->len += pad; - } - + /* + * Prepare to encode subsequent operations. + * + * xdr_truncate_encode() is not safe to use after a successful + * splice read has been done, so the following stream + * manipulations are open-coded. + */ space_left = min_t(int, (void *)xdr->end - (void *)xdr->p, buf->buflen - buf->len); buf->buflen = buf->len + space_left; xdr->end = (__be32 *)((void *)xdr->end + space_left); - return 0; + return nfs_ok; out_err: /* @@ -4090,13 +4109,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, __be32 zero = xdr_zero; __be32 nfserr; - read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount); - if (read->rd_vlen < 0) + if (xdr_reserve_space_vec(xdr, maxcount) < 0) return nfserr_resource; - nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, &maxcount, - &read->rd_eof); + nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file, + read->rd_offset, &maxcount, + xdr->buf->page_len & ~PAGE_MASK, + &read->rd_eof); read->rd_length = maxcount; if (nfserr) return nfserr; @@ -4213,15 +4232,9 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, int starting_len = xdr->buf->len; __be32 *p; - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - - /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */ - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - xdr->buf->head[0].iov_len = (char *)xdr->p - - (char *)xdr->buf->head[0].iov_base; + nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf); + if (nfserr != nfs_ok) + return nfserr; /* * Number of bytes left for directory entries allowing for the @@ -4299,13 +4312,8 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_remove *remove = &u->remove; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &remove->rm_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &remove->rm_cinfo); } static __be32 @@ -4314,14 +4322,11 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_rename *rename = &u->rename; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 40); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &rename->rn_sinfo); - p = encode_cinfo(p, &rename->rn_tinfo); - return 0; + nfserr = nfsd4_encode_change_info4(xdr, &rename->rn_sinfo); + if (nfserr) + return nfserr; + return nfsd4_encode_change_info4(xdr, &rename->rn_tinfo); } static __be32 @@ -4448,23 +4453,25 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_setclientid *scd = &u->setclientid; struct xdr_stream *xdr = resp->xdr; - __be32 *p; if (!nfserr) { - p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8); - p = xdr_encode_opaque_fixed(p, &scd->se_confirm, - NFS4_VERIFIER_SIZE); - } - else if (nfserr == nfserr_clid_inuse) { - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); + nfserr = nfsd4_encode_clientid4(xdr, &scd->se_clientid); + if (nfserr != nfs_ok) + goto out; + nfserr = nfsd4_encode_verifier4(xdr, &scd->se_confirm); + } else if (nfserr == nfserr_clid_inuse) { + /* empty network id */ + if (xdr_stream_encode_u32(xdr, 0) < 0) { + nfserr = nfserr_resource; + goto out; + } + /* empty universal address */ + if (xdr_stream_encode_u32(xdr, 0) < 0) { + nfserr = nfserr_resource; + goto out; + } } +out: return nfserr; } @@ -4473,17 +4480,12 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_write *write = &u->write; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 16); - if (!p) + if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0) return nfserr_resource; - *p++ = cpu_to_be32(write->wr_bytes_written); - *p++ = cpu_to_be32(write->wr_how_written); - p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, - NFS4_VERIFIER_SIZE); - return 0; + if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0) + return nfserr_resource; + return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier); } static __be32 @@ -4505,20 +4507,15 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, server_scope = nn->nfsd_name; server_scope_sz = strlen(nn->nfsd_name); - p = xdr_reserve_space(xdr, - 8 /* eir_clientid */ + - 4 /* eir_sequenceid */ + - 4 /* eir_flags */ + - 4 /* spr_how */); - if (!p) + if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok) + return nfserr_resource; + if (xdr_stream_encode_u32(xdr, exid->seqid) < 0) + return nfserr_resource; + if (xdr_stream_encode_u32(xdr, exid->flags) < 0) return nfserr_resource; - p = xdr_encode_opaque_fixed(p, &exid->clientid, 8); - *p++ = cpu_to_be32(exid->seqid); - *p++ = cpu_to_be32(exid->flags); - - *p++ = cpu_to_be32(exid->spa_how); - + if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0) + return nfserr_resource; switch (exid->spa_how) { case SP4_NONE: break; @@ -5099,15 +5096,8 @@ nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_setxattr *setxattr = &u->setxattr; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - - encode_cinfo(p, &setxattr->setxa_cinfo); - - return 0; + return nfsd4_encode_change_info4(xdr, &setxattr->setxa_cinfo); } /* @@ -5253,14 +5243,8 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_removexattr *removexattr = &u->removexattr; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - - p = encode_cinfo(p, &removexattr->rmxa_cinfo); - return 0; + return nfsd4_encode_change_info4(xdr, &removexattr->rmxa_cinfo); } typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u); @@ -5460,6 +5444,12 @@ status: release: if (opdesc && opdesc->op_release) opdesc->op_release(&op->u); + + /* + * Account for pages consumed while encoding this operation. + * The xdr_stream primitives don't manage rq_next_page. + */ + rqstp->rq_next_page = xdr->page_ptr + 1; } /* @@ -5528,9 +5518,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr) p = resp->statusp; *p++ = resp->cstate.status; - - rqstp->rq_next_page = xdr->page_ptr + 1; - *p++ = htonl(resp->taglen); memcpy(p, resp->tag, resp->taglen); p += XDR_QUADLEN(resp->taglen); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 041faa13b852..a8eda1c85829 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -148,12 +148,23 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } -static int nfsd_reply_cache_stats_init(struct nfsd_net *nn) +/** + * nfsd_net_reply_cache_init - per net namespace reply cache set-up + * @nn: nfsd_net being initialized + * + * Returns zero on succes; otherwise a negative errno is returned. + */ +int nfsd_net_reply_cache_init(struct nfsd_net *nn) { return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); } -static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn) +/** + * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down + * @nn: nfsd_net being freed + * + */ +void nfsd_net_reply_cache_destroy(struct nfsd_net *nn) { nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); } @@ -169,17 +180,13 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); - status = nfsd_reply_cache_stats_init(nn); - if (status) - goto out_nomem; - nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; nn->nfsd_reply_cache_shrinker.seeks = 1; status = register_shrinker(&nn->nfsd_reply_cache_shrinker, "nfsd-reply:%s", nn->nfsd_name); if (status) - goto out_stats_destroy; + return status; nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); @@ -195,9 +202,6 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: unregister_shrinker(&nn->nfsd_reply_cache_shrinker); -out_stats_destroy: - nfsd_reply_cache_stats_destroy(nn); -out_nomem: printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; } @@ -217,7 +221,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) rp, nn); } } - nfsd_reply_cache_stats_destroy(nn); kvfree(nn->drc_hashtbl); nn->drc_hashtbl = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index b4fd7a7062d5..1b8b1aab9a15 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -25,6 +25,7 @@ #include "netns.h" #include "pnfs.h" #include "filecache.h" +#include "trace.h" /* * We have a single directory with several nodes in it. @@ -109,12 +110,12 @@ static ssize_t nfsctl_transaction_write(struct file *file, const char __user *bu if (IS_ERR(data)) return PTR_ERR(data); - rv = write_op[ino](file, data, size); - if (rv >= 0) { - simple_transaction_set(file, rv); - rv = size; - } - return rv; + rv = write_op[ino](file, data, size); + if (rv < 0) + return rv; + + simple_transaction_set(file, rv); + return size; } static ssize_t nfsctl_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) @@ -230,6 +231,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (rpc_pton(net, fo_path, size, sap, salen) == 0) return -EINVAL; + trace_nfsd_ctl_unlock_ip(net, buf); return nlmsvc_unlock_all_by_ip(sap); } @@ -263,7 +265,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) fo_path = buf; if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - + trace_nfsd_ctl_unlock_fs(netns(file), fo_path); error = kern_path(fo_path, 0, &path); if (error) return error; @@ -324,7 +326,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) len = qword_get(&mesg, dname, size); if (len <= 0) return -EINVAL; - + path = dname+len+1; len = qword_get(&mesg, path, size); if (len <= 0) @@ -338,15 +340,17 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) return -EINVAL; maxsize = min(maxsize, NFS3_FHSIZE); - if (qword_get(&mesg, mesg, size)>0) + if (qword_get(&mesg, mesg, size) > 0) return -EINVAL; + trace_nfsd_ctl_filehandle(netns(file), dname, path, maxsize); + /* we have all the words, they are in buf.. */ dom = unix_domain_find(dname); if (!dom) return -ENOMEM; - len = exp_rootfh(netns(file), dom, path, &fh, maxsize); + len = exp_rootfh(netns(file), dom, path, &fh, maxsize); auth_domain_put(dom); if (len) return len; @@ -399,6 +403,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return rv; if (newthreads < 0) return -EINVAL; + trace_nfsd_ctl_threads(net, newthreads); rv = nfsd_svc(newthreads, net, file->f_cred); if (rv < 0) return rv; @@ -418,8 +423,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: C string containing whitespace- - * separated unsigned integer values + * buf: C string containing whitespace- + * separated unsigned integer values * representing the number of NFSD * threads to start in each pool * size: non-zero length of C string in @buf @@ -471,6 +476,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) rv = -EINVAL; if (nthreads[i] < 0) goto out_free; + trace_nfsd_ctl_pool_threads(net, i, nthreads[i]); } rv = nfsd_set_nrthreads(i, nthreads, net); if (rv) @@ -526,7 +532,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) char *sep; struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); - if (size>0) { + if (size > 0) { if (nn->nfsd_serv) /* Cannot change versions without updating * nn->nfsd_serv->sv_xdrsize, and reallocing @@ -536,6 +542,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) if (buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; + trace_nfsd_ctl_version(netns(file), buf); vers = mesg; len = qword_get(&mesg, vers, size); @@ -637,11 +644,11 @@ out: * OR * * Input: - * buf: C string containing whitespace- - * separated positive or negative - * integer values representing NFS - * protocol versions to enable ("+n") - * or disable ("-n") + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") * size: non-zero length of C string in @buf * Output: * On success: status of zero or more protocol versions has @@ -689,6 +696,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred err = get_int(&mesg, &fd); if (err != 0 || fd < 0) return -EINVAL; + trace_nfsd_ctl_ports_addfd(net, fd); err = nfsd_create_serv(net); if (err != 0) @@ -705,7 +713,7 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred } /* - * A transport listener is added by writing it's transport name and + * A transport listener is added by writing its transport name and * a port number. */ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cred *cred) @@ -720,6 +728,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr if (port < 1 || port > USHRT_MAX) return -EINVAL; + trace_nfsd_ctl_ports_addxprt(net, transport, port); err = nfsd_create_serv(net); if (err != 0) @@ -832,9 +841,9 @@ int nfsd_max_blksize; * OR * * Input: - * buf: C string containing an unsigned - * integer value representing the new - * NFS blksize + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string @@ -853,6 +862,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) int rv = get_int(&mesg, &bsize); if (rv) return rv; + trace_nfsd_ctl_maxblksize(netns(file), bsize); + /* force bsize into allowed range and * required alignment. */ @@ -881,9 +892,9 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: C string containing an unsigned - * integer value representing the new - * number of max connections + * buf: C string containing an unsigned + * integer value representing the new + * number of max connections * size: non-zero length of C string in @buf * Output: * On success: passed-in buffer filled with '\n'-terminated C string @@ -903,6 +914,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size) if (rv) return rv; + trace_nfsd_ctl_maxconn(netns(file), maxconn); nn->max_connections = maxconn; } @@ -913,6 +925,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size) static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time64_t *time, struct nfsd_net *nn) { + struct dentry *dentry = file_dentry(file); char *mesg = buf; int rv, i; @@ -922,6 +935,9 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, rv = get_int(&mesg, &i); if (rv) return rv; + trace_nfsd_ctl_time(netns(file), dentry->d_name.name, + dentry->d_name.len, i); + /* * Some sanity checking. We don't have a reason for * these particular numbers, but problems with the @@ -1014,6 +1030,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, len = qword_get(&mesg, recdir, size); if (len <= 0) return -EINVAL; + trace_nfsd_ctl_recoverydir(netns(file), recdir); status = nfs4_reset_recoverydir(recdir); if (status) @@ -1065,7 +1082,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) * OR * * Input: - * buf: any value + * buf: any value * size: non-zero length of C string in @buf * Output: * passed-in buffer filled with "Y" or "N" with a newline @@ -1087,7 +1104,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case '1': if (!nn->nfsd_serv) return -EBUSY; - nfsd4_end_grace(nn); + trace_nfsd_end_grace(netns(file)); break; default: return -EINVAL; @@ -1192,8 +1209,8 @@ static int __nfsd_symlink(struct inode *dir, struct dentry *dentry, * @content is assumed to be a NUL-terminated string that lives * longer than the symlink itself. */ -static void nfsd_symlink(struct dentry *parent, const char *name, - const char *content) +static void _nfsd_symlink(struct dentry *parent, const char *name, + const char *content) { struct inode *dir = parent->d_inode; struct dentry *dentry; @@ -1210,8 +1227,8 @@ out: inode_unlock(dir); } #else -static inline void nfsd_symlink(struct dentry *parent, const char *name, - const char *content) +static inline void _nfsd_symlink(struct dentry *parent, const char *name, + const char *content) { } @@ -1389,8 +1406,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) ret = simple_fill_super(sb, 0x6e667364, nfsd_files); if (ret) return ret; - nfsd_symlink(sb->s_root, "supported_krb5_enctypes", - "/proc/net/rpc/gss_krb5_enctypes"); + _nfsd_symlink(sb->s_root, "supported_krb5_enctypes", + "/proc/net/rpc/gss_krb5_enctypes"); dentry = nfsd_mkdir(sb->s_root, NULL, "clients"); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -1477,7 +1494,17 @@ static int create_proc_exports_entry(void) unsigned int nfsd_net_id; -static __net_init int nfsd_init_net(struct net *net) +/** + * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace + * @net: a freshly-created network namespace + * + * This information stays around as long as the network namespace is + * alive whether or not there is an NFSD instance running in the + * namespace. + * + * Returns zero on success, or a negative errno otherwise. + */ +static __net_init int nfsd_net_init(struct net *net) { int retval; struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -1488,6 +1515,9 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; + retval = nfsd_net_reply_cache_init(nn); + if (retval) + goto out_repcache_error; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); @@ -1496,22 +1526,32 @@ static __net_init int nfsd_init_net(struct net *net) return 0; +out_repcache_error: + nfsd_idmap_shutdown(net); out_idmap_error: nfsd_export_shutdown(net); out_export_error: return retval; } -static __net_exit void nfsd_exit_net(struct net *net) +/** + * nfsd_net_exit - Release the nfsd_net portion of a net namespace + * @net: a network namespace that is about to be destroyed + * + */ +static __net_exit void nfsd_net_exit(struct net *net) { + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + nfsd_net_reply_cache_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); - nfsd_netns_free_versions(net_generic(net, nfsd_net_id)); + nfsd_netns_free_versions(nn); } static struct pernet_operations nfsd_net_ops = { - .init = nfsd_init_net, - .exit = nfsd_exit_net, + .init = nfsd_net_init, + .exit = nfsd_net_exit, .id = &nfsd_net_id, .size = sizeof(struct nfsd_net), }; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ccd8485fee04..e8e13ae72e3c 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -623,16 +623,9 @@ void fh_fill_pre_attrs(struct svc_fh *fhp) inode = d_inode(fhp->fh_dentry); err = fh_getattr(fhp, &stat); - if (err) { - /* Grab the times from inode anyway */ - stat.mtime = inode->i_mtime; - stat.ctime = inode->i_ctime; - stat.size = inode->i_size; - if (v4 && IS_I_VERSION(inode)) { - stat.change_cookie = inode_query_iversion(inode); - stat.result_mask |= STATX_CHANGE_COOKIE; - } - } + if (err) + return; + if (v4) fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode); @@ -660,15 +653,10 @@ void fh_fill_post_attrs(struct svc_fh *fhp) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); - if (err) { - fhp->fh_post_saved = false; - fhp->fh_post_attr.ctime = inode->i_ctime; - if (v4 && IS_I_VERSION(inode)) { - fhp->fh_post_attr.change_cookie = inode_query_iversion(inode); - fhp->fh_post_attr.result_mask |= STATX_CHANGE_COOKIE; - } - } else - fhp->fh_post_saved = true; + if (err) + return; + + fhp->fh_post_saved = true; if (v4) fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr, inode); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index c37195572fd0..a7315928a760 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -176,9 +176,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) { struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; - unsigned int len; u32 eof; - int v; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -187,17 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); - v = 0; - len = argp->count; resp->pages = rqstp->rq_next_page; - while (len > 0) { - struct page *page = *(rqstp->rq_next_page++); - - rqstp->rq_vec[v].iov_base = page_address(page); - rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE); - len -= rqstp->rq_vec[v].iov_len; - v++; - } /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. @@ -207,7 +195,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) resp->count = argp->count; fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, v, &resp->count, &eof); + &resp->count, &eof); if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9c7b1ef5be40..2154fa63c5f2 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -402,6 +402,11 @@ void nfsd_reset_write_verifier(struct nfsd_net *nn) write_sequnlock(&nn->writeverf_lock); } +/* + * Crank up a set of per-namespace resources for a new NFSD instance, + * including lockd, a duplicate reply cache, an open file cache + * instance, and a cache of NFSv4 state objects. + */ static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index caf6355b18fa..5777f40c7353 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -468,7 +468,8 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr) case nfs_ok: if (xdr_stream_encode_u32(xdr, resp->len) < 0) return false; - xdr_write_pages(xdr, &resp->page, 0, resp->len); + svcxdr_encode_opaque_pages(rqstp, xdr, &resp->page, 0, + resp->len); if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0) return false; break; @@ -491,8 +492,9 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; if (xdr_stream_encode_u32(xdr, resp->count) < 0) return false; - xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base, - resp->count); + svcxdr_encode_opaque_pages(rqstp, xdr, resp->pages, + rqstp->rq_res.page_base, + resp->count); if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0) return false; break; @@ -511,7 +513,8 @@ nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr) return false; switch (resp->status) { case nfs_ok: - xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len); + svcxdr_encode_opaque_pages(rqstp, xdr, dirlist->pages, 0, + dirlist->len); /* no more entries */ if (xdr_stream_encode_item_absent(xdr) < 0) return false; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 72a906a053dc..2af74983f146 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1581,6 +1581,265 @@ TRACE_EVENT(nfsd_cb_recall_any_done, ) ); +TRACE_EVENT(nfsd_ctl_unlock_ip, + TP_PROTO( + const struct net *net, + const char *address + ), + TP_ARGS(net, address), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(address, address) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(address, address); + ), + TP_printk("address=%s", + __get_str(address) + ) +); + +TRACE_EVENT(nfsd_ctl_unlock_fs, + TP_PROTO( + const struct net *net, + const char *path + ), + TP_ARGS(net, path), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(path, path) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(path, path); + ), + TP_printk("path=%s", + __get_str(path) + ) +); + +TRACE_EVENT(nfsd_ctl_filehandle, + TP_PROTO( + const struct net *net, + const char *domain, + const char *path, + int maxsize + ), + TP_ARGS(net, domain, path, maxsize), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, maxsize) + __string(domain, domain) + __string(path, path) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->maxsize = maxsize; + __assign_str(domain, domain); + __assign_str(path, path); + ), + TP_printk("domain=%s path=%s maxsize=%d", + __get_str(domain), __get_str(path), __entry->maxsize + ) +); + +TRACE_EVENT(nfsd_ctl_threads, + TP_PROTO( + const struct net *net, + int newthreads + ), + TP_ARGS(net, newthreads), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, newthreads) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->newthreads = newthreads; + ), + TP_printk("newthreads=%d", + __entry->newthreads + ) +); + +TRACE_EVENT(nfsd_ctl_pool_threads, + TP_PROTO( + const struct net *net, + int pool, + int nrthreads + ), + TP_ARGS(net, pool, nrthreads), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, pool) + __field(int, nrthreads) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->pool = pool; + __entry->nrthreads = nrthreads; + ), + TP_printk("pool=%d nrthreads=%d", + __entry->pool, __entry->nrthreads + ) +); + +TRACE_EVENT(nfsd_ctl_version, + TP_PROTO( + const struct net *net, + const char *mesg + ), + TP_ARGS(net, mesg), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(mesg, mesg) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(mesg, mesg); + ), + TP_printk("%s", + __get_str(mesg) + ) +); + +TRACE_EVENT(nfsd_ctl_ports_addfd, + TP_PROTO( + const struct net *net, + int fd + ), + TP_ARGS(net, fd), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, fd) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->fd = fd; + ), + TP_printk("fd=%d", + __entry->fd + ) +); + +TRACE_EVENT(nfsd_ctl_ports_addxprt, + TP_PROTO( + const struct net *net, + const char *transport, + int port + ), + TP_ARGS(net, transport, port), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, port) + __string(transport, transport) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->port = port; + __assign_str(transport, transport); + ), + TP_printk("transport=%s port=%d", + __get_str(transport), __entry->port + ) +); + +TRACE_EVENT(nfsd_ctl_maxblksize, + TP_PROTO( + const struct net *net, + int bsize + ), + TP_ARGS(net, bsize), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, bsize) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->bsize = bsize; + ), + TP_printk("bsize=%d", + __entry->bsize + ) +); + +TRACE_EVENT(nfsd_ctl_maxconn, + TP_PROTO( + const struct net *net, + int maxconn + ), + TP_ARGS(net, maxconn), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, maxconn) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->maxconn = maxconn; + ), + TP_printk("maxconn=%d", + __entry->maxconn + ) +); + +TRACE_EVENT(nfsd_ctl_time, + TP_PROTO( + const struct net *net, + const char *name, + size_t namelen, + int time + ), + TP_ARGS(net, name, namelen, time), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(int, time) + __string_len(name, name, namelen) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __entry->time = time; + __assign_str_len(name, name, namelen); + ), + TP_printk("file=%s time=%d\n", + __get_str(name), __entry->time + ) +); + +TRACE_EVENT(nfsd_ctl_recoverydir, + TP_PROTO( + const struct net *net, + const char *recdir + ), + TP_ARGS(net, recdir), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __string(recdir, recdir) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + __assign_str(recdir, recdir); + ), + TP_printk("recdir=%s", + __get_str(recdir) + ) +); + +TRACE_EVENT(nfsd_end_grace, + TP_PROTO( + const struct net *net + ), + TP_ARGS(net), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + ), + TP_fast_assign( + __entry->netns_ino = net->ns.inum; + ), + TP_printk("nn=%d", __entry->netns_ino + ) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index db67f8e19344..59b7d60ae33e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -388,7 +388,9 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) iap->ia_mode &= ~S_ISGID; } else { /* set ATTR_KILL_* bits and let VFS handle it */ - iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); + iap->ia_valid |= ATTR_KILL_SUID; + iap->ia_valid |= + setattr_should_drop_sgid(&nop_mnt_idmap, inode); } } } @@ -1001,6 +1003,18 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } } +/** + * nfsd_splice_read - Perform a VFS read using a splice pipe + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @file: opened struct file of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @eof: OUT: set non-zero if operation reached the end of the file + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. + */ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, u32 *eof) @@ -1014,22 +1028,50 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); - rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } -__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *count, - u32 *eof) +/** + * nfsd_iter_read - Perform a VFS read using an iterator + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @file: opened struct file of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @base: offset in first page of read buffer + * @eof: OUT: set non-zero if operation reached the end of the file + * + * Some filesystems or situations cannot use nfsd_splice_read. This + * function is the slightly less-performant fallback for those cases. + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. + */ +__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, unsigned long *count, + unsigned int base, u32 *eof) { + unsigned long v, total; struct iov_iter iter; loff_t ppos = offset; + struct page *page; ssize_t host_err; + v = 0; + total = *count; + while (total) { + page = *(rqstp->rq_next_page++); + rqstp->rq_vec[v].iov_base = page_address(page) + base; + rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base); + total -= rqstp->rq_vec[v].iov_len; + ++v; + base = 0; + } + WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec)); + trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, ITER_DEST, vec, vlen, *count); + iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count); host_err = vfs_iter_read(file, &iter, &ppos, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1159,14 +1201,24 @@ out_nfserr: return nfserr; } -/* - * Read data from a file. count must contain the requested read count - * on entry. On return, *count contains the number of bytes actually read. +/** + * nfsd_read - Read data from a file + * @rqstp: RPC transaction context + * @fhp: file handle of file to be read + * @offset: starting byte offset + * @count: IN: requested number of bytes; OUT: number of bytes read + * @eof: OUT: set non-zero if operation reached the end of the file + * + * The caller must verify that there is enough space in @rqstp.rq_res + * to perform this operation. + * * N.B. After this call fhp needs an fh_put + * + * Returns nfs_ok on success, otherwise an nfserr stat value is + * returned. */ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count, - u32 *eof) + loff_t offset, unsigned long *count, u32 *eof) { struct nfsd_file *nf; struct file *file; @@ -1181,12 +1233,10 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof); + err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof); nfsd_file_put(nf); - trace_nfsd_read_done(rqstp, fhp, offset, *count); - return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 43fb57a301d3..a6890ea7b765 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -110,13 +110,12 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, u32 *eof); -__be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, +__be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - struct kvec *vec, int vlen, - unsigned long *count, + unsigned long *count, unsigned int base, u32 *eof); -__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, struct kvec *, int, unsigned long *, +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, unsigned long *count, u32 *eof); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *, diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index e956f886a1a1..5710833ac1cc 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -285,6 +285,14 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, if (nbh == NULL) { /* blocksize == pagesize */ xa_erase_irq(&btnc->i_pages, newkey); unlock_page(ctxt->bh->b_page); - } else - brelse(nbh); + } else { + /* + * When canceling a buffer that a prepare operation has + * allocated to copy a node block to another location, use + * nilfs_btnode_delete() to initialize and release the buffer + * so that the buffer flags will not be in an inconsistent + * state when it is reallocated. + */ + nilfs_btnode_delete(nbh); + } } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 5cf30827f244..b4e54d079b7d 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) struct folio *folio = fbatch.folios[i]; folio_lock(folio); - nilfs_clear_dirty_page(&folio->page, silent); + + /* + * This folio may have been removed from the address + * space by truncation or invalidation when the lock + * was acquired. Skip processing in that case. + */ + if (likely(folio->mapping == mapping)) + nilfs_clear_dirty_page(&folio->page, silent); + folio_unlock(folio); } folio_batch_release(&fbatch); diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 1362ccb64ec7..6e59dc19a732 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) if (unlikely(!bh)) return -ENOMEM; + lock_buffer(bh); + if (!buffer_uptodate(bh)) { + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + } + unlock_buffer(bh); nilfs_segbuf_add_segsum_buffer(segbuf, bh); return 0; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index ac949fd7603f..c2553024bd25 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -981,10 +981,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; + + lock_buffer(bh_sr); raw_sr = (struct nilfs_super_root *)bh_sr->b_data; isz = nilfs->ns_inode_size; srsz = NILFS_SR_BYTES(isz); + raw_sr->sr_sum = 0; /* Ensure initialization within this update */ raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? @@ -998,6 +1001,8 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz), 1); memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); + set_buffer_uptodate(bh_sr); + unlock_buffer(bh_sr); } static void nilfs_redirty_inodes(struct list_head *head) @@ -1780,6 +1785,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); @@ -1791,6 +1797,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err) b_assoc_buffers) { clear_buffer_async_write(bh); if (bh == segbuf->sb_super_root) { + clear_buffer_uptodate(bh); if (bh->b_page != bd_page) { end_page_writeback(bd_page); bd_page = bh->b_page; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index dc359b56fdfa..2c6078a6b8ec 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -779,6 +779,15 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) goto out_header; sui->ncleansegs -= nsegs - newnsegs; + + /* + * If the sufile is successfully truncated, immediately adjust + * the segment allocation space while locking the semaphore + * "mi_sem" so that nilfs_sufile_alloc() never allocates + * segments in the truncated space. + */ + sui->allocmax = newnsegs - 1; + sui->allocmin = 0; } kaddr = kmap_atomic(header_bh->b_page); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 77f1e5778d1c..9ba4933087af 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -372,10 +372,31 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) goto out; } nsbp = (void *)nsbh->b_data + offset; - memset(nsbp, 0, nilfs->ns_blocksize); + lock_buffer(nsbh); if (sb2i >= 0) { + /* + * The position of the second superblock only changes by 4KiB, + * which is larger than the maximum superblock data size + * (= 1KiB), so there is no need to use memmove() to allow + * overlap between source and destination. + */ memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + + /* + * Zero fill after copy to avoid overwriting in case of move + * within the same block. + */ + memset(nsbh->b_data, 0, offset); + memset((void *)nsbp + nilfs->ns_sbsize, 0, + nsbh->b_size - offset - nilfs->ns_sbsize); + } else { + memset(nsbh->b_data, 0, nsbh->b_size); + } + set_buffer_uptodate(nsbh); + unlock_buffer(nsbh); + + if (sb2i >= 0) { brelse(nilfs->ns_sbh[sb2i]); nilfs->ns_sbh[sb2i] = nsbh; nilfs->ns_sbp[sb2i] = nsbp; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2894152a6b25..0f0667957c81 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -405,6 +405,18 @@ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) 100)); } +/** + * nilfs_max_segment_count - calculate the maximum number of segments + * @nilfs: nilfs object + */ +static u64 nilfs_max_segment_count(struct the_nilfs *nilfs) +{ + u64 max_count = U64_MAX; + + do_div(max_count, nilfs->ns_blocks_per_segment); + return min_t(u64, max_count, ULONG_MAX); +} + void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) { nilfs->ns_nsegments = nsegs; @@ -414,6 +426,8 @@ void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) static int nilfs_store_disk_layout(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { + u64 nsegments, nblocks; + if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) { nilfs_err(nilfs->ns_sb, "unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).", @@ -457,7 +471,34 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, return -EINVAL; } - nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments)); + nsegments = le64_to_cpu(sbp->s_nsegments); + if (nsegments > nilfs_max_segment_count(nilfs)) { + nilfs_err(nilfs->ns_sb, + "segment count %llu exceeds upper limit (%llu segments)", + (unsigned long long)nsegments, + (unsigned long long)nilfs_max_segment_count(nilfs)); + return -EINVAL; + } + + nblocks = sb_bdev_nr_blocks(nilfs->ns_sb); + if (nblocks) { + u64 min_block_count = nsegments * nilfs->ns_blocks_per_segment; + /* + * To avoid failing to mount early device images without a + * second superblock, exclude that block count from the + * "min_block_count" calculation. + */ + + if (nblocks < min_block_count) { + nilfs_err(nilfs->ns_sb, + "total number of segment blocks %llu exceeds device size (%llu blocks)", + (unsigned long long)min_block_count, + (unsigned long long)nblocks); + return -EINVAL; + } + } + + nilfs_set_nsegments(nilfs, nsegments); nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); return 0; } diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index a3865bc4a0c6..f79408f9127a 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -2491,7 +2491,7 @@ conv_err_out: * byte offset @ofs inside the attribute with the constant byte @val. * * This function is effectively like memset() applied to an ntfs attribute. - * Note thie function actually only operates on the page cache pages belonging + * Note this function actually only operates on the page cache pages belonging * to the ntfs attribute and it marks them dirty after doing the memset(). * Thus it relies on the vm dirty page write code paths to cause the modified * pages to be written to the mft record/disk. diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index f9cb180b6f6b..761aaa0195d6 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -161,7 +161,7 @@ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], */ u8 *cb_end = cb_start + cb_size; /* End of cb. */ u8 *cb = cb_start; /* Current position in cb. */ - u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */ + u8 *cb_sb_start; /* Beginning of the current sb in the cb. */ u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ /* Variables for uncompressed data / destination. */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 48030899dc6e..0155f106ec34 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1955,36 +1955,38 @@ undo_alloc: "attribute.%s", es); NVolSetErrors(vol); } - a = ctx->attr; + if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { ntfs_error(vol->sb, "Failed to truncate mft data attribute " "runlist.%s", es); NVolSetErrors(vol); } - if (mp_rebuilt && !IS_ERR(ctx->mrec)) { - if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + if (ctx) { + a = ctx->attr; + if (mp_rebuilt && !IS_ERR(ctx->mrec)) { + if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu( a->data.non_resident.mapping_pairs_offset), old_alen - le16_to_cpu( - a->data.non_resident.mapping_pairs_offset), + a->data.non_resident.mapping_pairs_offset), rl2, ll, -1, NULL)) { - ntfs_error(vol->sb, "Failed to restore mapping pairs " + ntfs_error(vol->sb, "Failed to restore mapping pairs " "array.%s", es); - NVolSetErrors(vol); - } - if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { - ntfs_error(vol->sb, "Failed to restore attribute " + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } else if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to restore attribute search " + "context.%s", es); NVolSetErrors(vol); } - flush_dcache_mft_record_page(ctx->ntfs_ino); - mark_mft_record_dirty(ctx->ntfs_ino); - } else if (IS_ERR(ctx->mrec)) { - ntfs_error(vol->sb, "Failed to restore attribute search " - "context.%s", es); - NVolSetErrors(vol); - } - if (ctx) ntfs_attr_put_search_ctx(ctx); + } if (!IS_ERR(mrec)) unmap_mft_record(mft_ni); up_write(&mft_ni->runlist.lock); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 2643a08182e1..56a7d5bd33e4 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1620,7 +1620,7 @@ read_partial_attrdef_page: memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), page_address(page), size); ntfs_unmap_page(page); - }; + } if (size == PAGE_SIZE) { size = i_size & ~PAGE_MASK; if (size) @@ -1689,7 +1689,7 @@ read_partial_upcase_page: memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), page_address(page), size); ntfs_unmap_page(page); - }; + } if (size == PAGE_SIZE) { size = i_size & ~PAGE_MASK; if (size) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index efb09de4343d..b173c36bcab3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2100,14 +2100,20 @@ static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, struct ocfs2_space_resv sr; int change_size = 1; int cmd = OCFS2_IOC_RESVSP64; + int ret = 0; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; if (!ocfs2_writes_unwritten_extents(osb)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_KEEP_SIZE) + if (mode & FALLOC_FL_KEEP_SIZE) { change_size = 0; + } else { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + return ret; + } if (mode & FALLOC_FL_PUNCH_HOLE) cmd = OCFS2_IOC_UNRESVSP64; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0b0e6a132101..988d1c076861 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -952,8 +952,10 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; - oinfo = sb_dqinfo(sb, type)->dqi_priv; - cancel_delayed_work_sync(&oinfo->dqi_sync_work); + if (!sb_has_quota_suspended(sb, type)) { + oinfo = sb_dqinfo(sb, type)->dqi_priv; + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + } inode = igrab(sb->s_dquot.files[type]); /* Turn off quotas. This will remove all dquot structures from * memory and so they will be automatically synced to global diff --git a/fs/open.c b/fs/open.c index 4478adcc4f3a..fb07b2840eb4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -700,10 +700,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) return do_fchmodat(AT_FDCWD, filename, mode); } -/** - * setattr_vfsuid - check and set ia_fsuid attribute - * @kuid: new inode owner - * +/* * Check whether @kuid is valid and if so generate and set vfsuid_t in * ia_vfsuid. * @@ -718,10 +715,7 @@ static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid) return true; } -/** - * setattr_vfsgid - check and set ia_fsgid attribute - * @kgid: new inode owner - * +/* * Check whether @kgid is valid and if so generate and set vfsgid_t in * ia_vfsgid. * @@ -989,7 +983,6 @@ cleanup_file: * @file: file pointer * @dentry: pointer to dentry * @open: open callback - * @opened: state of open * * This can be used to finish opening a file passed to i_op->atomic_open(). * @@ -1043,7 +1036,6 @@ EXPORT_SYMBOL(file_path); * vfs_open - open the file at the given path * @path: path to open * @file: newly allocated file with f_flag initialized - * @cred: credentials to use */ int vfs_open(const struct path *path, struct file *file) { @@ -1116,23 +1108,77 @@ struct file *dentry_create(const struct path *path, int flags, umode_t mode, } EXPORT_SYMBOL(dentry_create); -struct file *open_with_fake_path(const struct path *path, int flags, +/** + * kernel_file_open - open a file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @inode: the inode + * @cred: credentials for open + * + * Open a file for use by in-kernel consumers. The file is not accounted + * against nr_files and must not be installed into the file descriptor + * table. + * + * Return: Opened file on success, an error pointer on failure. + */ +struct file *kernel_file_open(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { - struct file *f = alloc_empty_file_noaccount(flags, cred); - if (!IS_ERR(f)) { - int error; + struct file *f; + int error; - f->f_path = *path; - error = do_dentry_open(f, inode, NULL); - if (error) { - fput(f); - f = ERR_PTR(error); - } + f = alloc_empty_file_noaccount(flags, cred); + if (IS_ERR(f)) + return f; + + f->f_path = *path; + error = do_dentry_open(f, inode, NULL); + if (error) { + fput(f); + f = ERR_PTR(error); } return f; } -EXPORT_SYMBOL(open_with_fake_path); +EXPORT_SYMBOL_GPL(kernel_file_open); + +/** + * backing_file_open - open a backing file for kernel internal use + * @path: path of the file to open + * @flags: open flags + * @path: path of the backing file + * @cred: credentials for open + * + * Open a backing file for a stackable filesystem (e.g., overlayfs). + * @path may be on the stackable filesystem and backing inode on the + * underlying filesystem. In this case, we want to be able to return + * the @real_path of the backing inode. This is done by embedding the + * returned file into a container structure that also stores the path of + * the backing inode on the underlying filesystem, which can be + * retrieved using backing_file_real_path(). + */ +struct file *backing_file_open(const struct path *path, int flags, + const struct path *real_path, + const struct cred *cred) +{ + struct file *f; + int error; + + f = alloc_empty_backing_file(flags, cred); + if (IS_ERR(f)) + return f; + + f->f_path = *path; + path_get(real_path); + *backing_file_real_path(f) = *real_path; + error = do_dentry_open(f, d_inode(real_path->dentry), NULL); + if (error) { + fput(f); + f = ERR_PTR(error); + } + + return f; +} +EXPORT_SYMBOL_GPL(backing_file_open); #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) @@ -1156,7 +1202,7 @@ inline struct open_how build_open_how(int flags, umode_t mode) inline int build_open_flags(const struct open_how *how, struct open_flags *op) { u64 flags = how->flags; - u64 strip = FMODE_NONOTIFY | O_CLOEXEC; + u64 strip = __FMODE_NONOTIFY | O_CLOEXEC; int lookup_flags = 0; int acc_mode = ACC_MODE(flags); diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 7c04f033aadd..dbbb156c2d67 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -34,8 +34,8 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode) return 'm'; } -/* No atime modification nor notify on underlying */ -#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY) +/* No atime modification on underlying */ +#define OVL_OPEN_FLAGS (O_NOATIME) static struct file *ovl_open_realfile(const struct file *file, const struct path *realpath) @@ -61,8 +61,8 @@ static struct file *ovl_open_realfile(const struct file *file, if (!inode_owner_or_capable(real_idmap, realinode)) flags &= ~O_NOATIME; - realfile = open_with_fake_path(&file->f_path, flags, realinode, - current_cred()); + realfile = backing_file_open(&file->f_path, flags, realpath, + current_cred()); } revert_creds(old_cred); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4d0b278f5630..23686e8a06c4 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -329,8 +329,9 @@ static inline struct file *ovl_do_tmpfile(struct ovl_fs *ofs, struct dentry *dentry, umode_t mode) { struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = dentry }; - struct file *file = vfs_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, mode, - O_LARGEFILE | O_WRONLY, current_cred()); + struct file *file = kernel_tmpfile_open(ovl_upper_mnt_idmap(ofs), &path, + mode, O_LARGEFILE | O_WRONLY, + current_cred()); int err = PTR_ERR_OR_ZERO(file); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); diff --git a/fs/pnode.c b/fs/pnode.c index 3cede8b18c8b..e4d0340393d5 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -216,7 +216,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin) static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct hlist_head *list; -static inline bool peers(struct mount *m1, struct mount *m2) +static inline bool peers(const struct mount *m1, const struct mount *m2) { return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } @@ -354,6 +354,46 @@ static inline int do_refcount_check(struct mount *mnt, int count) return mnt_get_count(mnt) > count; } +/** + * propagation_would_overmount - check whether propagation from @from + * would overmount @to + * @from: shared mount + * @to: mount to check + * @mp: future mountpoint of @to on @from + * + * If @from propagates mounts to @to, @from and @to must either be peers + * or one of the masters in the hierarchy of masters of @to must be a + * peer of @from. + * + * If the root of the @to mount is equal to the future mountpoint @mp of + * the @to mount on @from then @to will be overmounted by whatever is + * propagated to it. + * + * Context: This function expects namespace_lock() to be held and that + * @mp is stable. + * Return: If @from overmounts @to, true is returned, false if not. + */ +bool propagation_would_overmount(const struct mount *from, + const struct mount *to, + const struct mountpoint *mp) +{ + if (!IS_MNT_SHARED(from)) + return false; + + if (IS_MNT_NEW(to)) + return false; + + if (to->mnt.mnt_root != mp->m_dentry) + return false; + + for (const struct mount *m = to; m; m = m->mnt_master) { + if (peers(from, m)) + return true; + } + + return false; +} + /* * check if the mount 'mnt' can be unmounted successfully. * @mnt: the mount to be checked for unmount diff --git a/fs/pnode.h b/fs/pnode.h index 988f1aa9b02a..0b02a6393891 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -53,4 +53,7 @@ struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); int count_mounts(struct mnt_namespace *ns, struct mount *mnt); +bool propagation_would_overmount(const struct mount *from, + const struct mount *to, + const struct mountpoint *mp); #endif /* _LINUX_PNODE_H */ diff --git a/fs/readdir.c b/fs/readdir.c index 9c53edb60c03..b264ce60114d 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -131,7 +131,7 @@ struct old_linux_dirent { unsigned long d_ino; unsigned long d_offset; unsigned short d_namlen; - char d_name[1]; + char d_name[]; }; struct readdir_callback { @@ -208,7 +208,7 @@ struct linux_dirent { unsigned long d_ino; unsigned long d_off; unsigned short d_reclen; - char d_name[1]; + char d_name[]; }; struct getdents_callback { @@ -388,7 +388,7 @@ struct compat_old_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_offset; unsigned short d_namlen; - char d_name[1]; + char d_name[]; }; struct compat_readdir_callback { @@ -460,7 +460,7 @@ struct compat_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_off; unsigned short d_reclen; - char d_name[1]; + char d_name[]; }; struct compat_getdents_callback { diff --git a/fs/remap_range.c b/fs/remap_range.c index 1331a890f2f2..87ae4f0dc3aa 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -15,6 +15,7 @@ #include <linux/mount.h> #include <linux/fs.h> #include <linux/dax.h> +#include <linux/overflow.h> #include "internal.h" #include <linux/uaccess.h> @@ -101,10 +102,12 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { + loff_t tmp; + if (unlikely(pos < 0 || len < 0)) return -EINVAL; - if (unlikely((loff_t) (pos + len) < 0)) + if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; return security_file_permission(file, write ? MAY_WRITE : MAY_READ); diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 5034b862cec2..b279f745466e 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/uaccess.h> +#include <uapi/linux/ethtool.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -130,12 +131,14 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) struct TCP_Server_Info *server = chan->server; seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx" - "\n\t\tNumber of credits: %d Dialect 0x%x" + "\n\t\tNumber of credits: %d,%d,%d Dialect 0x%x" "\n\t\tTCP status: %d Instance: %d" "\n\t\tLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d" "\n\t\tIn Send: %d In MaxReq Wait: %d", i+1, server->conn_id, server->credits, + server->echo_credits, + server->oplock_credits, server->dialect, server->tcpStatus, server->reconnect_instance, @@ -146,18 +149,62 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) atomic_read(&server->num_waiters)); } +static inline const char *smb_speed_to_str(size_t bps) +{ + size_t mbps = bps / 1000 / 1000; + + switch (mbps) { + case SPEED_10: + return "10Mbps"; + case SPEED_100: + return "100Mbps"; + case SPEED_1000: + return "1Gbps"; + case SPEED_2500: + return "2.5Gbps"; + case SPEED_5000: + return "5Gbps"; + case SPEED_10000: + return "10Gbps"; + case SPEED_14000: + return "14Gbps"; + case SPEED_20000: + return "20Gbps"; + case SPEED_25000: + return "25Gbps"; + case SPEED_40000: + return "40Gbps"; + case SPEED_50000: + return "50Gbps"; + case SPEED_56000: + return "56Gbps"; + case SPEED_100000: + return "100Gbps"; + case SPEED_200000: + return "200Gbps"; + case SPEED_400000: + return "400Gbps"; + case SPEED_800000: + return "800Gbps"; + default: + return "Unknown"; + } +} + static void cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) { struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; - seq_printf(m, "\tSpeed: %zu bps\n", iface->speed); + seq_printf(m, "\tSpeed: %s\n", smb_speed_to_str(iface->speed)); seq_puts(m, "\t\tCapabilities: "); if (iface->rdma_capable) seq_puts(m, "rdma "); if (iface->rss_capable) seq_puts(m, "rss "); + if (!iface->rdma_capable && !iface->rss_capable) + seq_puts(m, "None"); seq_putc(m, '\n'); if (iface->sockaddr.ss_family == AF_INET) seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); @@ -350,8 +397,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) atomic_read(&server->smbd_conn->mr_used_count)); skip_rdma: #endif - seq_printf(m, "\nNumber of credits: %d Dialect 0x%x", - server->credits, server->dialect); + seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x", + server->credits, + server->echo_credits, + server->oplock_credits, + server->dialect); if (server->compress_algorithm == SMB3_COMPRESS_LZNT1) seq_printf(m, " COMPRESS_LZNT1"); else if (server->compress_algorithm == SMB3_COMPRESS_LZ77) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 0d84bb1a8cd9..b212a4e16b39 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -970,43 +970,6 @@ release_iface(struct kref *ref) kfree(iface); } -/* - * compare two interfaces a and b - * return 0 if everything matches. - * return 1 if a has higher link speed, or rdma capable, or rss capable - * return -1 otherwise. - */ -static inline int -iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b) -{ - int cmp_ret = 0; - - WARN_ON(!a || !b); - if (a->speed == b->speed) { - if (a->rdma_capable == b->rdma_capable) { - if (a->rss_capable == b->rss_capable) { - cmp_ret = memcmp(&a->sockaddr, &b->sockaddr, - sizeof(a->sockaddr)); - if (!cmp_ret) - return 0; - else if (cmp_ret > 0) - return 1; - else - return -1; - } else if (a->rss_capable > b->rss_capable) - return 1; - else - return -1; - } else if (a->rdma_capable > b->rdma_capable) - return 1; - else - return -1; - } else if (a->speed > b->speed) - return 1; - else - return -1; -} - struct cifs_chan { unsigned int in_reconnect : 1; /* if session setup in progress for this channel */ struct TCP_Server_Info *server; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index c1c704990b98..d127aded2f28 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -87,6 +87,7 @@ extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); extern int smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx); extern int smb3_parse_opt(const char *options, const char *key, char **val); +extern int cifs_ipaddr_cmp(struct sockaddr *srcaddr, struct sockaddr *rhs); extern bool cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs); extern int cifs_discard_remaining_data(struct TCP_Server_Info *server); extern int cifs_call_async(struct TCP_Server_Info *server, diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 8e9a672320ab..9d16626e7a66 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1288,6 +1288,56 @@ next_pdu: module_put_and_kthread_exit(0); } +int +cifs_ipaddr_cmp(struct sockaddr *srcaddr, struct sockaddr *rhs) +{ + struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr; + struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs; + struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; + + switch (srcaddr->sa_family) { + case AF_UNSPEC: + switch (rhs->sa_family) { + case AF_UNSPEC: + return 0; + case AF_INET: + case AF_INET6: + return 1; + default: + return -1; + } + case AF_INET: { + switch (rhs->sa_family) { + case AF_UNSPEC: + return -1; + case AF_INET: + return memcmp(saddr4, vaddr4, + sizeof(struct sockaddr_in)); + case AF_INET6: + return 1; + default: + return -1; + } + } + case AF_INET6: { + switch (rhs->sa_family) { + case AF_UNSPEC: + case AF_INET: + return -1; + case AF_INET6: + return memcmp(saddr6, + vaddr6, + sizeof(struct sockaddr_in6)); + default: + return -1; + } + } + default: + return -1; /* don't expect to be here */ + } +} + /* * Returns true if srcaddr isn't specified and rhs isn't specified, or * if srcaddr is specified and matches the IP address of the rhs argument @@ -4086,16 +4136,17 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + if (tcon->status == TID_GOOD) { + spin_unlock(&tcon->tc_lock); + return 0; + } + if (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON) { spin_unlock(&tcon->tc_lock); return -EHOSTDOWN; } - if (tcon->status == TID_GOOD) { - spin_unlock(&tcon->tc_lock); - return 0; - } tcon->status = TID_IN_TCON; spin_unlock(&tcon->tc_lock); diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 2f93bf8c3325..2390b2fedd6a 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -575,16 +575,17 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru /* only send once per connect */ spin_lock(&tcon->tc_lock); + if (tcon->status == TID_GOOD) { + spin_unlock(&tcon->tc_lock); + return 0; + } + if (tcon->status != TID_NEW && tcon->status != TID_NEED_TCON) { spin_unlock(&tcon->tc_lock); return -EHOSTDOWN; } - if (tcon->status == TID_GOOD) { - spin_unlock(&tcon->tc_lock); - return 0; - } tcon->status = TID_IN_TCON; spin_unlock(&tcon->tc_lock); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index df88b8c04d03..051283386e22 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -4942,9 +4942,13 @@ oplock_break_ack: * disconnected since oplock already released by the server */ if (!oplock_break_cancelled) { - rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid, + /* check for server null since can race with kill_sb calling tree disconnect */ + if (tcon->ses && tcon->ses->server) { + rc = tcon->ses->server->ops->oplock_response(tcon, persistent_fid, volatile_fid, net_fid, cinode); - cifs_dbg(FYI, "Oplock release rc = %d\n", rc); + cifs_dbg(FYI, "Oplock release rc = %d\n", rc); + } else + pr_warn_once("lease break not sent for unmounted share\n"); } cifs_done_oplock_break(cinode); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 6e3be58cfe49..a8bb9d00d33a 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -34,6 +34,8 @@ static int change_conf(struct TCP_Server_Info *server) { server->credits += server->echo_credits + server->oplock_credits; + if (server->credits > server->max_credits) + server->credits = server->max_credits; server->oplock_credits = server->echo_credits = 0; switch (server->credits) { case 0: @@ -91,6 +93,7 @@ smb2_add_credits(struct TCP_Server_Info *server, server->conn_id, server->hostname, *val, add, server->in_flight); } + WARN_ON_ONCE(server->in_flight == 0); server->in_flight--; if (server->in_flight == 0 && ((optype & CIFS_OP_MASK) != CIFS_NEG_OP) && @@ -510,6 +513,43 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) return rsize; } +/* + * compare two interfaces a and b + * return 0 if everything matches. + * return 1 if a is rdma capable, or rss capable, or has higher link speed + * return -1 otherwise. + */ +static int +iface_cmp(struct cifs_server_iface *a, struct cifs_server_iface *b) +{ + int cmp_ret = 0; + + WARN_ON(!a || !b); + if (a->rdma_capable == b->rdma_capable) { + if (a->rss_capable == b->rss_capable) { + if (a->speed == b->speed) { + cmp_ret = cifs_ipaddr_cmp((struct sockaddr *) &a->sockaddr, + (struct sockaddr *) &b->sockaddr); + if (!cmp_ret) + return 0; + else if (cmp_ret > 0) + return 1; + else + return -1; + } else if (a->speed > b->speed) + return 1; + else + return -1; + } else if (a->rss_capable > b->rss_capable) + return 1; + else + return -1; + } else if (a->rdma_capable > b->rdma_capable) + return 1; + else + return -1; +} + static int parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, size_t buf_len, struct cifs_ses *ses, bool in_mount) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 7063b395d22f..17fe212ab895 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1305,7 +1305,12 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) } /* enough to enable echos and oplocks and one max size write */ - req->hdr.CreditRequest = cpu_to_le16(130); + if (server->credits >= server->max_credits) + req->hdr.CreditRequest = cpu_to_le16(0); + else + req->hdr.CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, 130)); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -1899,7 +1904,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, rqst.rq_nvec = 2; /* Need 64 for max size write so ask for more in case not there yet */ - req->hdr.CreditRequest = cpu_to_le16(64); + if (server->credits >= server->max_credits) + req->hdr.CreditRequest = cpu_to_le16(0); + else + req->hdr.CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, 64)); rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -4227,6 +4237,7 @@ smb2_async_readv(struct cifs_readdata *rdata) struct TCP_Server_Info *server; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); unsigned int total_len; + int credit_request; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@ -4258,7 +4269,13 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8); + credit_request = le16_to_cpu(shdr->CreditCharge) + 8; + if (server->credits >= server->max_credits) + shdr->CreditRequest = cpu_to_le16(0); + else + shdr->CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, credit_request)); rc = adjust_credits(server, &rdata->credits, rdata->bytes); if (rc) @@ -4468,6 +4485,7 @@ smb2_async_writev(struct cifs_writedata *wdata, unsigned int total_len; struct cifs_io_parms _io_parms; struct cifs_io_parms *io_parms = NULL; + int credit_request; if (!wdata->server) server = wdata->server = cifs_pick_channel(tcon->ses); @@ -4572,7 +4590,13 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); - shdr->CreditRequest = cpu_to_le16(le16_to_cpu(shdr->CreditCharge) + 8); + credit_request = le16_to_cpu(shdr->CreditCharge) + 8; + if (server->credits >= server->max_credits) + shdr->CreditRequest = cpu_to_le16(0); + else + shdr->CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, credit_request)); rc = adjust_credits(server, &wdata->credits, io_parms->length); if (rc) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 24bdd5f4d3bc..0474d0bba0a2 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -55,7 +55,7 @@ alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp->pid = current->pid; temp->command = cpu_to_le16(smb_buffer->Command); cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); - /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ + /* easier to use jiffies */ /* when mid allocated can be before when sent */ temp->when_alloc = jiffies; temp->server = server; diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 4882a812ea86..2a717d158f02 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -294,6 +294,9 @@ bool ksmbd_conn_alive(struct ksmbd_conn *conn) return true; } +#define SMB1_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb_hdr)) +#define SMB2_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr) + 4) + /** * ksmbd_conn_handler_loop() - session thread to listen on new smb requests * @p: connection instance @@ -350,6 +353,9 @@ int ksmbd_conn_handler_loop(void *p) if (pdu_size > MAX_STREAM_PROT_LEN) break; + if (pdu_size < SMB1_MIN_SUPPORTED_HEADER_SIZE) + break; + /* 4 for rfc1002 length field */ /* 1 for implied bcc[0] */ size = pdu_size + 4 + 1; @@ -358,8 +364,6 @@ int ksmbd_conn_handler_loop(void *p) break; memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf)); - if (!ksmbd_smb_request(conn)) - break; /* * We already read 4 bytes to find out PDU size, now @@ -377,6 +381,15 @@ int ksmbd_conn_handler_loop(void *p) continue; } + if (!ksmbd_smb_request(conn)) + break; + + if (((struct smb2_hdr *)smb2_get_msg(conn->request_buf))->ProtocolId == + SMB2_PROTO_NUMBER) { + if (pdu_size < SMB2_MIN_SUPPORTED_HEADER_SIZE) + break; + } + if (!default_conn_ops.process_fn) { pr_err("No connection request callback\n"); break; diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index db181bdad73a..844b303baf29 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1415,56 +1415,38 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) */ struct lease_ctx_info *parse_lease_state(void *open_req) { - char *data_offset; struct create_context *cc; - unsigned int next = 0; - char *name; - bool found = false; struct smb2_create_req *req = (struct smb2_create_req *)open_req; - struct lease_ctx_info *lreq = kzalloc(sizeof(struct lease_ctx_info), - GFP_KERNEL); + struct lease_ctx_info *lreq; + + cc = smb2_find_context_vals(req, SMB2_CREATE_REQUEST_LEASE, 4); + if (IS_ERR_OR_NULL(cc)) + return NULL; + + lreq = kzalloc(sizeof(struct lease_ctx_info), GFP_KERNEL); if (!lreq) return NULL; - data_offset = (char *)req + le32_to_cpu(req->CreateContextsOffset); - cc = (struct create_context *)data_offset; - do { - cc = (struct create_context *)((char *)cc + next); - name = le16_to_cpu(cc->NameOffset) + (char *)cc; - if (le16_to_cpu(cc->NameLength) != 4 || - strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4)) { - next = le32_to_cpu(cc->Next); - continue; - } - found = true; - break; - } while (next != 0); + if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) { + struct create_lease_v2 *lc = (struct create_lease_v2 *)cc; - if (found) { - if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) { - struct create_lease_v2 *lc = (struct create_lease_v2 *)cc; - - memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); - lreq->req_state = lc->lcontext.LeaseState; - lreq->flags = lc->lcontext.LeaseFlags; - lreq->duration = lc->lcontext.LeaseDuration; - memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey, - SMB2_LEASE_KEY_SIZE); - lreq->version = 2; - } else { - struct create_lease *lc = (struct create_lease *)cc; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); + lreq->req_state = lc->lcontext.LeaseState; + lreq->flags = lc->lcontext.LeaseFlags; + lreq->duration = lc->lcontext.LeaseDuration; + memcpy(lreq->parent_lease_key, lc->lcontext.ParentLeaseKey, + SMB2_LEASE_KEY_SIZE); + lreq->version = 2; + } else { + struct create_lease *lc = (struct create_lease *)cc; - memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); - lreq->req_state = lc->lcontext.LeaseState; - lreq->flags = lc->lcontext.LeaseFlags; - lreq->duration = lc->lcontext.LeaseDuration; - lreq->version = 1; - } - return lreq; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); + lreq->req_state = lc->lcontext.LeaseState; + lreq->flags = lc->lcontext.LeaseFlags; + lreq->duration = lc->lcontext.LeaseDuration; + lreq->version = 1; } - - kfree(lreq); - return NULL; + return lreq; } /** diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index f9b2e0f19b03..ced7a9e916f0 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -185,24 +185,31 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, goto send; } - if (conn->ops->check_user_session) { - rc = conn->ops->check_user_session(work); - if (rc < 0) { - command = conn->ops->get_cmd_val(work); - conn->ops->set_rsp_status(work, - STATUS_USER_SESSION_DELETED); - goto send; - } else if (rc > 0) { - rc = conn->ops->get_ksmbd_tcon(work); + do { + if (conn->ops->check_user_session) { + rc = conn->ops->check_user_session(work); if (rc < 0) { - conn->ops->set_rsp_status(work, - STATUS_NETWORK_NAME_DELETED); + if (rc == -EINVAL) + conn->ops->set_rsp_status(work, + STATUS_INVALID_PARAMETER); + else + conn->ops->set_rsp_status(work, + STATUS_USER_SESSION_DELETED); goto send; + } else if (rc > 0) { + rc = conn->ops->get_ksmbd_tcon(work); + if (rc < 0) { + if (rc == -EINVAL) + conn->ops->set_rsp_status(work, + STATUS_INVALID_PARAMETER); + else + conn->ops->set_rsp_status(work, + STATUS_NETWORK_NAME_DELETED); + goto send; + } } } - } - do { rc = __process_request(work, conn, &command); if (rc == SERVER_HANDLER_ABORT) break; diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 0ffe663b7590..33b7e6c4ceff 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -351,9 +351,16 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) int command; __u32 clc_len; /* calculated length */ __u32 len = get_rfc1002_len(work->request_buf); + __u32 req_struct_size, next_cmd = le32_to_cpu(hdr->NextCommand); - if (le32_to_cpu(hdr->NextCommand) > 0) - len = le32_to_cpu(hdr->NextCommand); + if ((u64)work->next_smb2_rcv_hdr_off + next_cmd > len) { + pr_err("next command(%u) offset exceeds smb msg size\n", + next_cmd); + return 1; + } + + if (next_cmd > 0) + len = next_cmd; else if (work->next_smb2_rcv_hdr_off) len -= work->next_smb2_rcv_hdr_off; @@ -373,17 +380,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } if (smb2_req_struct_sizes[command] != pdu->StructureSize2) { - if (command != SMB2_OPLOCK_BREAK_HE && - (hdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) { - /* error packets have 9 byte structure size */ - ksmbd_debug(SMB, - "Illegal request size %u for command %d\n", - le16_to_cpu(pdu->StructureSize2), command); - return 1; - } else if (command == SMB2_OPLOCK_BREAK_HE && - hdr->Status == 0 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && - le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { + if (command == SMB2_OPLOCK_BREAK_HE && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { /* special case for SMB2.1 lease break message */ ksmbd_debug(SMB, "Illegal request size %d for oplock break\n", @@ -392,6 +391,14 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } } + req_struct_size = le16_to_cpu(pdu->StructureSize2) + + __SMB2_HEADER_STRUCTURE_SIZE; + if (command == SMB2_LOCK_HE) + req_struct_size -= sizeof(struct smb2_lock_element); + + if (req_struct_size > len + 1) + return 1; + if (smb2_calc_size(hdr, &clc_len)) return 1; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 7a81541de602..da1787c68ba0 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -91,7 +91,6 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) unsigned int cmd = le16_to_cpu(req_hdr->Command); int tree_id; - work->tcon = NULL; if (cmd == SMB2_TREE_CONNECT_HE || cmd == SMB2_CANCEL_HE || cmd == SMB2_LOGOFF_HE) { @@ -105,10 +104,28 @@ int smb2_get_ksmbd_tcon(struct ksmbd_work *work) } tree_id = le32_to_cpu(req_hdr->Id.SyncId.TreeId); + + /* + * If request is not the first in Compound request, + * Just validate tree id in header with work->tcon->id. + */ + if (work->next_smb2_rcv_hdr_off) { + if (!work->tcon) { + pr_err("The first operation in the compound does not have tcon\n"); + return -EINVAL; + } + if (work->tcon->id != tree_id) { + pr_err("tree id(%u) is different with id(%u) in first operation\n", + tree_id, work->tcon->id); + return -EINVAL; + } + return 1; + } + work->tcon = ksmbd_tree_conn_lookup(work->sess, tree_id); if (!work->tcon) { pr_err("Invalid tid %d\n", tree_id); - return -EINVAL; + return -ENOENT; } return 1; @@ -547,7 +564,6 @@ int smb2_check_user_session(struct ksmbd_work *work) unsigned int cmd = conn->ops->get_cmd_val(work); unsigned long long sess_id; - work->sess = NULL; /* * SMB2_ECHO, SMB2_NEGOTIATE, SMB2_SESSION_SETUP command do not * require a session id, so no need to validate user session's for @@ -558,15 +574,33 @@ int smb2_check_user_session(struct ksmbd_work *work) return 0; if (!ksmbd_conn_good(conn)) - return -EINVAL; + return -EIO; sess_id = le64_to_cpu(req_hdr->SessionId); + + /* + * If request is not the first in Compound request, + * Just validate session id in header with work->sess->id. + */ + if (work->next_smb2_rcv_hdr_off) { + if (!work->sess) { + pr_err("The first operation in the compound does not have sess\n"); + return -EINVAL; + } + if (work->sess->id != sess_id) { + pr_err("session id(%llu) is different with the first operation(%lld)\n", + sess_id, work->sess->id); + return -EINVAL; + } + return 1; + } + /* Check for validity of user session */ work->sess = ksmbd_session_lookup_all(conn, sess_id); if (work->sess) return 1; ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); - return -EINVAL; + return -ENOENT; } static void destroy_previous_session(struct ksmbd_conn *conn, @@ -963,13 +997,13 @@ static void decode_sign_cap_ctxt(struct ksmbd_conn *conn, static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, struct smb2_negotiate_req *req, - int len_of_smb) + unsigned int len_of_smb) { /* +4 is to account for the RFC1001 len field */ struct smb2_neg_context *pctx = (struct smb2_neg_context *)req; int i = 0, len_of_ctxts; - int offset = le32_to_cpu(req->NegotiateContextOffset); - int neg_ctxt_cnt = le16_to_cpu(req->NegotiateContextCount); + unsigned int offset = le32_to_cpu(req->NegotiateContextOffset); + unsigned int neg_ctxt_cnt = le16_to_cpu(req->NegotiateContextCount); __le32 status = STATUS_INVALID_PARAMETER; ksmbd_debug(SMB, "decoding %d negotiate contexts\n", neg_ctxt_cnt); @@ -983,7 +1017,7 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, while (i++ < neg_ctxt_cnt) { int clen, ctxt_len; - if (len_of_ctxts < sizeof(struct smb2_neg_context)) + if (len_of_ctxts < (int)sizeof(struct smb2_neg_context)) break; pctx = (struct smb2_neg_context *)((char *)pctx + offset); @@ -1038,9 +1072,8 @@ static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, } /* offsets must be 8 byte aligned */ - clen = (clen + 7) & ~0x7; - offset = clen + sizeof(struct smb2_neg_context); - len_of_ctxts -= clen + sizeof(struct smb2_neg_context); + offset = (ctxt_len + 7) & ~0x7; + len_of_ctxts -= offset; } return status; } @@ -2250,7 +2283,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* delete the EA only when it exits */ if (rc > 0) { rc = ksmbd_vfs_remove_xattr(idmap, - path->dentry, + path, attr_name); if (rc < 0) { @@ -2264,8 +2297,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, /* if the EA doesn't exist, just do nothing. */ rc = 0; } else { - rc = ksmbd_vfs_setxattr(idmap, - path->dentry, attr_name, value, + rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value, le16_to_cpu(eabuf->EaValueLength), 0); if (rc < 0) { ksmbd_debug(SMB, @@ -2322,8 +2354,7 @@ static noinline int smb2_set_stream_name_xattr(const struct path *path, return -EBADF; } - rc = ksmbd_vfs_setxattr(idmap, path->dentry, - xattr_stream_name, NULL, 0, 0); + rc = ksmbd_vfs_setxattr(idmap, path, xattr_stream_name, NULL, 0, 0); if (rc < 0) pr_err("Failed to store XATTR stream name :%d\n", rc); return 0; @@ -2351,7 +2382,7 @@ static int smb2_remove_smb_xattrs(const struct path *path) if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && !strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, path->dentry, + err = ksmbd_vfs_remove_xattr(idmap, path, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", @@ -2398,8 +2429,7 @@ static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, const struct path * da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), - path->dentry, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(mnt_idmap(path->mnt), path, &da); if (rc) ksmbd_debug(SMB, "failed to store file attribute into xattr\n"); } @@ -2973,7 +3003,7 @@ int smb2_open(struct ksmbd_work *work) struct inode *inode = d_inode(path.dentry); posix_acl_rc = ksmbd_vfs_inherit_posix_acl(idmap, - path.dentry, + &path, d_inode(path.dentry->d_parent)); if (posix_acl_rc) ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc); @@ -2989,7 +3019,7 @@ int smb2_open(struct ksmbd_work *work) if (rc) { if (posix_acl_rc) ksmbd_vfs_set_init_posix_acl(idmap, - path.dentry); + &path); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { @@ -3029,7 +3059,7 @@ int smb2_open(struct ksmbd_work *work) rc = ksmbd_vfs_set_sd_xattr(conn, idmap, - path.dentry, + &path, pntsd, pntsd_size); kfree(pntsd); @@ -5465,7 +5495,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; rc = ksmbd_vfs_setxattr(file_mnt_idmap(fp->filp), - fp->filp->f_path.dentry, + &fp->filp->f_path, xattr_stream_name, NULL, 0, 0); if (rc < 0) { @@ -5630,8 +5660,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | XATTR_DOSINFO_ITIME; - rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, - filp->f_path.dentry, &da); + rc = ksmbd_vfs_set_dos_attrib_xattr(idmap, &filp->f_path, &da); if (rc) ksmbd_debug(SMB, "failed to restore file attribute in EA\n"); @@ -7486,7 +7515,7 @@ static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, da.attr = le32_to_cpu(fp->f_ci->m_fattr); ret = ksmbd_vfs_set_dos_attrib_xattr(idmap, - fp->filp->f_path.dentry, &da); + &fp->filp->f_path, &da); if (ret) fp->f_ci->m_fattr = old_fattr; } diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c index af0c2a9b8529..569e5eecdf3d 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -158,7 +158,19 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) */ bool ksmbd_smb_request(struct ksmbd_conn *conn) { - return conn->request_buf[0] == 0; + __le32 *proto = (__le32 *)smb2_get_msg(conn->request_buf); + + if (*proto == SMB2_COMPRESSION_TRANSFORM_ID) { + pr_err_ratelimited("smb2 compression not support yet"); + return false; + } + + if (*proto != SMB1_PROTO_NUMBER && + *proto != SMB2_PROTO_NUMBER && + *proto != SMB2_TRANSFORM_PROTO_NUM) + return false; + + return true; } static bool supported_protocol(int idx) diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index 6d6cfb6957a9..ad919a4239d0 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1162,8 +1162,7 @@ pass: pntsd_size += sizeof(struct smb_acl) + nt_size; } - ksmbd_vfs_set_sd_xattr(conn, idmap, - path->dentry, pntsd, pntsd_size); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, pntsd_size); kfree(pntsd); } @@ -1290,7 +1289,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { posix_acls = get_inode_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); - if (posix_acls && !found) { + if (!IS_ERR_OR_NULL(posix_acls) && !found) { unsigned int id = -1; pa_entry = posix_acls->a_entries; @@ -1314,7 +1313,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, } } } - if (posix_acls) + if (!IS_ERR_OR_NULL(posix_acls)) posix_acl_release(posix_acls); } @@ -1383,7 +1382,7 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, newattrs.ia_valid |= ATTR_MODE; newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777); - ksmbd_vfs_remove_acl_xattrs(idmap, path->dentry); + ksmbd_vfs_remove_acl_xattrs(idmap, path); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { rc = set_posix_acl(idmap, path->dentry, @@ -1414,9 +1413,8 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { /* Update WinACL in xattr */ - ksmbd_vfs_remove_sd_xattrs(idmap, path->dentry); - ksmbd_vfs_set_sd_xattr(conn, idmap, - path->dentry, pntsd, ntsd_len); + ksmbd_vfs_remove_sd_xattrs(idmap, path); + ksmbd_vfs_set_sd_xattr(conn, idmap, path, pntsd, ntsd_len); } out: diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 6f302919e9f7..81489fdedd8e 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -170,6 +170,10 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) return err; } + err = mnt_want_write(path.mnt); + if (err) + goto out_err; + mode |= S_IFREG; err = vfs_create(mnt_idmap(path.mnt), d_inode(path.dentry), dentry, mode, true); @@ -179,6 +183,9 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) } else { pr_err("File(%s): creation failed (err:%d)\n", name, err); } + mnt_drop_write(path.mnt); + +out_err: done_path_create(&path, dentry); return err; } @@ -209,30 +216,35 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) return err; } + err = mnt_want_write(path.mnt); + if (err) + goto out_err2; + idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); - if (err) { - goto out; - } else if (d_unhashed(dentry)) { + if (!err && d_unhashed(dentry)) { struct dentry *d; d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, dentry->d_name.len); if (IS_ERR(d)) { err = PTR_ERR(d); - goto out; + goto out_err1; } if (unlikely(d_is_negative(d))) { dput(d); err = -ENOENT; - goto out; + goto out_err1; } ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); dput(d); } -out: + +out_err1: + mnt_drop_write(path.mnt); +out_err2: done_path_create(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); @@ -443,7 +455,7 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, memcpy(&stream_buf[*pos], buf, count); err = ksmbd_vfs_setxattr(idmap, - fp->filp->f_path.dentry, + &fp->filp->f_path, fp->stream.name, (void *)stream_buf, size, @@ -589,6 +601,10 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) goto out_err; } + err = mnt_want_write(path->mnt); + if (err) + goto out_err; + idmap = mnt_idmap(path->mnt); if (S_ISDIR(d_inode(path->dentry)->i_mode)) { err = vfs_rmdir(idmap, d_inode(parent), path->dentry); @@ -599,6 +615,7 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, const struct path *path) if (err) ksmbd_debug(VFS, "unlink failed, err %d\n", err); } + mnt_drop_write(path->mnt); out_err: ksmbd_revert_fsids(work); @@ -644,11 +661,16 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, goto out3; } + err = mnt_want_write(newpath.mnt); + if (err) + goto out3; + err = vfs_link(oldpath.dentry, mnt_idmap(newpath.mnt), d_inode(newpath.dentry), dentry, NULL); if (err) ksmbd_debug(VFS, "vfs_link failed err %d\n", err); + mnt_drop_write(newpath.mnt); out3: done_path_create(&newpath, dentry); @@ -694,6 +716,10 @@ retry: goto out2; } + err = mnt_want_write(old_path->mnt); + if (err) + goto out2; + trap = lock_rename_child(old_child, new_path.dentry); old_parent = dget(old_child->d_parent); @@ -757,6 +783,7 @@ out4: out3: dput(old_parent); unlock_rename(old_parent, new_path.dentry); + mnt_drop_write(old_path->mnt); out2: path_put(&new_path); @@ -897,19 +924,24 @@ ssize_t ksmbd_vfs_getxattr(struct mnt_idmap *idmap, * Return: 0 on success, otherwise error */ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *attr_name, + const struct path *path, const char *attr_name, void *attr_value, size_t attr_size, int flags) { int err; + err = mnt_want_write(path->mnt); + if (err) + return err; + err = vfs_setxattr(idmap, - dentry, + path->dentry, attr_name, attr_value, attr_size, flags); if (err) ksmbd_debug(VFS, "setxattr failed, err %d\n", err); + mnt_drop_write(path->mnt); return err; } @@ -1013,9 +1045,18 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, } int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, char *attr_name) + const struct path *path, char *attr_name) { - return vfs_removexattr(idmap, dentry, attr_name); + int err; + + err = mnt_want_write(path->mnt); + if (err) + return err; + + err = vfs_removexattr(idmap, path->dentry, attr_name); + mnt_drop_write(path->mnt); + + return err; } int ksmbd_vfs_unlink(struct file *filp) @@ -1024,6 +1065,10 @@ int ksmbd_vfs_unlink(struct file *filp) struct dentry *dir, *dentry = filp->f_path.dentry; struct mnt_idmap *idmap = file_mnt_idmap(filp); + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + dir = dget_parent(dentry); err = ksmbd_vfs_lock_parent(dir, dentry); if (err) @@ -1041,6 +1086,7 @@ int ksmbd_vfs_unlink(struct file *filp) ksmbd_debug(VFS, "failed to delete, err %d\n", err); out: dput(dir); + mnt_drop_write(filp->f_path.mnt); return err; } @@ -1244,13 +1290,13 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, } int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry) + const struct path *path) { char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; - xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; } else if (!xattr_list_len) { @@ -1258,6 +1304,10 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, goto out; } + err = mnt_want_write(path->mnt); + if (err) + goto out; + for (name = xattr_list; name - xattr_list < xattr_list_len; name += strlen(name) + 1) { ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); @@ -1266,25 +1316,26 @@ int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { - err = vfs_remove_acl(idmap, dentry, name); + err = vfs_remove_acl(idmap, path->dentry, name); if (err) ksmbd_debug(SMB, "remove acl xattr failed : %s\n", name); } } + mnt_drop_write(path->mnt); + out: kvfree(xattr_list); return err; } -int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry) +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path) { char *name, *xattr_list = NULL; ssize_t xattr_list_len; int err = 0; - xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; } else if (!xattr_list_len) { @@ -1297,7 +1348,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { - err = ksmbd_vfs_remove_xattr(idmap, dentry, name); + err = ksmbd_vfs_remove_xattr(idmap, path, name); if (err) ksmbd_debug(SMB, "remove xattr failed : %s\n", name); } @@ -1321,7 +1372,7 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct mnt_idmap *id return NULL; posix_acls = get_inode_acl(inode, acl_type); - if (!posix_acls) + if (IS_ERR_OR_NULL(posix_acls)) return NULL; smb_acl = kzalloc(sizeof(struct xattr_smb_acl) + @@ -1374,13 +1425,14 @@ out: int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct smb_ntsd *pntsd, int len) { int rc; struct ndr sd_ndr = {0}, acl_ndr = {0}; struct xattr_ntacl acl = {0}; struct xattr_smb_acl *smb_acl, *def_smb_acl = NULL; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); acl.version = 4; @@ -1432,7 +1484,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, goto out; } - rc = ksmbd_vfs_setxattr(idmap, dentry, + rc = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_SD, sd_ndr.data, sd_ndr.offset, 0); if (rc < 0) @@ -1522,7 +1574,7 @@ free_n_data: } int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct xattr_dos_attrib *da) { struct ndr n; @@ -1532,7 +1584,7 @@ int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, if (err) return err; - err = ksmbd_vfs_setxattr(idmap, dentry, XATTR_NAME_DOS_ATTRIBUTE, + err = ksmbd_vfs_setxattr(idmap, path, XATTR_NAME_DOS_ATTRIBUTE, (void *)n.data, n.offset, 0); if (err) ksmbd_debug(SMB, "failed to store dos attribute in xattr\n"); @@ -1769,10 +1821,11 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry) + struct path *path) { struct posix_acl_state acl_state; struct posix_acl *acls; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); int rc; @@ -1802,6 +1855,11 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, return -ENOMEM; } posix_state_to_acl(&acl_state, acls->a_entries); + + rc = mnt_want_write(path->mnt); + if (rc) + goto out_err; + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1813,16 +1871,20 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } + mnt_drop_write(path->mnt); + +out_err: free_acl_state(&acl_state); posix_acl_release(acls); return rc; } int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry, struct inode *parent_inode) + struct path *path, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; + struct dentry *dentry = path->dentry; struct inode *inode = d_inode(dentry); int rc, i; @@ -1830,7 +1892,7 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, return -EOPNOTSUPP; acls = get_inode_acl(parent_inode, ACL_TYPE_DEFAULT); - if (!acls) + if (IS_ERR_OR_NULL(acls)) return -ENOENT; pace = acls->a_entries; @@ -1841,6 +1903,10 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, } } + rc = mnt_want_write(path->mnt); + if (rc) + goto out_err; + rc = set_posix_acl(idmap, dentry, ACL_TYPE_ACCESS, acls); if (rc < 0) ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", @@ -1852,6 +1918,9 @@ int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", rc); } + mnt_drop_write(path->mnt); + +out_err: posix_acl_release(acls); return rc; } diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index a4ae89f3230d..8c0931d4d531 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -108,12 +108,12 @@ ssize_t ksmbd_vfs_casexattr_len(struct mnt_idmap *idmap, struct dentry *dentry, char *attr_name, int attr_name_len); int ksmbd_vfs_setxattr(struct mnt_idmap *idmap, - struct dentry *dentry, const char *attr_name, + const struct path *path, const char *attr_name, void *attr_value, size_t attr_size, int flags); int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, char *attr_name); + const struct path *path, char *attr_name); int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, bool caseless); @@ -139,26 +139,25 @@ void ksmbd_vfs_posix_lock_wait(struct file_lock *flock); int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout); void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock); int ksmbd_vfs_remove_acl_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry); -int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, - struct dentry *dentry); + const struct path *path); +int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path); int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct smb_ntsd *pntsd, int len); int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, struct mnt_idmap *idmap, struct dentry *dentry, struct smb_ntsd **pntsd); int ksmbd_vfs_set_dos_attrib_xattr(struct mnt_idmap *idmap, - struct dentry *dentry, + const struct path *path, struct xattr_dos_attrib *da); int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry); + struct path *path); int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct dentry *dentry, + struct path *path, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 2d0138e72d78..f41f8d6108ce 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -252,7 +252,7 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp) if (ksmbd_stream_fd(fp) && (ci->m_flags & S_DEL_ON_CLS_STREAM)) { ci->m_flags &= ~S_DEL_ON_CLS_STREAM; err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp), - filp->f_path.dentry, + &filp->f_path, fp->stream.name); if (err) pr_err("remove xattr failed : %s\n", diff --git a/fs/super.c b/fs/super.c index 34afe411cf2b..48c29954d487 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, @@ -595,7 +595,7 @@ retry: fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; - strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); @@ -674,7 +674,7 @@ retry: return ERR_PTR(err); } s->s_type = type; - strlcpy(s->s_id, type->name, sizeof(s->s_id)); + strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); @@ -903,6 +903,7 @@ int reconfigure_super(struct fs_context *fc) struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; + bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) @@ -920,7 +921,7 @@ int reconfigure_super(struct fs_context *fc) bdev_read_only(sb->s_bdev)) return -EACCES; #endif - + remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } @@ -943,13 +944,18 @@ int reconfigure_super(struct fs_context *fc) */ if (remount_ro) { if (force) { - sb->s_readonly_remount = 1; - smp_wmb(); + sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } + } else if (remount_rw) { + /* + * Protect filesystem's reconfigure code from writes from + * userspace until reconfigure finishes. + */ + sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { @@ -965,9 +971,7 @@ int reconfigure_super(struct fs_context *fc) WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); - /* Needs to be ordered wrt mnt_is_readonly() */ - smp_wmb(); - sb->s_readonly_remount = 0; + sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the @@ -982,7 +986,7 @@ int reconfigure_super(struct fs_context *fc) return 0; cancel_readonly: - sb->s_readonly_remount = 0; + sb_end_ro_state_change(sb); return retval; } diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index cdb3d632c63d..0140010aa0c3 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -52,7 +52,7 @@ static int sysv_handle_dirsync(struct inode *dir) } /* - * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the + * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the * rules documented in mm/highmem.rst. * * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_page() @@ -103,11 +103,11 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN)) { - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); return 0; } } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } return 0; } @@ -131,7 +131,7 @@ static inline int namecompare(int len, int maxlen, * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. * - * On Success put_and_unmap_page() should be called on *res_page. + * On Success unmap_and_put_page() should be called on *res_page. * * sysv_find_entry() acts as a call to dir_get_page() and must be treated * accordingly for nesting purposes. @@ -166,7 +166,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ name, de->name)) goto found; } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } if (++n >= npages) @@ -209,7 +209,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode) goto out_page; de++; } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } BUG(); return -EINVAL; @@ -228,7 +228,7 @@ got_it: mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_page: - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); return err; out_unlock: unlock_page(page); @@ -321,12 +321,12 @@ int sysv_empty_dir(struct inode * inode) if (de->name[1] != '.' || de->name[2]) goto not_empty; } - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); } return 1; not_empty: - put_and_unmap_page(page, kaddr); + unmap_and_put_page(page, kaddr); return 0; } @@ -352,7 +352,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } /* - * Calls to dir_get_page()/put_and_unmap_page() must be nested according to the + * Calls to dir_get_page()/unmap_and_put_page() must be nested according to the * rules documented in mm/highmem.rst. * * sysv_dotdot() acts as a call to dir_get_page() and must be treated @@ -376,7 +376,7 @@ ino_t sysv_inode_by_name(struct dentry *dentry) if (de) { res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - put_and_unmap_page(page, de); + unmap_and_put_page(page, de); } return res; } diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index b22764fe669c..58d7f43a1371 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -145,6 +145,10 @@ static int alloc_branch(struct inode *inode, */ parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key); bh = sb_getblk(inode->i_sb, parent); + if (!bh) { + sysv_free_block(inode->i_sb, branch[n].key); + break; + } lock_buffer(bh); memset(bh->b_data, 0, blocksize); branch[n].bh = bh; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 2b2dba4c4f56..fcf163fea3ad 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -164,7 +164,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) inode->i_ctime = dir->i_ctime; inode_dec_link_count(inode); } - put_and_unmap_page(page, de); + unmap_and_put_page(page, de); return err; } @@ -227,7 +227,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!new_de) goto out_dir; err = sysv_set_link(new_de, new_page, old_inode); - put_and_unmap_page(new_page, new_de); + unmap_and_put_page(new_page, new_de); if (err) goto out_dir; new_inode->i_ctime = current_time(new_inode); @@ -256,9 +256,9 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, out_dir: if (dir_de) - put_and_unmap_page(dir_page, dir_de); + unmap_and_put_page(dir_page, dir_de); out_old: - put_and_unmap_page(old_page, old_de); + unmap_and_put_page(old_page, old_de); out: return err; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index fd20423d3ed2..fd29a66e7241 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -793,11 +793,6 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!empty_dir(new_inode)) goto out_oiter; } - /* - * We need to protect against old_inode getting converted from - * ICB to normal directory. - */ - inode_lock_nested(old_inode, I_MUTEX_NONDIR2); retval = udf_fiiter_find_entry(old_inode, &dotdot_name, &diriter); if (retval == -ENOENT) { @@ -806,10 +801,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, old_inode->i_ino); retval = -EFSCORRUPTED; } - if (retval) { - inode_unlock(old_inode); + if (retval) goto out_oiter; - } has_diriter = true; tloc = lelb_to_cpu(diriter.fi.icb.extLocation); if (udf_get_lb_pblock(old_inode->i_sb, &tloc, 0) != @@ -889,7 +882,6 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, udf_dir_entry_len(&diriter.fi)); udf_fiiter_write_fi(&diriter, NULL); udf_fiiter_release(&diriter); - inode_unlock(old_inode); inode_dec_link_count(old_dir); if (new_inode) @@ -901,10 +893,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, } return 0; out_oiter: - if (has_diriter) { + if (has_diriter) udf_fiiter_release(&diriter); - inode_unlock(old_inode); - } udf_fiiter_release(&oiter); return retval; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0fd96d6e39ce..4e800bb7d2ab 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1332,6 +1332,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool basic_ioctls; unsigned long start, end, vma_end; struct vma_iterator vmi; + pgoff_t pgoff; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1459,6 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_iter_set(&vmi, start); prev = vma_prev(&vmi); + if (vma->vm_start < start) + prev = vma; ret = 0; for_each_vma_range(vmi, vma, end) { @@ -1482,8 +1485,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); @@ -1563,6 +1567,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; struct vma_iterator vmi; + pgoff_t pgoff; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1625,6 +1630,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, vma_iter_set(&vmi, start); prev = vma_prev(&vmi); + if (vma->vm_start < start) + prev = vma; + ret = 0; for_each_vma_range(vmi, vma, end) { cond_resched(); @@ -1662,8 +1670,9 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); if (prev) { diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig index a7ffd718f171..e1036e535352 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -39,14 +39,14 @@ config FS_VERITY_BUILTIN_SIGNATURES depends on FS_VERITY select SYSTEM_DATA_VERIFICATION help - Support verifying signatures of verity files against the X.509 - certificates that have been loaded into the ".fs-verity" - kernel keyring. + This option adds support for in-kernel verification of + fs-verity builtin signatures. - This is meant as a relatively simple mechanism that can be - used to provide an authenticity guarantee for verity files, as - an alternative to IMA appraisal. Userspace programs still - need to check that the verity bit is set in order to get an - authenticity guarantee. + Please take great care before using this feature. It is not + the only way to do signatures with fs-verity, and the + alternatives (such as userspace signature verification, and + IMA appraisal) can be much better. For details about the + limitations of this feature, see + Documentation/filesystems/fsverity.rst. If unsure, say N. diff --git a/fs/verity/enable.c b/fs/verity/enable.c index fc4c50e5219d..c284f46d1b53 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -7,6 +7,7 @@ #include "fsverity_private.h" +#include <crypto/hash.h> #include <linux/mount.h> #include <linux/sched/signal.h> #include <linux/uaccess.h> @@ -20,7 +21,7 @@ struct block_buffer { /* Hash a block, writing the result to the next level's pending block buffer. */ static int hash_one_block(struct inode *inode, const struct merkle_tree_params *params, - struct ahash_request *req, struct block_buffer *cur) + struct block_buffer *cur) { struct block_buffer *next = cur + 1; int err; @@ -36,8 +37,7 @@ static int hash_one_block(struct inode *inode, /* Zero-pad the block if it's shorter than the block size. */ memset(&cur->data[cur->filled], 0, params->block_size - cur->filled); - err = fsverity_hash_block(params, inode, req, virt_to_page(cur->data), - offset_in_page(cur->data), + err = fsverity_hash_block(params, inode, cur->data, &next->data[next->filled]); if (err) return err; @@ -76,7 +76,6 @@ static int build_merkle_tree(struct file *filp, struct inode *inode = file_inode(filp); const u64 data_size = inode->i_size; const int num_levels = params->num_levels; - struct ahash_request *req; struct block_buffer _buffers[1 + FS_VERITY_MAX_LEVELS + 1] = {}; struct block_buffer *buffers = &_buffers[1]; unsigned long level_offset[FS_VERITY_MAX_LEVELS]; @@ -90,9 +89,6 @@ static int build_merkle_tree(struct file *filp, return 0; } - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(params->hash_alg, GFP_KERNEL); - /* * Allocate the block buffers. Buffer "-1" is for data blocks. * Buffers 0 <= level < num_levels are for the actual tree levels. @@ -130,7 +126,7 @@ static int build_merkle_tree(struct file *filp, fsverity_err(inode, "Short read of file data"); goto out; } - err = hash_one_block(inode, params, req, &buffers[-1]); + err = hash_one_block(inode, params, &buffers[-1]); if (err) goto out; for (level = 0; level < num_levels; level++) { @@ -141,8 +137,7 @@ static int build_merkle_tree(struct file *filp, } /* Next block at @level is full */ - err = hash_one_block(inode, params, req, - &buffers[level]); + err = hash_one_block(inode, params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -162,8 +157,7 @@ static int build_merkle_tree(struct file *filp, /* Finish all nonempty pending tree blocks. */ for (level = 0; level < num_levels; level++) { if (buffers[level].filled != 0) { - err = hash_one_block(inode, params, req, - &buffers[level]); + err = hash_one_block(inode, params, &buffers[level]); if (err) goto out; err = write_merkle_tree_block(inode, @@ -183,7 +177,6 @@ static int build_merkle_tree(struct file *filp, out: for (level = -1; level < num_levels; level++) kfree(buffers[level].data); - fsverity_free_hash_request(params->hash_alg, req); return err; } @@ -215,7 +208,7 @@ static int enable_verity(struct file *filp, } desc->salt_size = arg->salt_size; - /* Get the signature if the user provided one */ + /* Get the builtin signature if the user provided one */ if (arg->sig_size && copy_from_user(desc->signature, u64_to_user_ptr(arg->sig_ptr), arg->sig_size)) { diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index d34dcc033d72..49bf3a1eb2a0 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -11,9 +11,6 @@ #define pr_fmt(fmt) "fs-verity: " fmt #include <linux/fsverity.h> -#include <linux/mempool.h> - -struct ahash_request; /* * Implementation limit: maximum depth of the Merkle tree. For now 8 is plenty; @@ -23,11 +20,10 @@ struct ahash_request; /* A hash algorithm supported by fs-verity */ struct fsverity_hash_alg { - struct crypto_ahash *tfm; /* hash tfm, allocated on demand */ + struct crypto_shash *tfm; /* hash tfm, allocated on demand */ const char *name; /* crypto API name, e.g. sha256 */ unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */ unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */ - mempool_t req_pool; /* mempool with a preallocated hash request */ /* * The HASH_ALGO_* constant for this algorithm. This is different from * FS_VERITY_HASH_ALG_*, which uses a different numbering scheme. @@ -37,7 +33,7 @@ struct fsverity_hash_alg { /* Merkle tree parameters: hash algorithm, initial hash state, and topology */ struct merkle_tree_params { - struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ + const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */ const u8 *hashstate; /* initial hash state or NULL */ unsigned int digest_size; /* same as hash_alg->digest_size */ unsigned int block_size; /* size of data and tree blocks */ @@ -83,18 +79,13 @@ struct fsverity_info { extern struct fsverity_hash_alg fsverity_hash_algs[]; -struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, - unsigned int num); -struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg, - gfp_t gfp_flags); -void fsverity_free_hash_request(struct fsverity_hash_alg *alg, - struct ahash_request *req); -const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num); +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size); int fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, struct ahash_request *req, - struct page *page, unsigned int offset, u8 *out); -int fsverity_hash_buffer(struct fsverity_hash_alg *alg, + const struct inode *inode, const void *data, u8 *out); +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out); void __init fsverity_check_hash_algs(void); diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index ea00dbedf756..c598d2035476 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -8,7 +8,6 @@ #include "fsverity_private.h" #include <crypto/hash.h> -#include <linux/scatterlist.h> /* The hash algorithms supported by fs-verity */ struct fsverity_hash_alg fsverity_hash_algs[] = { @@ -40,11 +39,11 @@ static DEFINE_MUTEX(fsverity_hash_alg_init_mutex); * * Return: pointer to the hash alg on success, else an ERR_PTR() */ -struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, - unsigned int num) +const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, + unsigned int num) { struct fsverity_hash_alg *alg; - struct crypto_ahash *tfm; + struct crypto_shash *tfm; int err; if (num >= ARRAY_SIZE(fsverity_hash_algs) || @@ -63,11 +62,7 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, if (alg->tfm != NULL) goto out_unlock; - /* - * Using the shash API would make things a bit simpler, but the ahash - * API is preferable as it allows the use of crypto accelerators. - */ - tfm = crypto_alloc_ahash(alg->name, 0, 0); + tfm = crypto_alloc_shash(alg->name, 0, 0); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { fsverity_warn(inode, @@ -84,26 +79,20 @@ struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, } err = -EINVAL; - if (WARN_ON_ONCE(alg->digest_size != crypto_ahash_digestsize(tfm))) + if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm))) goto err_free_tfm; - if (WARN_ON_ONCE(alg->block_size != crypto_ahash_blocksize(tfm))) - goto err_free_tfm; - - err = mempool_init_kmalloc_pool(&alg->req_pool, 1, - sizeof(struct ahash_request) + - crypto_ahash_reqsize(tfm)); - if (err) + if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm))) goto err_free_tfm; pr_info("%s using implementation \"%s\"\n", - alg->name, crypto_ahash_driver_name(tfm)); + alg->name, crypto_shash_driver_name(tfm)); /* pairs with smp_load_acquire() above */ smp_store_release(&alg->tfm, tfm); goto out_unlock; err_free_tfm: - crypto_free_ahash(tfm); + crypto_free_shash(tfm); alg = ERR_PTR(err); out_unlock: mutex_unlock(&fsverity_hash_alg_init_mutex); @@ -111,42 +100,6 @@ out_unlock: } /** - * fsverity_alloc_hash_request() - allocate a hash request object - * @alg: the hash algorithm for which to allocate the request - * @gfp_flags: memory allocation flags - * - * This is mempool-backed, so this never fails if __GFP_DIRECT_RECLAIM is set in - * @gfp_flags. However, in that case this might need to wait for all - * previously-allocated requests to be freed. So to avoid deadlocks, callers - * must never need multiple requests at a time to make forward progress. - * - * Return: the request object on success; NULL on failure (but see above) - */ -struct ahash_request *fsverity_alloc_hash_request(struct fsverity_hash_alg *alg, - gfp_t gfp_flags) -{ - struct ahash_request *req = mempool_alloc(&alg->req_pool, gfp_flags); - - if (req) - ahash_request_set_tfm(req, alg->tfm); - return req; -} - -/** - * fsverity_free_hash_request() - free a hash request object - * @alg: the hash algorithm - * @req: the hash request object to free - */ -void fsverity_free_hash_request(struct fsverity_hash_alg *alg, - struct ahash_request *req) -{ - if (req) { - ahash_request_zero(req); - mempool_free(req, &alg->req_pool); - } -} - -/** * fsverity_prepare_hash_state() - precompute the initial hash state * @alg: hash algorithm * @salt: a salt which is to be prepended to all data to be hashed @@ -155,27 +108,24 @@ void fsverity_free_hash_request(struct fsverity_hash_alg *alg, * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed * initial hash state on success or an ERR_PTR() on failure. */ -const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, +const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size) { u8 *hashstate = NULL; - struct ahash_request *req = NULL; + SHASH_DESC_ON_STACK(desc, alg->tfm); u8 *padded_salt = NULL; size_t padded_salt_size; - struct scatterlist sg; - DECLARE_CRYPTO_WAIT(wait); int err; + desc->tfm = alg->tfm; + if (salt_size == 0) return NULL; - hashstate = kmalloc(crypto_ahash_statesize(alg->tfm), GFP_KERNEL); + hashstate = kmalloc(crypto_shash_statesize(alg->tfm), GFP_KERNEL); if (!hashstate) return ERR_PTR(-ENOMEM); - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(alg, GFP_KERNEL); - /* * Zero-pad the salt to the next multiple of the input size of the hash * algorithm's compression function, e.g. 64 bytes for SHA-256 or 128 @@ -190,26 +140,18 @@ const u8 *fsverity_prepare_hash_state(struct fsverity_hash_alg *alg, goto err_free; } memcpy(padded_salt, salt, salt_size); - - sg_init_one(&sg, padded_salt, padded_salt_size); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, NULL, padded_salt_size); - - err = crypto_wait_req(crypto_ahash_init(req), &wait); + err = crypto_shash_init(desc); if (err) goto err_free; - err = crypto_wait_req(crypto_ahash_update(req), &wait); + err = crypto_shash_update(desc, padded_salt, padded_salt_size); if (err) goto err_free; - err = crypto_ahash_export(req, hashstate); + err = crypto_shash_export(desc, hashstate); if (err) goto err_free; out: - fsverity_free_hash_request(alg, req); kfree(padded_salt); return hashstate; @@ -223,9 +165,7 @@ err_free: * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters * @inode: inode for which the hashing is being done - * @req: preallocated hash request - * @page: the page containing the block to hash - * @offset: the offset of the block within @page + * @data: virtual address of a buffer containing the block to hash * @out: output digest, size 'params->digest_size' bytes * * Hash a single data or hash block. The hash is salted if a salt is specified @@ -234,33 +174,24 @@ err_free: * Return: 0 on success, -errno on failure */ int fsverity_hash_block(const struct merkle_tree_params *params, - const struct inode *inode, struct ahash_request *req, - struct page *page, unsigned int offset, u8 *out) + const struct inode *inode, const void *data, u8 *out) { - struct scatterlist sg; - DECLARE_CRYPTO_WAIT(wait); + SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm); int err; - sg_init_table(&sg, 1); - sg_set_page(&sg, page, params->block_size, offset); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, out, params->block_size); + desc->tfm = params->hash_alg->tfm; if (params->hashstate) { - err = crypto_ahash_import(req, params->hashstate); + err = crypto_shash_import(desc, params->hashstate); if (err) { fsverity_err(inode, "Error %d importing hash state", err); return err; } - err = crypto_ahash_finup(req); + err = crypto_shash_finup(desc, data, params->block_size, out); } else { - err = crypto_ahash_digest(req); + err = crypto_shash_digest(desc, data, params->block_size, out); } - - err = crypto_wait_req(err, &wait); if (err) fsverity_err(inode, "Error %d computing block hash", err); return err; @@ -273,32 +204,12 @@ int fsverity_hash_block(const struct merkle_tree_params *params, * @size: size of data to hash, in bytes * @out: output digest, size 'alg->digest_size' bytes * - * Hash some data which is located in physically contiguous memory (i.e. memory - * allocated by kmalloc(), not by vmalloc()). No salt is used. - * * Return: 0 on success, -errno on failure */ -int fsverity_hash_buffer(struct fsverity_hash_alg *alg, +int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out) { - struct ahash_request *req; - struct scatterlist sg; - DECLARE_CRYPTO_WAIT(wait); - int err; - - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(alg, GFP_KERNEL); - - sg_init_one(&sg, data, size); - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - ahash_request_set_crypt(req, &sg, out, size); - - err = crypto_wait_req(crypto_ahash_digest(req), &wait); - - fsverity_free_hash_request(alg, req); - return err; + return crypto_shash_tfm_digest(alg->tfm, data, size, out); } void __init fsverity_check_hash_algs(void) diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 5c79ea1b2468..eec5956141da 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -61,27 +61,42 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure); /** * fsverity_get_digest() - get a verity file's digest * @inode: inode to get digest of - * @digest: (out) pointer to the digest - * @alg: (out) pointer to the hash algorithm enumeration + * @raw_digest: (out) the raw file digest + * @alg: (out) the digest's algorithm, as a FS_VERITY_HASH_ALG_* value + * @halg: (out) the digest's algorithm, as a HASH_ALGO_* value * - * Return the file hash algorithm and digest of an fsverity protected file. - * Assumption: before calling this, the file must have been opened. + * Retrieves the fsverity digest of the given file. The file must have been + * opened at least once since the inode was last loaded into the inode cache; + * otherwise this function will not recognize when fsverity is enabled. * - * Return: 0 on success, -errno on failure + * The file's fsverity digest consists of @raw_digest in combination with either + * @alg or @halg. (The caller can choose which one of @alg or @halg to use.) + * + * IMPORTANT: Callers *must* make use of one of the two algorithm IDs, since + * @raw_digest is meaningless without knowing which algorithm it uses! fsverity + * provides no security guarantee for users who ignore the algorithm ID, even if + * they use the digest size (since algorithms can share the same digest size). + * + * Return: The size of the raw digest in bytes, or 0 if the file doesn't have + * fsverity enabled. */ int fsverity_get_digest(struct inode *inode, - u8 digest[FS_VERITY_MAX_DIGEST_SIZE], - enum hash_algo *alg) + u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE], + u8 *alg, enum hash_algo *halg) { const struct fsverity_info *vi; const struct fsverity_hash_alg *hash_alg; vi = fsverity_get_info(inode); if (!vi) - return -ENODATA; /* not a verity file */ + return 0; /* not a verity file */ hash_alg = vi->tree_params.hash_alg; - memcpy(digest, vi->file_digest, hash_alg->digest_size); - *alg = hash_alg->algo_id; - return 0; + memcpy(raw_digest, vi->file_digest, hash_alg->digest_size); + if (alg) + *alg = hash_alg - fsverity_hash_algs; + if (halg) + *halg = hash_alg->algo_id; + return hash_alg->digest_size; } +EXPORT_SYMBOL_GPL(fsverity_get_digest); diff --git a/fs/verity/open.c b/fs/verity/open.c index 52048b7630dc..1db5106a9c38 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -32,7 +32,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, unsigned int log_blocksize, const u8 *salt, size_t salt_size) { - struct fsverity_hash_alg *hash_alg; + const struct fsverity_hash_alg *hash_alg; int err; u64 blocks; u64 blocks_in_level[FS_VERITY_MAX_LEVELS]; @@ -156,9 +156,9 @@ out_err: /* * Compute the file digest by hashing the fsverity_descriptor excluding the - * signature and with the sig_size field set to 0. + * builtin signature and with the sig_size field set to 0. */ -static int compute_file_digest(struct fsverity_hash_alg *hash_alg, +static int compute_file_digest(const struct fsverity_hash_alg *hash_alg, struct fsverity_descriptor *desc, u8 *file_digest) { @@ -174,7 +174,7 @@ static int compute_file_digest(struct fsverity_hash_alg *hash_alg, /* * Create a new fsverity_info from the given fsverity_descriptor (with optional - * appended signature), and check the signature if present. The + * appended builtin signature), and check the signature if present. The * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, @@ -319,8 +319,8 @@ static bool validate_fsverity_descriptor(struct inode *inode, } /* - * Read the inode's fsverity_descriptor (with optional appended signature) from - * the filesystem, and do basic validation of it. + * Read the inode's fsverity_descriptor (with optional appended builtin + * signature) from the filesystem, and do basic validation of it. */ int fsverity_get_descriptor(struct inode *inode, struct fsverity_descriptor **desc_ret) diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c index 2aefc5565152..f58432772d9e 100644 --- a/fs/verity/read_metadata.c +++ b/fs/verity/read_metadata.c @@ -105,7 +105,7 @@ static int fsverity_read_descriptor(struct inode *inode, if (res) return res; - /* don't include the signature */ + /* don't include the builtin signature */ desc_size = offsetof(struct fsverity_descriptor, signature); desc->sig_size = 0; @@ -131,7 +131,7 @@ static int fsverity_read_signature(struct inode *inode, } /* - * Include only the signature. Note that fsverity_get_descriptor() + * Include only the builtin signature. fsverity_get_descriptor() * already verified that sig_size is in-bounds. */ res = fsverity_read_buffer(buf, offset, length, desc->signature, diff --git a/fs/verity/signature.c b/fs/verity/signature.c index b8c51ad40d3a..72034bc71c9d 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -5,6 +5,14 @@ * Copyright 2019 Google LLC */ +/* + * This file implements verification of fs-verity builtin signatures. Please + * take great care before using this feature. It is not the only way to do + * signatures with fs-verity, and the alternatives (such as userspace signature + * verification, and IMA appraisal) can be much better. For details about the + * limitations of this feature, see Documentation/filesystems/fsverity.rst. + */ + #include "fsverity_private.h" #include <linux/cred.h> diff --git a/fs/verity/verify.c b/fs/verity/verify.c index e2508222750b..433cef51f5f6 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -12,38 +12,6 @@ static struct workqueue_struct *fsverity_read_workqueue; -static inline int cmp_hashes(const struct fsverity_info *vi, - const u8 *want_hash, const u8 *real_hash, - u64 data_pos, int level) -{ - const unsigned int hsize = vi->tree_params.digest_size; - - if (memcmp(want_hash, real_hash, hsize) == 0) - return 0; - - fsverity_err(vi->inode, - "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", - data_pos, level, - vi->tree_params.hash_alg->name, hsize, want_hash, - vi->tree_params.hash_alg->name, hsize, real_hash); - return -EBADMSG; -} - -static bool data_is_zeroed(struct inode *inode, struct page *page, - unsigned int len, unsigned int offset) -{ - void *virt = kmap_local_page(page); - - if (memchr_inv(virt + offset, 0, len)) { - kunmap_local(virt); - fsverity_err(inode, - "FILE CORRUPTED! Data past EOF is not zeroed"); - return false; - } - kunmap_local(virt); - return true; -} - /* * Returns true if the hash block with index @hblock_idx in the tree, located in * @hpage, has already been verified. @@ -122,9 +90,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage, */ static bool verify_data_block(struct inode *inode, struct fsverity_info *vi, - struct ahash_request *req, struct page *data_page, - u64 data_pos, unsigned int dblock_offset_in_page, - unsigned long max_ra_pages) + const void *data, u64 data_pos, unsigned long max_ra_pages) { const struct merkle_tree_params *params = &vi->tree_params; const unsigned int hsize = params->digest_size; @@ -136,11 +102,11 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, struct { /* Page containing the hash block */ struct page *page; + /* Mapped address of the hash block (will be within @page) */ + const void *addr; /* Index of the hash block in the tree overall */ unsigned long index; - /* Byte offset of the hash block within @page */ - unsigned int offset_in_page; - /* Byte offset of the wanted hash within @page */ + /* Byte offset of the wanted hash relative to @addr */ unsigned int hoffset; } hblocks[FS_VERITY_MAX_LEVELS]; /* @@ -148,7 +114,9 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * index of that block's hash within the current level. */ u64 hidx = data_pos >> params->log_blocksize; - int err; + + /* Up to 1 + FS_VERITY_MAX_LEVELS pages may be mapped at once */ + BUILD_BUG_ON(1 + FS_VERITY_MAX_LEVELS > KM_MAX_IDX); if (unlikely(data_pos >= inode->i_size)) { /* @@ -159,8 +127,12 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, * any part past EOF should be all zeroes. Therefore, we need * to verify that any data blocks fully past EOF are all zeroes. */ - return data_is_zeroed(inode, data_page, params->block_size, - dblock_offset_in_page); + if (memchr_inv(data, 0, params->block_size)) { + fsverity_err(inode, + "FILE CORRUPTED! Data past EOF is not zeroed"); + return false; + } + return true; } /* @@ -175,6 +147,7 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, unsigned int hblock_offset_in_page; unsigned int hoffset; struct page *hpage; + const void *haddr; /* * The index of the block in the current level; also the index @@ -192,30 +165,30 @@ verify_data_block(struct inode *inode, struct fsverity_info *vi, hblock_offset_in_page = (hblock_idx << params->log_blocksize) & ~PAGE_MASK; - /* Byte offset of the hash within the page */ - hoffset = hblock_offset_in_page + - ((hidx << params->log_digestsize) & - (params->block_size - 1)); + /* Byte offset of the hash within the block */ + hoffset = (hidx << params->log_digestsize) & + (params->block_size - 1); hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode, hpage_idx, level == 0 ? min(max_ra_pages, params->tree_pages - hpage_idx) : 0); if (IS_ERR(hpage)) { - err = PTR_ERR(hpage); fsverity_err(inode, - "Error %d reading Merkle tree page %lu", - err, hpage_idx); - goto out; + "Error %ld reading Merkle tree page %lu", + PTR_ERR(hpage), hpage_idx); + goto error; } + haddr = kmap_local_page(hpage) + hblock_offset_in_page; if (is_hash_block_verified(vi, hpage, hblock_idx)) { - memcpy_from_page(_want_hash, hpage, hoffset, hsize); + memcpy(_want_hash, haddr + hoffset, hsize); want_hash = _want_hash; + kunmap_local(haddr); put_page(hpage); goto descend; } hblocks[level].page = hpage; + hblocks[level].addr = haddr; hblocks[level].index = hblock_idx; - hblocks[level].offset_in_page = hblock_offset_in_page; hblocks[level].hoffset = hoffset; hidx = next_hidx; } @@ -225,18 +198,14 @@ descend: /* Descend the tree verifying hash blocks. */ for (; level > 0; level--) { struct page *hpage = hblocks[level - 1].page; + const void *haddr = hblocks[level - 1].addr; unsigned long hblock_idx = hblocks[level - 1].index; - unsigned int hblock_offset_in_page = - hblocks[level - 1].offset_in_page; unsigned int hoffset = hblocks[level - 1].hoffset; - err = fsverity_hash_block(params, inode, req, hpage, - hblock_offset_in_page, real_hash); - if (err) - goto out; - err = cmp_hashes(vi, want_hash, real_hash, data_pos, level - 1); - if (err) - goto out; + if (fsverity_hash_block(params, inode, haddr, real_hash) != 0) + goto error; + if (memcmp(want_hash, real_hash, hsize) != 0) + goto corrupted; /* * Mark the hash block as verified. This must be atomic and * idempotent, as the same hash block might be verified by @@ -246,29 +215,39 @@ descend: set_bit(hblock_idx, vi->hash_block_verified); else SetPageChecked(hpage); - memcpy_from_page(_want_hash, hpage, hoffset, hsize); + memcpy(_want_hash, haddr + hoffset, hsize); want_hash = _want_hash; + kunmap_local(haddr); put_page(hpage); } /* Finally, verify the data block. */ - err = fsverity_hash_block(params, inode, req, data_page, - dblock_offset_in_page, real_hash); - if (err) - goto out; - err = cmp_hashes(vi, want_hash, real_hash, data_pos, -1); -out: - for (; level > 0; level--) - put_page(hblocks[level - 1].page); + if (fsverity_hash_block(params, inode, data, real_hash) != 0) + goto error; + if (memcmp(want_hash, real_hash, hsize) != 0) + goto corrupted; + return true; - return err == 0; +corrupted: + fsverity_err(inode, + "FILE CORRUPTED! pos=%llu, level=%d, want_hash=%s:%*phN, real_hash=%s:%*phN", + data_pos, level - 1, + params->hash_alg->name, hsize, want_hash, + params->hash_alg->name, hsize, real_hash); +error: + for (; level > 0; level--) { + kunmap_local(hblocks[level - 1].addr); + put_page(hblocks[level - 1].page); + } + return false; } static bool -verify_data_blocks(struct inode *inode, struct fsverity_info *vi, - struct ahash_request *req, struct folio *data_folio, - size_t len, size_t offset, unsigned long max_ra_pages) +verify_data_blocks(struct folio *data_folio, size_t len, size_t offset, + unsigned long max_ra_pages) { + struct inode *inode = data_folio->mapping->host; + struct fsverity_info *vi = inode->i_verity_info; const unsigned int block_size = vi->tree_params.block_size; u64 pos = (u64)data_folio->index << PAGE_SHIFT; @@ -278,11 +257,14 @@ verify_data_blocks(struct inode *inode, struct fsverity_info *vi, folio_test_uptodate(data_folio))) return false; do { - struct page *data_page = - folio_page(data_folio, offset >> PAGE_SHIFT); - - if (!verify_data_block(inode, vi, req, data_page, pos + offset, - offset & ~PAGE_MASK, max_ra_pages)) + void *data; + bool valid; + + data = kmap_local_folio(data_folio, offset); + valid = verify_data_block(inode, vi, data, pos + offset, + max_ra_pages); + kunmap_local(data); + if (!valid) return false; offset += block_size; len -= block_size; @@ -304,19 +286,7 @@ verify_data_blocks(struct inode *inode, struct fsverity_info *vi, */ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset) { - struct inode *inode = folio->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; - struct ahash_request *req; - bool valid; - - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - - valid = verify_data_blocks(inode, vi, req, folio, len, offset, 0); - - fsverity_free_hash_request(vi->tree_params.hash_alg, req); - - return valid; + return verify_data_blocks(folio, len, offset, 0); } EXPORT_SYMBOL_GPL(fsverity_verify_blocks); @@ -337,15 +307,9 @@ EXPORT_SYMBOL_GPL(fsverity_verify_blocks); */ void fsverity_verify_bio(struct bio *bio) { - struct inode *inode = bio_first_page_all(bio)->mapping->host; - struct fsverity_info *vi = inode->i_verity_info; - struct ahash_request *req; struct folio_iter fi; unsigned long max_ra_pages = 0; - /* This allocation never fails, since it's mempool-backed. */ - req = fsverity_alloc_hash_request(vi->tree_params.hash_alg, GFP_NOFS); - if (bio->bi_opf & REQ_RAHEAD) { /* * If this bio is for data readahead, then we also do readahead @@ -360,14 +324,12 @@ void fsverity_verify_bio(struct bio *bio) } bio_for_each_folio_all(fi, bio) { - if (!verify_data_blocks(inode, vi, req, fi.folio, fi.length, - fi.offset, max_ra_pages)) { + if (!verify_data_blocks(fi.folio, fi.length, fi.offset, + max_ra_pages)) { bio->bi_status = BLK_STS_IOERR; break; } } - - fsverity_free_hash_request(vi->tree_params.hash_alg, req); } EXPORT_SYMBOL_GPL(fsverity_verify_bio); #endif /* CONFIG_BLOCK */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 9b373a0c7aaf..ee84835ebc66 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -984,7 +984,10 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true); + err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + true); + if (err2) + goto resv_err; /* * Roll the transaction before trying to re-init the per-ag diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index fdfa08cbf4db..c20fe99405d8 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -628,6 +628,25 @@ xfs_alloc_fixup_trees( return 0; } +/* + * We do not verify the AGFL contents against AGF-based index counters here, + * even though we may have access to the perag that contains shadow copies. We + * don't know if the AGF based counters have been checked, and if they have they + * still may be inconsistent because they haven't yet been reset on the first + * allocation after the AGF has been read in. + * + * This means we can only check that all agfl entries contain valid or null + * values because we can't reliably determine the active range to exclude + * NULLAGBNO as a valid value. + * + * However, we can't even do that for v4 format filesystems because there are + * old versions of mkfs out there that does not initialise the AGFL to known, + * verifiable values. HEnce we can't tell the difference between a AGFL block + * allocated by mkfs and a corrupted AGFL block here on v4 filesystems. + * + * As a result, we can only fully validate AGFL block numbers when we pull them + * from the freelist in xfs_alloc_get_freelist(). + */ static xfs_failaddr_t xfs_agfl_verify( struct xfs_buf *bp) @@ -637,12 +656,6 @@ xfs_agfl_verify( __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); int i; - /* - * There is no verification of non-crc AGFLs because mkfs does not - * initialise the AGFL to zero or NULL. Hence the only valid part of the - * AGFL is what the AGF says is active. We can't get to the AGF, so we - * can't verify just those entries are valid. - */ if (!xfs_has_crc(mp)) return NULL; @@ -2321,12 +2334,16 @@ xfs_free_agfl_block( } /* - * Check the agfl fields of the agf for inconsistency or corruption. The purpose - * is to detect an agfl header padding mismatch between current and early v5 - * kernels. This problem manifests as a 1-slot size difference between the - * on-disk flcount and the active [first, last] range of a wrapped agfl. This - * may also catch variants of agfl count corruption unrelated to padding. Either - * way, we'll reset the agfl and warn the user. + * Check the agfl fields of the agf for inconsistency or corruption. + * + * The original purpose was to detect an agfl header padding mismatch between + * current and early v5 kernels. This problem manifests as a 1-slot size + * difference between the on-disk flcount and the active [first, last] range of + * a wrapped agfl. + * + * However, we need to use these same checks to catch agfl count corruptions + * unrelated to padding. This could occur on any v4 or v5 filesystem, so either + * way, we need to reset the agfl and warn the user. * * Return true if a reset is required before the agfl can be used, false * otherwise. @@ -2342,10 +2359,6 @@ xfs_agfl_needs_reset( int agfl_size = xfs_agfl_size(mp); int active; - /* no agfl header on v4 supers */ - if (!xfs_has_crc(mp)) - return false; - /* * The agf read verifier catches severe corruption of these fields. * Repeat some sanity checks to cover a packed -> unpacked mismatch if @@ -2418,7 +2431,7 @@ xfs_agfl_reset( * the real allocation can proceed. Deferring the free disconnects freeing up * the AGFL slot from freeing the block. */ -STATIC void +static int xfs_defer_agfl_block( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -2437,17 +2450,21 @@ xfs_defer_agfl_block( xefi->xefi_blockcount = 1; xefi->xefi_owner = oinfo->oi_owner; + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, xefi->xefi_startblock))) + return -EFSCORRUPTED; + trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + return 0; } /* * Add the extent to the list of extents to be free at transaction end. * The list is maintained sorted (by block number). */ -void +int __xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, @@ -2474,6 +2491,9 @@ __xfs_free_extent_later( #endif ASSERT(xfs_extfree_item_cache != NULL); + if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) + return -EFSCORRUPTED; + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); xefi->xefi_startblock = bno; @@ -2497,6 +2517,7 @@ __xfs_free_extent_later( xfs_extent_free_get_group(mp, xefi); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + return 0; } #ifdef DEBUG @@ -2657,7 +2678,9 @@ xfs_alloc_fix_freelist( goto out_agbp_relse; /* defer agfl frees */ - xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); + if (error) + goto out_agbp_relse; } targs.tp = tp; @@ -2767,6 +2790,9 @@ xfs_alloc_get_freelist( */ agfl_bno = xfs_buf_to_agfl_bno(agflbp); bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno))) + return -EFSCORRUPTED; + be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) @@ -2889,6 +2915,19 @@ xfs_alloc_put_freelist( return 0; } +/* + * Verify the AGF is consistent. + * + * We do not verify the AGFL indexes in the AGF are fully consistent here + * because of issues with variable on-disk structure sizes. Instead, we check + * the agfl indexes for consistency when we initialise the perag from the AGF + * information after a read completes. + * + * If the index is inconsistent, then we mark the perag as needing an AGFL + * reset. The first AGFL update performed then resets the AGFL indexes and + * refills the AGFL with known good free blocks, allowing the filesystem to + * continue operating normally at the cost of a few leaked free space blocks. + */ static xfs_failaddr_t xfs_agf_verify( struct xfs_buf *bp) @@ -2962,7 +3001,6 @@ xfs_agf_verify( return __this_address; return NULL; - } static void @@ -3187,7 +3225,8 @@ xfs_alloc_vextent_check_args( */ static int xfs_alloc_vextent_prepare_ag( - struct xfs_alloc_arg *args) + struct xfs_alloc_arg *args, + uint32_t flags) { bool need_pag = !args->pag; int error; @@ -3196,7 +3235,7 @@ xfs_alloc_vextent_prepare_ag( args->pag = xfs_perag_get(args->mp, args->agno); args->agbp = NULL; - error = xfs_alloc_fix_freelist(args, 0); + error = xfs_alloc_fix_freelist(args, flags); if (error) { trace_xfs_alloc_vextent_nofix(args); if (need_pag) @@ -3336,7 +3375,7 @@ xfs_alloc_vextent_this_ag( return error; } - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_size(args); @@ -3380,7 +3419,7 @@ restart: for_each_perag_wrap_range(mp, start_agno, restart_agno, mp->m_sb.sb_agcount, agno, args->pag) { args->agno = agno; - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, flags); if (error) break; if (!args->agbp) { @@ -3546,7 +3585,7 @@ xfs_alloc_vextent_exact_bno( return error; } - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_exact(args); @@ -3587,7 +3626,7 @@ xfs_alloc_vextent_near_bno( if (needs_perag) args->pag = xfs_perag_grab(mp, args->agno); - error = xfs_alloc_vextent_prepare_ag(args); + error = xfs_alloc_vextent_prepare_ag(args, 0); if (!error && args->agbp) error = xfs_alloc_ag_vextent_near(args); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 5dbb25546d0b..85ac470be0da 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -230,7 +230,7 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, +int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, bool skip_discard); @@ -254,14 +254,14 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ -static inline void +static inline int xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo) { - __xfs_free_extent_later(tp, bno, len, oinfo, false); + return __xfs_free_extent_later(tp, bno, len, oinfo, false); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index cd8870a16fd1..fef35696adb7 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -572,8 +572,12 @@ xfs_bmap_btree_to_extents( cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); - xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo); + error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo); + if (error) + return error; + ip->i_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); @@ -5230,10 +5234,12 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - __xfs_free_extent_later(tp, del->br_startblock, + error = __xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, (bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN); + if (error) + goto done; } } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1b40e5f8b1ec..36564ae3084f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -268,11 +268,14 @@ xfs_bmbt_free_block( struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); struct xfs_owner_info oinfo; + int error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); - xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo); - ip->i_nblocks--; + error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo); + if (error) + return error; + ip->i_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a16d5de16933..34600f94c2f4 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1834,7 +1834,7 @@ retry: * might be sparse and only free the regions that are allocated as part of the * chunk. */ -STATIC void +static int xfs_difree_inode_chunk( struct xfs_trans *tp, xfs_agnumber_t agno, @@ -1851,10 +1851,10 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), - M_IGEO(mp)->ialloc_blks, - &XFS_RMAP_OINFO_INODES); - return; + return xfs_free_extent_later(tp, + XFS_AGB_TO_FSB(mp, agno, sagbno), + M_IGEO(mp)->ialloc_blks, + &XFS_RMAP_OINFO_INODES); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1871,6 +1871,8 @@ xfs_difree_inode_chunk( XFS_INOBT_HOLEMASK_BITS); nextbit = startidx + 1; while (startidx < XFS_INOBT_HOLEMASK_BITS) { + int error; + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, nextbit); /* @@ -1896,8 +1898,11 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), - contigblk, &XFS_RMAP_OINFO_INODES); + error = xfs_free_extent_later(tp, + XFS_AGB_TO_FSB(mp, agno, agbno), + contigblk, &XFS_RMAP_OINFO_INODES); + if (error) + return error; /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -1905,6 +1910,7 @@ xfs_difree_inode_chunk( next: nextbit++; } + return 0; } STATIC int @@ -2003,7 +2009,9 @@ xfs_difree_inobt( goto error0; } - xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec); + if (error) + goto error0; } else { xic->deleted = false; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index f13e0809dc63..269573c82808 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 { #define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ #define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ - /* * The timestamps are dirty, but not necessarily anything else in the inode * core. Unlike the other fields above this one must never make it to disk @@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 { */ #define XFS_ILOG_TIMESTAMP 0x4000 +/* + * The version field has been changed, but not necessarily anything else of + * interest. This must never make it to disk - it is used purely to ensure that + * the inode item ->precommit operation can update the fsync flag triggers + * in the inode item correctly. + */ +#define XFS_ILOG_IVERSION 0x8000 + #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index c1c65774dcc2..b6e21433925c 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1151,8 +1151,10 @@ xfs_refcount_adjust_extents( fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, tmp.rc_startblock); - xfs_free_extent_later(cur->bc_tp, fsbno, + error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL); + if (error) + goto out_error; } (*agbno) += tmp.rc_blockcount; @@ -1210,8 +1212,10 @@ xfs_refcount_adjust_extents( fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, ext.rc_startblock); - xfs_free_extent_later(cur->bc_tp, fsbno, + error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL); + if (error) + goto out_error; } skip: @@ -1976,7 +1980,10 @@ xfs_refcount_recover_cow_leftovers( rr->rr_rrec.rc_blockcount); /* Free the block. */ - xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); + error = xfs_free_extent_later(tp, fsb, + rr->rr_rrec.rc_blockcount, NULL); + if (error) + goto out_trans; error = xfs_trans_commit(tp); if (error) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 8b5547073379..cb4796b6e693 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -40,9 +40,8 @@ xfs_trans_ijoin( iip->ili_lock_flags = lock_flags; ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); - /* - * Get a log_item_desc to point at the new item. - */ + /* Reset the per-tx dirty context and add the item to the tx. */ + iip->ili_dirty_flags = 0; xfs_trans_add_item(tp, &iip->ili_item); } @@ -76,17 +75,10 @@ xfs_trans_ichgtime( /* * This is called to mark the fields indicated in fieldmask as needing to be * logged when the transaction is committed. The inode must already be - * associated with the given transaction. - * - * The values for fieldmask are defined in xfs_inode_item.h. We always log all - * of the core inode if any of it has changed, and we always log all of the - * inline data/extents/b-tree root if any of them has changed. - * - * Grab and pin the cluster buffer associated with this inode to avoid RMW - * cycles at inode writeback time. Avoid the need to add error handling to every - * xfs_trans_log_inode() call by shutting down on read error. This will cause - * transactions to fail and everything to error out, just like if we return a - * read error in a dirty transaction and cancel it. + * associated with the given transaction. All we do here is record where the + * inode was dirtied and mark the transaction and inode log item dirty; + * everything else is done in the ->precommit log item operation after the + * changes in the transaction have been completed. */ void xfs_trans_log_inode( @@ -96,7 +88,6 @@ xfs_trans_log_inode( { struct xfs_inode_log_item *iip = ip->i_itemp; struct inode *inode = VFS_I(ip); - uint iversion_flags = 0; ASSERT(iip); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -105,18 +96,6 @@ xfs_trans_log_inode( tp->t_flags |= XFS_TRANS_DIRTY; /* - * Don't bother with i_lock for the I_DIRTY_TIME check here, as races - * don't matter - we either will need an extra transaction in 24 hours - * to log the timestamps, or will clear already cleared fields in the - * worst case. - */ - if (inode->i_state & I_DIRTY_TIME) { - spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; - spin_unlock(&inode->i_lock); - } - - /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the * inode locked exclusively for metadata modification, we can usually @@ -128,86 +107,10 @@ xfs_trans_log_inode( if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) - iversion_flags = XFS_ILOG_CORE; - } - - /* - * If we're updating the inode core or the timestamps and it's possible - * to upgrade this inode to bigtime format, do so now. - */ - if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && - xfs_has_bigtime(ip->i_mount) && - !xfs_inode_has_bigtime(ip)) { - ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; - flags |= XFS_ILOG_CORE; - } - - /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. - */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + flags |= XFS_ILOG_IVERSION; } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. - */ - spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= flags; - - if (!iip->ili_item.li_buf) { - struct xfs_buf *bp; - int error; - - /* - * We hold the ILOCK here, so this inode is not going to be - * flushed while we are here. Further, because there is no - * buffer attached to the item, we know that there is no IO in - * progress, so nothing will clear the ili_fields while we read - * in the buffer. Hence we can safely drop the spin lock and - * read the buffer knowing that the state will not change from - * here. - */ - spin_unlock(&iip->ili_lock); - error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); - if (error) { - xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); - return; - } - - /* - * We need an explicit buffer reference for the log item but - * don't want the buffer to remain attached to the transaction. - * Hold the buffer but release the transaction reference once - * we've attached the inode log item to the buffer log item - * list. - */ - xfs_buf_hold(bp); - spin_lock(&iip->ili_lock); - iip->ili_item.li_buf = bp; - bp->b_flags |= _XBF_INODES; - list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); - xfs_trans_brelse(tp, bp); - } - - /* - * Always OR in the bits from the ili_last_fields field. This is to - * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines - * in the eventual clearing of the ili_fields bits. See the big comment - * in xfs_iflush() for an explanation of this coordination mechanism. - */ - iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); - spin_unlock(&iip->ili_lock); + iip->ili_dirty_flags |= flags; } int diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 69bc89d0fc68..5bf4326e9783 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -769,14 +769,14 @@ xchk_are_bmaps_contiguous( * mapping or false if there are no more mappings. Caller must ensure that * @info.icur is zeroed before the first call. */ -static int +static bool xchk_bmap_iext_iter( struct xchk_bmap_info *info, struct xfs_bmbt_irec *irec) { struct xfs_bmbt_irec got; struct xfs_ifork *ifp; - xfs_filblks_t prev_len; + unsigned int nr = 0; ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork); @@ -790,12 +790,12 @@ xchk_bmap_iext_iter( irec->br_startoff); return false; } + nr++; /* * Iterate subsequent iextent records and merge them with the one * that we just read, if possible. */ - prev_len = irec->br_blockcount; while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) { if (!xchk_are_bmaps_contiguous(irec, &got)) break; @@ -805,20 +805,21 @@ xchk_bmap_iext_iter( got.br_startoff); return false; } - - /* - * Notify the user of mergeable records in the data or attr - * forks. CoW forks only exist in memory so we ignore them. - */ - if (info->whichfork != XFS_COW_FORK && - prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK) - xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + nr++; irec->br_blockcount += got.br_blockcount; - prev_len = got.br_blockcount; xfs_iext_next(ifp, &info->icur); } + /* + * If the merged mapping could be expressed with fewer bmbt records + * than we actually found, notify the user that this fork could be + * optimized. CoW forks only exist in memory so we ignore them. + */ + if (nr > 1 && info->whichfork != XFS_COW_FORK && + howmany_64(irec->br_blockcount, XFS_MAX_BMBT_EXTLEN) < nr) + xchk_ino_set_preen(info->sc, info->sc->ip->i_ino); + return true; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index b38e93830dde..e113f2f5c254 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -105,10 +105,10 @@ struct xfs_scrub { }; /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ -#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ -#define XCHK_FSGATES_DRAIN (1 << 2) /* defer ops draining enabled */ -#define XCHK_NEED_DRAIN (1 << 3) /* scrub needs to drain defer ops */ -#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ +#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */ +#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ +#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* * The XCHK_FSGATES* flags reflect functionality in the main filesystem that diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index df7322ed73fa..023d4e0385dd 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -452,10 +452,18 @@ xfs_buf_item_format( * This is called to pin the buffer associated with the buf log item in memory * so it cannot be written out. * - * We also always take a reference to the buffer log item here so that the bli - * is held while the item is pinned in memory. This means that we can - * unconditionally drop the reference count a transaction holds when the - * transaction is completed. + * We take a reference to the buffer log item here so that the BLI life cycle + * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and + * inserted into the AIL. + * + * We also need to take a reference to the buffer itself as the BLI unpin + * processing requires accessing the buffer after the BLI has dropped the final + * BLI reference. See xfs_buf_item_unpin() for an explanation. + * If unpins race to drop the final BLI reference and only the + * BLI owns a reference to the buffer, then the loser of the race can have the + * buffer fgreed from under it (e.g. on shutdown). Taking a buffer reference per + * pin count ensures the life cycle of the buffer extends for as + * long as we hold the buffer pin reference in xfs_buf_item_unpin(). */ STATIC void xfs_buf_item_pin( @@ -470,13 +478,30 @@ xfs_buf_item_pin( trace_xfs_buf_item_pin(bip); + xfs_buf_hold(bip->bli_buf); atomic_inc(&bip->bli_refcount); atomic_inc(&bip->bli_buf->b_pin_count); } /* - * This is called to unpin the buffer associated with the buf log item which - * was previously pinned with a call to xfs_buf_item_pin(). + * This is called to unpin the buffer associated with the buf log item which was + * previously pinned with a call to xfs_buf_item_pin(). We enter this function + * with a buffer pin count, a buffer reference and a BLI reference. + * + * We must drop the BLI reference before we unpin the buffer because the AIL + * doesn't acquire a BLI reference whenever it accesses it. Therefore if the + * refcount drops to zero, the bli could still be AIL resident and the buffer + * submitted for I/O at any point before we return. This can result in IO + * completion freeing the buffer while we are still trying to access it here. + * This race condition can also occur in shutdown situations where we abort and + * unpin buffers from contexts other that journal IO completion. + * + * Hence we have to hold a buffer reference per pin count to ensure that the + * buffer cannot be freed until we have finished processing the unpin operation. + * The reference is taken in xfs_buf_item_pin(), and we must hold it until we + * are done processing the buffer state. In the case of an abort (remove = + * true) then we re-use the current pin reference as the IO reference we hand + * off to IO failure handling. */ STATIC void xfs_buf_item_unpin( @@ -493,24 +518,18 @@ xfs_buf_item_unpin( trace_xfs_buf_item_unpin(bip); - /* - * Drop the bli ref associated with the pin and grab the hold required - * for the I/O simulation failure in the abort case. We have to do this - * before the pin count drops because the AIL doesn't acquire a bli - * reference. Therefore if the refcount drops to zero, the bli could - * still be AIL resident and the buffer submitted for I/O (and freed on - * completion) at any point before we return. This can be removed once - * the AIL properly holds a reference on the bli. - */ freed = atomic_dec_and_test(&bip->bli_refcount); - if (freed && !stale && remove) - xfs_buf_hold(bp); if (atomic_dec_and_test(&bp->b_pin_count)) wake_up_all(&bp->b_waiters); - /* nothing to do but drop the pin count if the bli is active */ - if (!freed) + /* + * Nothing to do but drop the buffer pin reference if the BLI is + * still active. + */ + if (!freed) { + xfs_buf_rele(bp); return; + } if (stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); @@ -523,6 +542,15 @@ xfs_buf_item_unpin( trace_xfs_buf_item_unpin_stale(bip); /* + * The buffer has been locked and referenced since it was marked + * stale so we own both lock and reference exclusively here. We + * do not need the pin reference any more, so drop it now so + * that we only have one reference to drop once item completion + * processing is complete. + */ + xfs_buf_rele(bp); + + /* * If we get called here because of an IO error, we may or may * not have the item on the AIL. xfs_trans_ail_delete() will * take care of that situation. xfs_trans_ail_delete() drops @@ -538,16 +566,30 @@ xfs_buf_item_unpin( ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); - } else if (remove) { + return; + } + + if (remove) { /* - * The buffer must be locked and held by the caller to simulate - * an async I/O failure. We acquired the hold for this case - * before the buffer was unpinned. + * We need to simulate an async IO failures here to ensure that + * the correct error completion is run on this buffer. This + * requires a reference to the buffer and for the buffer to be + * locked. We can safely pass ownership of the pin reference to + * the IO to ensure that nothing can free the buffer while we + * wait for the lock and then run the IO failure completion. */ xfs_buf_lock(bp); bp->b_flags |= XBF_ASYNC; xfs_buf_ioend_fail(bp); + return; } + + /* + * BLI has no more active references - it will be moved to the AIL to + * manage the remaining BLI/buffer life cycle. There is nothing left for + * us to do here so drop the pin reference to the buffer. + */ + xfs_buf_rele(bp); } STATIC uint diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 22c13933c8f8..2fc98d313708 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -78,7 +78,6 @@ restart: *longest = 0; err = xfs_bmap_longest_free_extent(pag, NULL, longest); if (err) { - xfs_perag_rele(pag); if (err != -EAGAIN) break; /* Couldn't lock the AGF, skip this AG. */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0f60e301eb1f..453890942d9f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -454,6 +454,27 @@ xfs_inodegc_queue_all( return ret; } +/* Wait for all queued work and collect errors */ +static int +xfs_inodegc_wait_all( + struct xfs_mount *mp) +{ + int cpu; + int error = 0; + + flush_workqueue(mp->m_inodegc_wq); + for_each_online_cpu(cpu) { + struct xfs_inodegc *gc; + + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (gc->error && !error) + error = gc->error; + gc->error = 0; + } + + return error; +} + /* * Check the validity of the inode we just found it the cache */ @@ -1491,15 +1512,14 @@ xfs_blockgc_free_space( if (error) return error; - xfs_inodegc_flush(mp); - return 0; + return xfs_inodegc_flush(mp); } /* * Reclaim all the free space that we can by scheduling the background blockgc * and inodegc workers immediately and waiting for them all to clear. */ -void +int xfs_blockgc_flush_all( struct xfs_mount *mp) { @@ -1520,7 +1540,7 @@ xfs_blockgc_flush_all( for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) flush_delayed_work(&pag->pag_blockgc_work); - xfs_inodegc_flush(mp); + return xfs_inodegc_flush(mp); } /* @@ -1842,13 +1862,17 @@ xfs_inodegc_set_reclaimable( * This is the last chance to make changes to an otherwise unreferenced file * before incore reclamation happens. */ -static void +static int xfs_inodegc_inactivate( struct xfs_inode *ip) { + int error; + trace_xfs_inode_inactivating(ip); - xfs_inactive(ip); + error = xfs_inactive(ip); xfs_inodegc_set_reclaimable(ip); + return error; + } void @@ -1880,8 +1904,12 @@ xfs_inodegc_worker( WRITE_ONCE(gc->shrinker_hits, 0); llist_for_each_entry_safe(ip, n, node, i_gclist) { + int error; + xfs_iflags_set(ip, XFS_INACTIVATING); - xfs_inodegc_inactivate(ip); + error = xfs_inodegc_inactivate(ip); + if (error && !gc->error) + gc->error = error; } memalloc_nofs_restore(nofs_flag); @@ -1905,13 +1933,13 @@ xfs_inodegc_push( * Force all currently queued inode inactivation work to run immediately and * wait for the work to finish. */ -void +int xfs_inodegc_flush( struct xfs_mount *mp) { xfs_inodegc_push(mp); trace_xfs_inodegc_flush(mp, __return_address); - flush_workqueue(mp->m_inodegc_wq); + return xfs_inodegc_wait_all(mp); } /* diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 87910191a9dd..1dcdcb23796e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -62,7 +62,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, unsigned int iwalk_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm); -void xfs_blockgc_flush_all(struct xfs_mount *mp); +int xfs_blockgc_flush_all(struct xfs_mount *mp); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); @@ -80,7 +80,7 @@ void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inodegc_worker(struct work_struct *work); void xfs_inodegc_push(struct xfs_mount *mp); -void xfs_inodegc_flush(struct xfs_mount *mp); +int xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5808abab786c..9e62cc500140 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1620,16 +1620,7 @@ xfs_inactive_ifree( */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); - /* - * Just ignore errors at this point. There is nothing we can do except - * to try to keep going. Make sure it's not a silent error. - */ - error = xfs_trans_commit(tp); - if (error) - xfs_notice(mp, "%s: xfs_trans_commit returned error %d", - __func__, error); - - return 0; + return xfs_trans_commit(tp); } /* @@ -1693,12 +1684,12 @@ xfs_inode_needs_inactive( * now be truncated. Also, we clear all of the read-ahead state * kept for the inode here since the file is now closed. */ -void +int xfs_inactive( xfs_inode_t *ip) { struct xfs_mount *mp; - int error; + int error = 0; int truncate = 0; /* @@ -1736,7 +1727,7 @@ xfs_inactive( * reference to the inode at this point anyways. */ if (xfs_can_free_eofblocks(ip, true)) - xfs_free_eofblocks(ip); + error = xfs_free_eofblocks(ip); goto out; } @@ -1773,7 +1764,7 @@ xfs_inactive( /* * Free the inode. */ - xfs_inactive_ifree(ip); + error = xfs_inactive_ifree(ip); out: /* @@ -1781,6 +1772,7 @@ out: * the attached dquots. */ xfs_qm_dqdetach(ip); + return error; } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 69d21e42c10a..7547caf2f2ab 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -470,7 +470,7 @@ enum layout_break_reason { (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID)) int xfs_release(struct xfs_inode *ip); -void xfs_inactive(struct xfs_inode *ip); +int xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); int xfs_create(struct mnt_idmap *idmap, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index ca2941ab6cbc..91c847a84e10 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -29,6 +29,153 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_inode_log_item, ili_item); } +static uint64_t +xfs_inode_item_sort( + struct xfs_log_item *lip) +{ + return INODE_ITEM(lip)->ili_inode->i_ino; +} + +/* + * Prior to finally logging the inode, we have to ensure that all the + * per-modification inode state changes are applied. This includes VFS inode + * state updates, format conversions, verifier state synchronisation and + * ensuring the inode buffer remains in memory whilst the inode is dirty. + * + * We have to be careful when we grab the inode cluster buffer due to lock + * ordering constraints. The unlinked inode modifications (xfs_iunlink_item) + * require AGI -> inode cluster buffer lock order. The inode cluster buffer is + * not locked until ->precommit, so it happens after everything else has been + * modified. + * + * Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we + * have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we + * cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because + * it can be called on a inode (e.g. via bumplink/droplink) before we take the + * AGF lock modifying directory blocks. + * + * Rather than force a complete rework of all the transactions to call + * xfs_trans_log_inode() once and once only at the end of every transaction, we + * move the pinning of the inode cluster buffer to a ->precommit operation. This + * matches how the xfs_iunlink_item locks the inode cluster buffer, and it + * ensures that the inode cluster buffer locking is always done last in a + * transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode + * cluster buffer. + * + * If we return the inode number as the precommit sort key then we'll also + * guarantee that the order all inode cluster buffer locking is the same all the + * inodes and unlink items in the transaction. + */ +static int +xfs_inode_item_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct inode *inode = VFS_I(ip); + unsigned int flags = iip->ili_dirty_flags; + + /* + * Don't bother with i_lock for the I_DIRTY_TIME check here, as races + * don't matter - we either will need an extra transaction in 24 hours + * to log the timestamps, or will clear already cleared fields in the + * worst case. + */ + if (inode->i_state & I_DIRTY_TIME) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + } + + /* + * If we're updating the inode core or the timestamps and it's possible + * to upgrade this inode to bigtime format, do so now. + */ + if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && + xfs_has_bigtime(ip->i_mount) && + !xfs_inode_has_bigtime(ip)) { + ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; + flags |= XFS_ILOG_CORE; + } + + /* + * Inode verifiers do not check that the extent size hint is an integer + * multiple of the rt extent size on a directory with both rtinherit + * and extszinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the hint. + */ + if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + + /* + * Record the specific change for fdatasync optimisation. This allows + * fdatasync to skip log forces for inodes that are only timestamp + * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it + * to XFS_ILOG_CORE so that the actual on-disk dirty tracking + * (ili_fields) correctly tracks that the version has changed. + */ + spin_lock(&iip->ili_lock); + iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + + if (!iip->ili_item.li_buf) { + struct xfs_buf *bp; + int error; + + /* + * We hold the ILOCK here, so this inode is not going to be + * flushed while we are here. Further, because there is no + * buffer attached to the item, we know that there is no IO in + * progress, so nothing will clear the ili_fields while we read + * in the buffer. Hence we can safely drop the spin lock and + * read the buffer knowing that the state will not change from + * here. + */ + spin_unlock(&iip->ili_lock); + error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); + if (error) + return error; + + /* + * We need an explicit buffer reference for the log item but + * don't want the buffer to remain attached to the transaction. + * Hold the buffer but release the transaction reference once + * we've attached the inode log item to the buffer log item + * list. + */ + xfs_buf_hold(bp); + spin_lock(&iip->ili_lock); + iip->ili_item.li_buf = bp; + bp->b_flags |= _XBF_INODES; + list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); + xfs_trans_brelse(tp, bp); + } + + /* + * Always OR in the bits from the ili_last_fields field. This is to + * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines + * in the eventual clearing of the ili_fields bits. See the big comment + * in xfs_iflush() for an explanation of this coordination mechanism. + */ + iip->ili_fields |= (flags | iip->ili_last_fields); + spin_unlock(&iip->ili_lock); + + /* + * We are done with the log item transaction dirty state, so clear it so + * that it doesn't pollute future transactions. + */ + iip->ili_dirty_flags = 0; + return 0; +} + /* * The logged size of an inode fork is always the current size of the inode * fork. This means that when an inode fork is relogged, the size of the logged @@ -662,6 +809,8 @@ xfs_inode_item_committing( } static const struct xfs_item_ops xfs_inode_item_ops = { + .iop_sort = xfs_inode_item_sort, + .iop_precommit = xfs_inode_item_precommit, .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index bbd836a44ff0..377e06007804 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -17,6 +17,7 @@ struct xfs_inode_log_item { struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ unsigned short ili_lock_flags; /* inode lock flags */ + unsigned int ili_dirty_flags; /* dirty in current tx */ /* * The ili_lock protects the interactions between the dirty state and * the flush state of the inode log item. This allows us to do atomic diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 322eb2ee6c55..82c81d20459d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2711,7 +2711,9 @@ xlog_recover_iunlink_bucket( * just to flush the inodegc queue and wait for it to * complete. */ - xfs_inodegc_flush(mp); + error = xfs_inodegc_flush(mp); + if (error) + break; } prev_agino = agino; @@ -2719,10 +2721,15 @@ xlog_recover_iunlink_bucket( } if (prev_ip) { + int error2; + ip->i_prev_unlinked = prev_agino; xfs_irele(prev_ip); + + error2 = xfs_inodegc_flush(mp); + if (error2 && !error) + return error2; } - xfs_inodegc_flush(mp); return error; } @@ -2789,7 +2796,6 @@ xlog_recover_iunlink_ag( * bucket and remaining inodes on it unreferenced and * unfreeable. */ - xfs_inodegc_flush(pag->pag_mount); xlog_recover_clear_agi_bucket(pag, bucket); } } @@ -2806,13 +2812,6 @@ xlog_recover_process_iunlinks( for_each_perag(log->l_mp, agno, pag) xlog_recover_iunlink_ag(pag); - - /* - * Flush the pending unlinked inodes to ensure that the inactivations - * are fully completed on disk and the incore inodes can be reclaimed - * before we signal that recovery is complete. - */ - xfs_inodegc_flush(log->l_mp); } STATIC void diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index aaaf5ec13492..6c09f89534d3 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -62,6 +62,7 @@ struct xfs_error_cfg { struct xfs_inodegc { struct llist_head list; struct delayed_work work; + int error; /* approximate count of inodes in the list */ unsigned int items; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f5dc46ce9803..abcc559f3c64 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -616,8 +616,10 @@ xfs_reflink_cancel_cow_blocks( xfs_refcount_free_cow_extent(*tpp, del.br_startblock, del.br_blockcount); - xfs_free_extent_later(*tpp, del.br_startblock, + error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL); + if (error) + break; /* Roll the transaction */ error = xfs_defer_finish(tpp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 7e706255f165..4120bd1cba90 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1100,6 +1100,7 @@ xfs_inodegc_init_percpu( #endif init_llist_head(&gc->list); gc->items = 0; + gc->error = 0; INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); } return 0; diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8afc0c080861..8c0bfc9a33b1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -290,7 +290,9 @@ retry: * Do not perform a synchronous scan because callers can hold * other locks. */ - xfs_blockgc_flush_all(mp); + error = xfs_blockgc_flush_all(mp); + if (error) + return error; want_retry = false; goto retry; } @@ -970,6 +972,11 @@ __xfs_trans_commit( error = xfs_defer_finish_noroll(&tp); if (error) goto out_unreserve; + + /* Run precommits from final tx in defer chain. */ + error = xfs_trans_run_precommits(tp); + if (error) + goto out_unreserve; } /* |